Diffstat (limited to 'drivers/gpu/drm/xe')
330 files changed, 59579 insertions, 0 deletions
diff --git a/drivers/gpu/drm/xe/.gitignore b/drivers/gpu/drm/xe/.gitignore new file mode 100644 index 000000000000..8778bf132674 --- /dev/null +++ b/drivers/gpu/drm/xe/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +*.hdrtest +/generated +/xe_gen_wa_oob diff --git a/drivers/gpu/drm/xe/.kunitconfig b/drivers/gpu/drm/xe/.kunitconfig new file mode 100644 index 000000000000..9590eac91af3 --- /dev/null +++ b/drivers/gpu/drm/xe/.kunitconfig @@ -0,0 +1,13 @@ +# xe dependencies +CONFIG_KUNIT=y +CONFIG_PCI=y +CONFIG_PCI_IOV=y +CONFIG_DEBUG_FS=y +CONFIG_DRM=y +CONFIG_DRM_FBDEV_EMULATION=y +CONFIG_DRM_KMS_HELPER=y +CONFIG_DRM_XE=y +CONFIG_DRM_XE_DISPLAY=n +CONFIG_EXPERT=y +CONFIG_FB=y +CONFIG_DRM_XE_KUNIT_TEST=y diff --git a/drivers/gpu/drm/xe/Kconfig b/drivers/gpu/drm/xe/Kconfig new file mode 100644 index 000000000000..1cced50d8d8c --- /dev/null +++ b/drivers/gpu/drm/xe/Kconfig @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: GPL-2.0-only +config DRM_XE + tristate "Intel Xe Graphics" + depends on DRM && PCI && MMU && (m || (y && KUNIT=y)) && 64BIT + select INTERVAL_TREE + # we need shmfs for the swappable backing store, and in particular + # the shmem_readpage() which depends upon tmpfs + select SHMEM + select TMPFS + select DRM_BUDDY + select DRM_EXEC + select DRM_KMS_HELPER + select DRM_PANEL + select DRM_SUBALLOC_HELPER + select DRM_DISPLAY_DP_HELPER + select DRM_DISPLAY_HDCP_HELPER + select DRM_DISPLAY_HDMI_HELPER + select DRM_DISPLAY_HELPER + select DRM_MIPI_DSI + select RELAY + select IRQ_WORK + # xe depends on ACPI_VIDEO when ACPI is enabled + # but for select to work, need to select ACPI_VIDEO's dependencies, ick + select BACKLIGHT_CLASS_DEVICE if ACPI + select INPUT if ACPI + select ACPI_VIDEO if X86 && ACPI + select ACPI_BUTTON if ACPI + select ACPI_WMI if X86 && ACPI + select SYNC_FILE + select IOSF_MBI + select CRC32 + select SND_HDA_I915 if SND_HDA_CORE + select CEC_CORE if CEC_NOTIFIER + select VMAP_PFN + select DRM_TTM + select DRM_TTM_HELPER + select DRM_EXEC + select DRM_GPUVM + select DRM_SCHED + select MMU_NOTIFIER + select WANT_DEV_COREDUMP + select AUXILIARY_BUS + help + Experimental driver for Intel Xe series GPUs + + If "M" is selected, the module will be called xe. + +config DRM_XE_DISPLAY + bool "Enable display support" + depends on DRM_XE && EXPERT && DRM_XE=m + select FB_IOMEM_HELPERS + select I2C + select I2C_ALGOBIT + default y + help + Disable this option only if you want to compile out display support. + +config DRM_XE_FORCE_PROBE + string "Force probe xe for selected Intel hardware IDs" + depends on DRM_XE + help + This is the default value for the xe.force_probe module + parameter. Using the module parameter overrides this option. + + Force probe the xe for Intel graphics devices that are + recognized but not properly supported by this kernel version. It is + recommended to upgrade to a kernel version with proper support as soon + as it is available. + + It can also be used to block the probe of recognized and fully + supported devices. + + Use "" to disable force probe. If in doubt, use this. + + Use "<pci-id>[,<pci-id>,...]" to force probe the xe for listed + devices. For example, "4500" or "4500,4571". + + Use "*" to force probe the driver for all known devices. + + Use "!" right before the ID to block the probe of the device. For + example, "4500,!4571" forces the probe of 4500 and blocks the probe of + 4571. + + Use "!*" to block the probe of the driver for all known devices. 
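For illustration only, a minimal sketch of how a string Kconfig default such as DRM_XE_FORCE_PROBE can be wired to a module parameter like xe.force_probe; the variable name, buffer size and 0400 permission below are assumptions for the sketch, not the xe driver's actual implementation:

/* Sketch only: tying the Kconfig string default to a module parameter. */
#include <linux/module.h>

static char xe_force_probe[80] = CONFIG_DRM_XE_FORCE_PROBE; /* assumed name/size */
module_param_string(force_probe, xe_force_probe, sizeof(xe_force_probe), 0400);
MODULE_PARM_DESC(force_probe,
		 "Force probe options for Xe, e.g. \"4500,!4571\", \"*\" or \"\"");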
+ +menu "drm/Xe Debugging" +depends on DRM_XE +depends on EXPERT +source "drivers/gpu/drm/xe/Kconfig.debug" +endmenu + +menu "drm/xe Profile Guided Optimisation" + visible if EXPERT + depends on DRM_XE + source "drivers/gpu/drm/xe/Kconfig.profile" +endmenu diff --git a/drivers/gpu/drm/xe/Kconfig.debug b/drivers/gpu/drm/xe/Kconfig.debug new file mode 100644 index 000000000000..549065f57a78 --- /dev/null +++ b/drivers/gpu/drm/xe/Kconfig.debug @@ -0,0 +1,107 @@ +# SPDX-License-Identifier: GPL-2.0-only +config DRM_XE_WERROR + bool "Force GCC to throw an error instead of a warning when compiling" + # As this may inadvertently break the build, only allow the user + # to shoot oneself in the foot iff they aim really hard + depends on EXPERT + # We use the dependency on !COMPILE_TEST to not be enabled in + # allmodconfig or allyesconfig configurations + depends on !COMPILE_TEST + default n + help + Add -Werror to the build flags for (and only for) xe.ko. + Do not enable this unless you are writing code for the xe.ko module. + + Recommended for driver developers only. + + If in doubt, say "N". + +config DRM_XE_DEBUG + bool "Enable additional driver debugging" + depends on DRM_XE + depends on EXPERT + depends on !COMPILE_TEST + default n + help + Choose this option to turn on extra driver debugging that may affect + performance but will catch some internal issues. + + Recommended for driver developers only. + + If in doubt, say "N". + +config DRM_XE_DEBUG_VM + bool "Enable extra VM debugging info" + default n + help + Enable extra VM debugging info + + Recommended for driver developers only. + + If in doubt, say "N". + +config DRM_XE_DEBUG_SRIOV + bool "Enable extra SR-IOV debugging" + default n + help + Enable extra SR-IOV debugging info. + + Recommended for driver developers only. + + If in doubt, say "N". + +config DRM_XE_DEBUG_MEM + bool "Enable passing SYS/VRAM addresses to user space" + default n + help + Pass object location trough uapi. Intended for extended + testing and development only. + + Recommended for driver developers only. + + If in doubt, say "N". + +config DRM_XE_SIMPLE_ERROR_CAPTURE + bool "Enable simple error capture to dmesg on job timeout" + default n + help + Choose this option when debugging an unexpected job timeout + + Recommended for driver developers only. + + If in doubt, say "N". + +config DRM_XE_KUNIT_TEST + tristate "KUnit tests for the drm xe driver" if !KUNIT_ALL_TESTS + depends on DRM_XE && KUNIT && DEBUG_FS + default KUNIT_ALL_TESTS + select DRM_EXPORT_FOR_TESTS if m + select DRM_KUNIT_TEST_HELPERS + help + Choose this option to allow the driver to perform selftests under + the kunit framework + + Recommended for driver developers only. + + If in doubt, say "N". + +config DRM_XE_LARGE_GUC_BUFFER + bool "Enable larger guc log buffer" + default n + help + Choose this option when debugging guc issues. + Buffer should be large enough for complex issues. + + Recommended for driver developers only. + + If in doubt, say "N". + +config DRM_XE_USERPTR_INVAL_INJECT + bool "Inject userptr invalidation -EINVAL errors" + default n + help + Choose this option when debugging error paths that + are hit during checks for userptr invalidations. + + Recomended for driver developers only. + If in doubt, say "N". 
diff --git a/drivers/gpu/drm/xe/Kconfig.profile b/drivers/gpu/drm/xe/Kconfig.profile new file mode 100644 index 000000000000..ba17a25e8db3 --- /dev/null +++ b/drivers/gpu/drm/xe/Kconfig.profile @@ -0,0 +1,54 @@ +config DRM_XE_JOB_TIMEOUT_MAX + int "Default max job timeout (ms)" + default 10000 # milliseconds + help + Configures the default max job timeout after which job will + be forcefully taken away from scheduler. +config DRM_XE_JOB_TIMEOUT_MIN + int "Default min job timeout (ms)" + default 1 # milliseconds + help + Configures the default min job timeout after which job will + be forcefully taken away from scheduler. +config DRM_XE_TIMESLICE_MAX + int "Default max timeslice duration (us)" + default 10000000 # microseconds + help + Configures the default max timeslice duration between multiple + contexts by guc scheduling. +config DRM_XE_TIMESLICE_MIN + int "Default min timeslice duration (us)" + default 1 # microseconds + help + Configures the default min timeslice duration between multiple + contexts by guc scheduling. +config DRM_XE_PREEMPT_TIMEOUT + int "Preempt timeout (us, jiffy granularity)" + default 640000 # microseconds + help + How long to wait (in microseconds) for a preemption event to occur + when submitting a new context. If the current context does not hit + an arbitration point and yield to HW before the timer expires, the + HW will be reset to allow the more important context to execute. +config DRM_XE_PREEMPT_TIMEOUT_MAX + int "Default max preempt timeout (us)" + default 10000000 # microseconds + help + Configures the default max preempt timeout after which context + will be forcefully taken away and higher priority context will + run. +config DRM_XE_PREEMPT_TIMEOUT_MIN + int "Default min preempt timeout (us)" + default 1 # microseconds + help + Configures the default min preempt timeout after which context + will be forcefully taken away and higher priority context will + run. +config DRM_XE_ENABLE_SCHEDTIMEOUT_LIMIT + bool "Default configuration of limitation on scheduler timeout" + default y + help + Configures the enablement of limitation on scheduler timeout + to apply to applicable user. For elevated user, all above MIN + and MAX values will apply when this configuration is enable to + apply limitation. By default limitation is applied. diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile new file mode 100644 index 000000000000..53bd2a8ba1ae --- /dev/null +++ b/drivers/gpu/drm/xe/Makefile @@ -0,0 +1,305 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the drm device driver. This driver provides support for the +# Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher. 
+ +# Unconditionally enable W=1 warnings locally +# --- begin copy-paste W=1 warnings from scripts/Makefile.extrawarn +subdir-ccflags-y += -Wextra -Wunused -Wno-unused-parameter +subdir-ccflags-y += -Wmissing-declarations +subdir-ccflags-y += $(call cc-option, -Wrestrict) +subdir-ccflags-y += -Wmissing-format-attribute +subdir-ccflags-y += -Wmissing-prototypes +subdir-ccflags-y += -Wold-style-definition +subdir-ccflags-y += -Wmissing-include-dirs +subdir-ccflags-y += $(call cc-option, -Wunused-but-set-variable) +subdir-ccflags-y += $(call cc-option, -Wunused-const-variable) +subdir-ccflags-y += $(call cc-option, -Wpacked-not-aligned) +subdir-ccflags-y += $(call cc-option, -Wformat-overflow) +subdir-ccflags-y += $(call cc-option, -Wformat-truncation) +subdir-ccflags-y += $(call cc-option, -Wstringop-overflow) +subdir-ccflags-y += $(call cc-option, -Wstringop-truncation) +# The following turn off the warnings enabled by -Wextra +ifeq ($(findstring 2, $(KBUILD_EXTRA_WARN)),) +subdir-ccflags-y += -Wno-missing-field-initializers +subdir-ccflags-y += -Wno-type-limits +subdir-ccflags-y += -Wno-shift-negative-value +endif +ifeq ($(findstring 3, $(KBUILD_EXTRA_WARN)),) +subdir-ccflags-y += -Wno-sign-compare +endif +# --- end copy-paste + +# Enable -Werror in CI and development +subdir-ccflags-$(CONFIG_DRM_XE_WERROR) += -Werror + +subdir-ccflags-y += -I$(obj) -I$(srctree)/$(src) + +# generated sources +hostprogs := xe_gen_wa_oob + +generated_oob := $(obj)/generated/xe_wa_oob.c $(obj)/generated/xe_wa_oob.h + +quiet_cmd_wa_oob = GEN $(notdir $(generated_oob)) + cmd_wa_oob = mkdir -p $(@D); $^ $(generated_oob) + +$(generated_oob) &: $(obj)/xe_gen_wa_oob $(srctree)/$(src)/xe_wa_oob.rules + $(call cmd,wa_oob) + +uses_generated_oob := \ + $(obj)/xe_gsc.o \ + $(obj)/xe_guc.o \ + $(obj)/xe_migrate.o \ + $(obj)/xe_ring_ops.o \ + $(obj)/xe_vm.o \ + $(obj)/xe_wa.o \ + $(obj)/xe_ttm_stolen_mgr.o + +$(uses_generated_oob): $(generated_oob) + +# Please keep these build lists sorted! 
+ +# core driver code + +xe-y += xe_bb.o \ + xe_bo.o \ + xe_bo_evict.o \ + xe_debugfs.o \ + xe_devcoredump.o \ + xe_device.o \ + xe_device_sysfs.o \ + xe_dma_buf.o \ + xe_drm_client.o \ + xe_exec.o \ + xe_execlist.o \ + xe_exec_queue.o \ + xe_force_wake.o \ + xe_ggtt.o \ + xe_gpu_scheduler.o \ + xe_gsc.o \ + xe_gsc_submit.o \ + xe_gt.o \ + xe_gt_ccs_mode.o \ + xe_gt_clock.o \ + xe_gt_debugfs.o \ + xe_gt_freq.o \ + xe_gt_idle.o \ + xe_gt_mcr.o \ + xe_gt_pagefault.o \ + xe_gt_sysfs.o \ + xe_gt_throttle_sysfs.o \ + xe_gt_tlb_invalidation.o \ + xe_gt_topology.o \ + xe_guc.o \ + xe_guc_ads.o \ + xe_guc_ct.o \ + xe_guc_debugfs.o \ + xe_guc_hwconfig.o \ + xe_guc_log.o \ + xe_guc_pc.o \ + xe_guc_submit.o \ + xe_heci_gsc.o \ + xe_hw_engine.o \ + xe_hw_engine_class_sysfs.o \ + xe_hw_fence.o \ + xe_huc.o \ + xe_huc_debugfs.o \ + xe_irq.o \ + xe_lrc.o \ + xe_migrate.o \ + xe_mmio.o \ + xe_mocs.o \ + xe_module.o \ + xe_pat.o \ + xe_pci.o \ + xe_pcode.o \ + xe_pm.o \ + xe_preempt_fence.o \ + xe_pt.o \ + xe_pt_walk.o \ + xe_query.o \ + xe_range_fence.o \ + xe_reg_sr.o \ + xe_reg_whitelist.o \ + xe_rtp.o \ + xe_ring_ops.o \ + xe_sa.o \ + xe_sched_job.o \ + xe_step.o \ + xe_sync.o \ + xe_tile.o \ + xe_tile_sysfs.o \ + xe_trace.o \ + xe_ttm_sys_mgr.o \ + xe_ttm_stolen_mgr.o \ + xe_ttm_vram_mgr.o \ + xe_tuning.o \ + xe_uc.o \ + xe_uc_debugfs.o \ + xe_uc_fw.o \ + xe_vm.o \ + xe_wait_user_fence.o \ + xe_wa.o \ + xe_wopcm.o + +# graphics hardware monitoring (HWMON) support +xe-$(CONFIG_HWMON) += xe_hwmon.o + +# graphics virtualization (SR-IOV) support +xe-y += xe_sriov.o + +xe-$(CONFIG_PCI_IOV) += \ + xe_lmtt.o \ + xe_lmtt_2l.o \ + xe_lmtt_ml.o + +# i915 Display compat #defines and #includes +subdir-ccflags-$(CONFIG_DRM_XE_DISPLAY) += \ + -I$(srctree)/$(src)/display/ext \ + -I$(srctree)/$(src)/compat-i915-headers \ + -I$(srctree)/drivers/gpu/drm/xe/display/ \ + -I$(srctree)/drivers/gpu/drm/i915/display/ \ + -Ddrm_i915_gem_object=xe_bo \ + -Ddrm_i915_private=xe_device + +CFLAGS_i915-display/intel_fbdev.o = $(call cc-disable-warning, override-init) +CFLAGS_i915-display/intel_display_device.o = $(call cc-disable-warning, override-init) + +# Rule to build SOC code shared with i915 +$(obj)/i915-soc/%.o: $(srctree)/drivers/gpu/drm/i915/soc/%.c FORCE + $(call cmd,force_checksrc) + $(call if_changed_rule,cc_o_c) + +# Rule to build display code shared with i915 +$(obj)/i915-display/%.o: $(srctree)/drivers/gpu/drm/i915/display/%.c FORCE + $(call cmd,force_checksrc) + $(call if_changed_rule,cc_o_c) + +# Display code specific to xe +xe-$(CONFIG_DRM_XE_DISPLAY) += \ + xe_display.o \ + display/xe_fb_pin.o \ + display/xe_hdcp_gsc.o \ + display/xe_plane_initial.o \ + display/xe_display_rps.o \ + display/xe_display_misc.o \ + display/xe_dsb_buffer.o \ + display/intel_fbdev_fb.o \ + display/intel_fb_bo.o \ + display/ext/i915_irq.o \ + display/ext/i915_utils.o + +# SOC code shared with i915 +xe-$(CONFIG_DRM_XE_DISPLAY) += \ + i915-soc/intel_dram.o \ + i915-soc/intel_pch.o + +# Display code shared with i915 +xe-$(CONFIG_DRM_XE_DISPLAY) += \ + i915-display/icl_dsi.o \ + i915-display/intel_atomic.o \ + i915-display/intel_atomic_plane.o \ + i915-display/intel_audio.o \ + i915-display/intel_backlight.o \ + i915-display/intel_bios.o \ + i915-display/intel_bw.o \ + i915-display/intel_cdclk.o \ + i915-display/intel_color.o \ + i915-display/intel_combo_phy.o \ + i915-display/intel_connector.o \ + i915-display/intel_crtc.o \ + i915-display/intel_crtc_state_dump.o \ + i915-display/intel_cursor.o \ + i915-display/intel_cx0_phy.o \ + 
i915-display/intel_ddi.o \ + i915-display/intel_ddi_buf_trans.o \ + i915-display/intel_display.o \ + i915-display/intel_display_debugfs.o \ + i915-display/intel_display_debugfs_params.o \ + i915-display/intel_display_device.o \ + i915-display/intel_display_driver.o \ + i915-display/intel_display_irq.o \ + i915-display/intel_display_params.o \ + i915-display/intel_display_power.o \ + i915-display/intel_display_power_map.o \ + i915-display/intel_display_power_well.o \ + i915-display/intel_display_trace.o \ + i915-display/intel_display_wa.o \ + i915-display/intel_dkl_phy.o \ + i915-display/intel_dmc.o \ + i915-display/intel_dp.o \ + i915-display/intel_dp_aux.o \ + i915-display/intel_dp_aux_backlight.o \ + i915-display/intel_dp_hdcp.o \ + i915-display/intel_dp_link_training.o \ + i915-display/intel_dp_mst.o \ + i915-display/intel_dpll.o \ + i915-display/intel_dpll_mgr.o \ + i915-display/intel_dpt_common.o \ + i915-display/intel_drrs.o \ + i915-display/intel_dsb.o \ + i915-display/intel_dsi.o \ + i915-display/intel_dsi_dcs_backlight.o \ + i915-display/intel_dsi_vbt.o \ + i915-display/intel_fb.o \ + i915-display/intel_fbc.o \ + i915-display/intel_fdi.o \ + i915-display/intel_fifo_underrun.o \ + i915-display/intel_frontbuffer.o \ + i915-display/intel_global_state.o \ + i915-display/intel_gmbus.o \ + i915-display/intel_hdcp.o \ + i915-display/intel_hdmi.o \ + i915-display/intel_hotplug.o \ + i915-display/intel_hotplug_irq.o \ + i915-display/intel_hti.o \ + i915-display/intel_link_bw.o \ + i915-display/intel_lspcon.o \ + i915-display/intel_modeset_lock.o \ + i915-display/intel_modeset_setup.o \ + i915-display/intel_modeset_verify.o \ + i915-display/intel_panel.o \ + i915-display/intel_pipe_crc.o \ + i915-display/intel_pmdemand.o \ + i915-display/intel_pps.o \ + i915-display/intel_psr.o \ + i915-display/intel_qp_tables.o \ + i915-display/intel_quirks.o \ + i915-display/intel_snps_phy.o \ + i915-display/intel_tc.o \ + i915-display/intel_vblank.o \ + i915-display/intel_vdsc.o \ + i915-display/intel_vga.o \ + i915-display/intel_vrr.o \ + i915-display/intel_wm.o \ + i915-display/skl_scaler.o \ + i915-display/skl_universal_plane.o \ + i915-display/skl_watermark.o + +ifeq ($(CONFIG_ACPI),y) + xe-$(CONFIG_DRM_XE_DISPLAY) += \ + i915-display/intel_acpi.o \ + i915-display/intel_opregion.o +endif + +ifeq ($(CONFIG_DRM_FBDEV_EMULATION),y) + xe-$(CONFIG_DRM_XE_DISPLAY) += i915-display/intel_fbdev.o +endif + +obj-$(CONFIG_DRM_XE) += xe.o +obj-$(CONFIG_DRM_XE_KUNIT_TEST) += tests/ + +# header test +hdrtest_find_args := -not -path xe_rtp_helpers.h +ifneq ($(CONFIG_DRM_XE_DISPLAY),y) + hdrtest_find_args += -not -path display/\* -not -path compat-i915-headers/\* -not -path xe_display.h +endif + +always-$(CONFIG_DRM_XE_WERROR) += \ + $(patsubst %.h,%.hdrtest, $(shell cd $(srctree)/$(src) && find * -name '*.h' $(hdrtest_find_args))) + +quiet_cmd_hdrtest = HDRTEST $(patsubst %.hdrtest,%.h,$@) + cmd_hdrtest = $(CC) -DHDRTEST $(filter-out $(CFLAGS_GCOV), $(c_flags)) -S -o /dev/null -x c /dev/null -include $<; touch $@ + +$(obj)/%.hdrtest: $(src)/%.h FORCE + $(call if_changed_dep,hdrtest) diff --git a/drivers/gpu/drm/xe/abi/gsc_command_header_abi.h b/drivers/gpu/drm/xe/abi/gsc_command_header_abi.h new file mode 100644 index 000000000000..a4c2646803b5 --- /dev/null +++ b/drivers/gpu/drm/xe/abi/gsc_command_header_abi.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _ABI_GSC_COMMAND_HEADER_ABI_H +#define _ABI_GSC_COMMAND_HEADER_ABI_H + +#include <linux/types.h> 
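The struct defined just below packs the GSC HECI (MTL) command header fields. As a non-authoritative sketch of how a sender might populate it before submitting a message — the helper name is invented for illustration, and treating message_size as header plus payload is an assumption rather than something stated by this header:

/* Sketch only: filling the MTL GSC header defined below (assumed helper). */
static void example_fill_gsc_mtl_header(struct intel_gsc_mtl_header *hdr,
					u8 heci_client_id,
					u64 host_session_handle,
					u32 payload_size)
{
	memset(hdr, 0, sizeof(*hdr));	/* memset() from <linux/string.h> */
	hdr->validity_marker = GSC_HECI_VALIDITY_MARKER;
	hdr->heci_client_id = heci_client_id;
	hdr->header_version = MTL_GSC_HEADER_VERSION;
	hdr->host_session_handle = host_session_handle;
	/* Assumed to cover header + payload; only the low 20 bits are valid. */
	hdr->message_size = sizeof(*hdr) + payload_size;
}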
+ +struct intel_gsc_mtl_header { + u32 validity_marker; +#define GSC_HECI_VALIDITY_MARKER 0xA578875A + + u8 heci_client_id; + + u8 reserved1; + + u16 header_version; +#define MTL_GSC_HEADER_VERSION 1 + + /* FW allows host to decide host_session handle as it sees fit. */ + u64 host_session_handle; + + /* handle generated by FW for messages that need to be re-submitted */ + u64 gsc_message_handle; + + u32 message_size; /* lower 20 bits only, upper 12 are reserved */ + + /* + * Flags mask: + * Bit 0: Pending + * Bit 1: Session Cleanup; + * Bits 2-15: Flags + * Bits 16-31: Extension Size + * According to internal spec flags are either input or output + * we distinguish the flags using OUTFLAG or INFLAG + */ + u32 flags; +#define GSC_OUTFLAG_MSG_PENDING BIT(0) +#define GSC_INFLAG_MSG_CLEANUP BIT(1) + + u32 status; +} __packed; + +#endif diff --git a/drivers/gpu/drm/xe/abi/gsc_mkhi_commands_abi.h b/drivers/gpu/drm/xe/abi/gsc_mkhi_commands_abi.h new file mode 100644 index 000000000000..ad4d041873ab --- /dev/null +++ b/drivers/gpu/drm/xe/abi/gsc_mkhi_commands_abi.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _ABI_GSC_MKHI_COMMANDS_ABI_H +#define _ABI_GSC_MKHI_COMMANDS_ABI_H + +#include <linux/types.h> + +/* Heci client ID for MKHI commands */ +#define HECI_MEADDRESS_MKHI 7 + +/* Generic MKHI header */ +struct gsc_mkhi_header { + u8 group_id; + u8 command; + u8 reserved; + u8 result; +} __packed; + +/* GFX_SRV commands */ +#define MKHI_GROUP_ID_GFX_SRV 0x30 + +#define MKHI_GFX_SRV_GET_HOST_COMPATIBILITY_VERSION (0x42) + +struct gsc_get_compatibility_version_in { + struct gsc_mkhi_header header; +} __packed; + +struct gsc_get_compatibility_version_out { + struct gsc_mkhi_header header; + u16 proj_major; + u16 compat_major; + u16 compat_minor; + u16 reserved[5]; +} __packed; + +#endif diff --git a/drivers/gpu/drm/xe/abi/gsc_pxp_commands_abi.h b/drivers/gpu/drm/xe/abi/gsc_pxp_commands_abi.h new file mode 100644 index 000000000000..57520809e48d --- /dev/null +++ b/drivers/gpu/drm/xe/abi/gsc_pxp_commands_abi.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _ABI_GSC_PXP_COMMANDS_ABI_H +#define _ABI_GSC_PXP_COMMANDS_ABI_H + +#include <linux/types.h> + +/* Heci client ID for PXP commands */ +#define HECI_MEADDRESS_PXP 17 + +#define PXP_APIVER(x, y) (((x) & 0xFFFF) << 16 | ((y) & 0xFFFF)) + +/* + * there are a lot of status codes for PXP, but we only define the cross-API + * common ones that we actually can handle in the kernel driver. Other failure + * codes should be printed to error msg for debug. 
+ */ +enum pxp_status { + PXP_STATUS_SUCCESS = 0x0, + PXP_STATUS_ERROR_API_VERSION = 0x1002, + PXP_STATUS_NOT_READY = 0x100e, + PXP_STATUS_PLATFCONFIG_KF1_NOVERIF = 0x101a, + PXP_STATUS_PLATFCONFIG_KF1_BAD = 0x101f, + PXP_STATUS_OP_NOT_PERMITTED = 0x4013 +}; + +/* Common PXP FW message header */ +struct pxp_cmd_header { + u32 api_version; + u32 command_id; + union { + u32 status; /* out */ + u32 stream_id; /* in */ +#define PXP_CMDHDR_EXTDATA_SESSION_VALID GENMASK(0, 0) +#define PXP_CMDHDR_EXTDATA_APP_TYPE GENMASK(1, 1) +#define PXP_CMDHDR_EXTDATA_SESSION_ID GENMASK(17, 2) + }; + /* Length of the message (excluding the header) */ + u32 buffer_len; +} __packed; + +#define PXP43_CMDID_NEW_HUC_AUTH 0x0000003F /* MTL+ */ + +/* PXP-Input-Packet: HUC Auth-only */ +struct pxp43_new_huc_auth_in { + struct pxp_cmd_header header; + u64 huc_base_address; + u32 huc_size; +} __packed; + +/* PXP-Output-Packet: HUC Load and Authentication or Auth-only */ +struct pxp43_huc_auth_out { + struct pxp_cmd_header header; +} __packed; + +#endif diff --git a/drivers/gpu/drm/xe/abi/guc_actions_abi.h b/drivers/gpu/drm/xe/abi/guc_actions_abi.h new file mode 100644 index 000000000000..3062e0e0d467 --- /dev/null +++ b/drivers/gpu/drm/xe/abi/guc_actions_abi.h @@ -0,0 +1,219 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2014-2021 Intel Corporation + */ + +#ifndef _ABI_GUC_ACTIONS_ABI_H +#define _ABI_GUC_ACTIONS_ABI_H + +/** + * DOC: HOST2GUC_SELF_CFG + * + * This message is used by Host KMD to setup of the `GuC Self Config KLVs`_. + * + * This message must be sent as `MMIO HXG Message`_. + * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | 0 | 31 | ORIGIN = GUC_HXG_ORIGIN_HOST_ | + * | +-------+--------------------------------------------------------------+ + * | | 30:28 | TYPE = GUC_HXG_TYPE_REQUEST_ | + * | +-------+--------------------------------------------------------------+ + * | | 27:16 | DATA0 = MBZ | + * | +-------+--------------------------------------------------------------+ + * | | 15:0 | ACTION = _`GUC_ACTION_HOST2GUC_SELF_CFG` = 0x0508 | + * +---+-------+--------------------------------------------------------------+ + * | 1 | 31:16 | **KLV_KEY** - KLV key, see `GuC Self Config KLVs`_ | + * | +-------+--------------------------------------------------------------+ + * | | 15:0 | **KLV_LEN** - KLV length | + * | | | | + * | | | - 32 bit KLV = 1 | + * | | | - 64 bit KLV = 2 | + * +---+-------+--------------------------------------------------------------+ + * | 2 | 31:0 | **VALUE32** - Bits 31-0 of the KLV value | + * +---+-------+--------------------------------------------------------------+ + * | 3 | 31:0 | **VALUE64** - Bits 63-32 of the KLV value (**KLV_LEN** = 2) | + * +---+-------+--------------------------------------------------------------+ + * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | 0 | 31 | ORIGIN = GUC_HXG_ORIGIN_GUC_ | + * | +-------+--------------------------------------------------------------+ + * | | 30:28 | TYPE = GUC_HXG_TYPE_RESPONSE_SUCCESS_ | + * | +-------+--------------------------------------------------------------+ + * | | 27:0 | DATA0 = **NUM** - 1 if KLV was parsed, 0 if not recognized | + * 
+---+-------+--------------------------------------------------------------+ + */ +#define GUC_ACTION_HOST2GUC_SELF_CFG 0x0508 + +#define HOST2GUC_SELF_CFG_REQUEST_MSG_LEN (GUC_HXG_REQUEST_MSG_MIN_LEN + 3u) +#define HOST2GUC_SELF_CFG_REQUEST_MSG_0_MBZ GUC_HXG_REQUEST_MSG_0_DATA0 +#define HOST2GUC_SELF_CFG_REQUEST_MSG_1_KLV_KEY (0xffff << 16) +#define HOST2GUC_SELF_CFG_REQUEST_MSG_1_KLV_LEN (0xffff << 0) +#define HOST2GUC_SELF_CFG_REQUEST_MSG_2_VALUE32 GUC_HXG_REQUEST_MSG_n_DATAn +#define HOST2GUC_SELF_CFG_REQUEST_MSG_3_VALUE64 GUC_HXG_REQUEST_MSG_n_DATAn + +#define HOST2GUC_SELF_CFG_RESPONSE_MSG_LEN GUC_HXG_RESPONSE_MSG_MIN_LEN +#define HOST2GUC_SELF_CFG_RESPONSE_MSG_0_NUM GUC_HXG_RESPONSE_MSG_0_DATA0 + +/** + * DOC: HOST2GUC_CONTROL_CTB + * + * This H2G action allows Vf Host to enable or disable H2G and G2H `CT Buffer`_. + * + * This message must be sent as `MMIO HXG Message`_. + * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | 0 | 31 | ORIGIN = GUC_HXG_ORIGIN_HOST_ | + * | +-------+--------------------------------------------------------------+ + * | | 30:28 | TYPE = GUC_HXG_TYPE_REQUEST_ | + * | +-------+--------------------------------------------------------------+ + * | | 27:16 | DATA0 = MBZ | + * | +-------+--------------------------------------------------------------+ + * | | 15:0 | ACTION = _`GUC_ACTION_HOST2GUC_CONTROL_CTB` = 0x4509 | + * +---+-------+--------------------------------------------------------------+ + * | 1 | 31:0 | **CONTROL** - control `CTB based communication`_ | + * | | | | + * | | | - _`GUC_CTB_CONTROL_DISABLE` = 0 | + * | | | - _`GUC_CTB_CONTROL_ENABLE` = 1 | + * +---+-------+--------------------------------------------------------------+ + * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | 0 | 31 | ORIGIN = GUC_HXG_ORIGIN_GUC_ | + * | +-------+--------------------------------------------------------------+ + * | | 30:28 | TYPE = GUC_HXG_TYPE_RESPONSE_SUCCESS_ | + * | +-------+--------------------------------------------------------------+ + * | | 27:0 | DATA0 = MBZ | + * +---+-------+--------------------------------------------------------------+ + */ +#define GUC_ACTION_HOST2GUC_CONTROL_CTB 0x4509 + +#define HOST2GUC_CONTROL_CTB_REQUEST_MSG_LEN (GUC_HXG_REQUEST_MSG_MIN_LEN + 1u) +#define HOST2GUC_CONTROL_CTB_REQUEST_MSG_0_MBZ GUC_HXG_REQUEST_MSG_0_DATA0 +#define HOST2GUC_CONTROL_CTB_REQUEST_MSG_1_CONTROL GUC_HXG_REQUEST_MSG_n_DATAn +#define GUC_CTB_CONTROL_DISABLE 0u +#define GUC_CTB_CONTROL_ENABLE 1u + +#define HOST2GUC_CONTROL_CTB_RESPONSE_MSG_LEN GUC_HXG_RESPONSE_MSG_MIN_LEN +#define HOST2GUC_CONTROL_CTB_RESPONSE_MSG_0_MBZ GUC_HXG_RESPONSE_MSG_0_DATA0 + +/* legacy definitions */ + +enum xe_guc_action { + XE_GUC_ACTION_DEFAULT = 0x0, + XE_GUC_ACTION_REQUEST_PREEMPTION = 0x2, + XE_GUC_ACTION_REQUEST_ENGINE_RESET = 0x3, + XE_GUC_ACTION_ALLOCATE_DOORBELL = 0x10, + XE_GUC_ACTION_DEALLOCATE_DOORBELL = 0x20, + XE_GUC_ACTION_LOG_BUFFER_FILE_FLUSH_COMPLETE = 0x30, + XE_GUC_ACTION_UK_LOG_ENABLE_LOGGING = 0x40, + XE_GUC_ACTION_FORCE_LOG_BUFFER_FLUSH = 0x302, + XE_GUC_ACTION_ENTER_S_STATE = 0x501, + XE_GUC_ACTION_EXIT_S_STATE = 0x502, + XE_GUC_ACTION_GLOBAL_SCHED_POLICY_CHANGE = 0x506, + XE_GUC_ACTION_SCHED_CONTEXT = 0x1000, + XE_GUC_ACTION_SCHED_CONTEXT_MODE_SET = 0x1001, 
+ XE_GUC_ACTION_SCHED_CONTEXT_MODE_DONE = 0x1002, + XE_GUC_ACTION_SCHED_ENGINE_MODE_SET = 0x1003, + XE_GUC_ACTION_SCHED_ENGINE_MODE_DONE = 0x1004, + XE_GUC_ACTION_SET_CONTEXT_PRIORITY = 0x1005, + XE_GUC_ACTION_SET_CONTEXT_EXECUTION_QUANTUM = 0x1006, + XE_GUC_ACTION_SET_CONTEXT_PREEMPTION_TIMEOUT = 0x1007, + XE_GUC_ACTION_CONTEXT_RESET_NOTIFICATION = 0x1008, + XE_GUC_ACTION_ENGINE_FAILURE_NOTIFICATION = 0x1009, + XE_GUC_ACTION_HOST2GUC_UPDATE_CONTEXT_POLICIES = 0x100B, + XE_GUC_ACTION_SETUP_PC_GUCRC = 0x3004, + XE_GUC_ACTION_AUTHENTICATE_HUC = 0x4000, + XE_GUC_ACTION_GET_HWCONFIG = 0x4100, + XE_GUC_ACTION_REGISTER_CONTEXT = 0x4502, + XE_GUC_ACTION_DEREGISTER_CONTEXT = 0x4503, + XE_GUC_ACTION_REGISTER_COMMAND_TRANSPORT_BUFFER = 0x4505, + XE_GUC_ACTION_DEREGISTER_COMMAND_TRANSPORT_BUFFER = 0x4506, + XE_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600, + XE_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC = 0x4601, + XE_GUC_ACTION_CLIENT_SOFT_RESET = 0x5507, + XE_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A, + XE_GUC_ACTION_NOTIFY_MEMORY_CAT_ERROR = 0x6000, + XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC = 0x6002, + XE_GUC_ACTION_PAGE_FAULT_RES_DESC = 0x6003, + XE_GUC_ACTION_ACCESS_COUNTER_NOTIFY = 0x6004, + XE_GUC_ACTION_TLB_INVALIDATION = 0x7000, + XE_GUC_ACTION_TLB_INVALIDATION_DONE = 0x7001, + XE_GUC_ACTION_TLB_INVALIDATION_ALL = 0x7002, + XE_GUC_ACTION_STATE_CAPTURE_NOTIFICATION = 0x8002, + XE_GUC_ACTION_NOTIFY_FLUSH_LOG_BUFFER_TO_FILE = 0x8003, + XE_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED = 0x8004, + XE_GUC_ACTION_NOTIFY_EXCEPTION = 0x8005, + XE_GUC_ACTION_LIMIT +}; + +enum xe_guc_rc_options { + XE_GUCRC_HOST_CONTROL, + XE_GUCRC_FIRMWARE_CONTROL, +}; + +enum xe_guc_preempt_options { + XE_GUC_PREEMPT_OPTION_DROP_WORK_Q = 0x4, + XE_GUC_PREEMPT_OPTION_DROP_SUBMIT_Q = 0x8, +}; + +enum xe_guc_report_status { + XE_GUC_REPORT_STATUS_UNKNOWN = 0x0, + XE_GUC_REPORT_STATUS_ACKED = 0x1, + XE_GUC_REPORT_STATUS_ERROR = 0x2, + XE_GUC_REPORT_STATUS_COMPLETE = 0x4, +}; + +enum xe_guc_sleep_state_status { + XE_GUC_SLEEP_STATE_SUCCESS = 0x1, + XE_GUC_SLEEP_STATE_PREEMPT_TO_IDLE_FAILED = 0x2, + XE_GUC_SLEEP_STATE_ENGINE_RESET_FAILED = 0x3 +#define XE_GUC_SLEEP_STATE_INVALID_MASK 0x80000000 +}; + +#define GUC_LOG_CONTROL_LOGGING_ENABLED (1 << 0) +#define GUC_LOG_CONTROL_VERBOSITY_SHIFT 4 +#define GUC_LOG_CONTROL_VERBOSITY_MASK (0xF << GUC_LOG_CONTROL_VERBOSITY_SHIFT) +#define GUC_LOG_CONTROL_DEFAULT_LOGGING (1 << 8) + +#define XE_GUC_TLB_INVAL_TYPE_SHIFT 0 +#define XE_GUC_TLB_INVAL_MODE_SHIFT 8 +/* Flush PPC or SMRO caches along with TLB invalidation request */ +#define XE_GUC_TLB_INVAL_FLUSH_CACHE (1 << 31) + +enum xe_guc_tlb_invalidation_type { + XE_GUC_TLB_INVAL_FULL = 0x0, + XE_GUC_TLB_INVAL_PAGE_SELECTIVE = 0x1, + XE_GUC_TLB_INVAL_PAGE_SELECTIVE_CTX = 0x2, + XE_GUC_TLB_INVAL_GUC = 0x3, +}; + +/* + * 0: Heavy mode of Invalidation: + * The pipeline of the engine(s) for which the invalidation is targeted to is + * blocked, and all the in-flight transactions are guaranteed to be Globally + * Observed before completing the TLB invalidation + * 1: Lite mode of Invalidation: + * TLBs of the targeted engine(s) are immediately invalidated. + * In-flight transactions are NOT guaranteed to be Globally Observed before + * completing TLB invalidation. + * Light Invalidation Mode is to be used only when + * it can be guaranteed (by SW) that the address translations remain invariant + * for the in-flight transactions across the TLB invalidation. 
In other words, + * this mode can be used when the TLB invalidation is intended to clear out the + * stale cached translations that are no longer in use. Light Invalidation Mode + * is much faster than the Heavy Invalidation Mode, as it does not wait for the + * in-flight transactions to be GOd. + */ +enum xe_guc_tlb_inval_mode { + XE_GUC_TLB_INVAL_MODE_HEAVY = 0x0, + XE_GUC_TLB_INVAL_MODE_LITE = 0x1, +}; + +#endif diff --git a/drivers/gpu/drm/xe/abi/guc_actions_slpc_abi.h b/drivers/gpu/drm/xe/abi/guc_actions_slpc_abi.h new file mode 100644 index 000000000000..811add10c30d --- /dev/null +++ b/drivers/gpu/drm/xe/abi/guc_actions_slpc_abi.h @@ -0,0 +1,249 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef _GUC_ACTIONS_SLPC_ABI_H_ +#define _GUC_ACTIONS_SLPC_ABI_H_ + +#include <linux/types.h> + +/** + * DOC: SLPC SHARED DATA STRUCTURE + * + * +----+------+--------------------------------------------------------------+ + * | CL | Bytes| Description | + * +====+======+==============================================================+ + * | 1 | 0-3 | SHARED DATA SIZE | + * | +------+--------------------------------------------------------------+ + * | | 4-7 | GLOBAL STATE | + * | +------+--------------------------------------------------------------+ + * | | 8-11 | DISPLAY DATA ADDRESS | + * | +------+--------------------------------------------------------------+ + * | | 12:63| PADDING | + * +----+------+--------------------------------------------------------------+ + * | | 0:63 | PADDING(PLATFORM INFO) | + * +----+------+--------------------------------------------------------------+ + * | 3 | 0-3 | TASK STATE DATA | + * + +------+--------------------------------------------------------------+ + * | | 4:63 | PADDING | + * +----+------+--------------------------------------------------------------+ + * |4-21|0:1087| OVERRIDE PARAMS AND BIT FIELDS | + * +----+------+--------------------------------------------------------------+ + * | | | PADDING + EXTRA RESERVED PAGE | + * +----+------+--------------------------------------------------------------+ + */ + +/* + * SLPC exposes certain parameters for global configuration by the host. + * These are referred to as override parameters, because in most cases + * the host will not need to modify the default values used by SLPC. + * SLPC remembers the default values which allows the host to easily restore + * them by simply unsetting the override. The host can set or unset override + * parameters during SLPC (re-)initialization using the SLPC Reset event. 
+ * The host can also set or unset override parameters on the fly using the + * Parameter Set and Parameter Unset events + */ + +#define SLPC_MAX_OVERRIDE_PARAMETERS 256 +#define SLPC_OVERRIDE_BITFIELD_SIZE \ + (SLPC_MAX_OVERRIDE_PARAMETERS / 32) + +#define SLPC_PAGE_SIZE_BYTES 4096 +#define SLPC_CACHELINE_SIZE_BYTES 64 +#define SLPC_SHARED_DATA_SIZE_BYTE_HEADER SLPC_CACHELINE_SIZE_BYTES +#define SLPC_SHARED_DATA_SIZE_BYTE_PLATFORM_INFO SLPC_CACHELINE_SIZE_BYTES +#define SLPC_SHARED_DATA_SIZE_BYTE_TASK_STATE SLPC_CACHELINE_SIZE_BYTES +#define SLPC_SHARED_DATA_MODE_DEFN_TABLE_SIZE SLPC_PAGE_SIZE_BYTES +#define SLPC_SHARED_DATA_SIZE_BYTE_MAX (2 * SLPC_PAGE_SIZE_BYTES) + +/* + * Cacheline size aligned (Total size needed for + * SLPM_KMD_MAX_OVERRIDE_PARAMETERS=256 is 1088 bytes) + */ +#define SLPC_OVERRIDE_PARAMS_TOTAL_BYTES (((((SLPC_MAX_OVERRIDE_PARAMETERS * 4) \ + + ((SLPC_MAX_OVERRIDE_PARAMETERS / 32) * 4)) \ + + (SLPC_CACHELINE_SIZE_BYTES - 1)) / SLPC_CACHELINE_SIZE_BYTES) * \ + SLPC_CACHELINE_SIZE_BYTES) + +#define SLPC_SHARED_DATA_SIZE_BYTE_OTHER (SLPC_SHARED_DATA_SIZE_BYTE_MAX - \ + (SLPC_SHARED_DATA_SIZE_BYTE_HEADER \ + + SLPC_SHARED_DATA_SIZE_BYTE_PLATFORM_INFO \ + + SLPC_SHARED_DATA_SIZE_BYTE_TASK_STATE \ + + SLPC_OVERRIDE_PARAMS_TOTAL_BYTES \ + + SLPC_SHARED_DATA_MODE_DEFN_TABLE_SIZE)) + +enum slpc_task_enable { + SLPC_PARAM_TASK_DEFAULT = 0, + SLPC_PARAM_TASK_ENABLED, + SLPC_PARAM_TASK_DISABLED, + SLPC_PARAM_TASK_UNKNOWN +}; + +enum slpc_global_state { + SLPC_GLOBAL_STATE_NOT_RUNNING = 0, + SLPC_GLOBAL_STATE_INITIALIZING = 1, + SLPC_GLOBAL_STATE_RESETTING = 2, + SLPC_GLOBAL_STATE_RUNNING = 3, + SLPC_GLOBAL_STATE_SHUTTING_DOWN = 4, + SLPC_GLOBAL_STATE_ERROR = 5 +}; + +enum slpc_param_id { + SLPC_PARAM_TASK_ENABLE_GTPERF = 0, + SLPC_PARAM_TASK_DISABLE_GTPERF = 1, + SLPC_PARAM_TASK_ENABLE_BALANCER = 2, + SLPC_PARAM_TASK_DISABLE_BALANCER = 3, + SLPC_PARAM_TASK_ENABLE_DCC = 4, + SLPC_PARAM_TASK_DISABLE_DCC = 5, + SLPC_PARAM_GLOBAL_MIN_GT_UNSLICE_FREQ_MHZ = 6, + SLPC_PARAM_GLOBAL_MAX_GT_UNSLICE_FREQ_MHZ = 7, + SLPC_PARAM_GLOBAL_MIN_GT_SLICE_FREQ_MHZ = 8, + SLPC_PARAM_GLOBAL_MAX_GT_SLICE_FREQ_MHZ = 9, + SLPC_PARAM_GTPERF_THRESHOLD_MAX_FPS = 10, + SLPC_PARAM_GLOBAL_DISABLE_GT_FREQ_MANAGEMENT = 11, + SLPC_PARAM_GTPERF_ENABLE_FRAMERATE_STALLING = 12, + SLPC_PARAM_GLOBAL_DISABLE_RC6_MODE_CHANGE = 13, + SLPC_PARAM_GLOBAL_OC_UNSLICE_FREQ_MHZ = 14, + SLPC_PARAM_GLOBAL_OC_SLICE_FREQ_MHZ = 15, + SLPC_PARAM_GLOBAL_ENABLE_IA_GT_BALANCING = 16, + SLPC_PARAM_GLOBAL_ENABLE_ADAPTIVE_BURST_TURBO = 17, + SLPC_PARAM_GLOBAL_ENABLE_EVAL_MODE = 18, + SLPC_PARAM_GLOBAL_ENABLE_BALANCER_IN_NON_GAMING_MODE = 19, + SLPC_PARAM_GLOBAL_RT_MODE_TURBO_FREQ_DELTA_MHZ = 20, + SLPC_PARAM_PWRGATE_RC_MODE = 21, + SLPC_PARAM_EDR_MODE_COMPUTE_TIMEOUT_MS = 22, + SLPC_PARAM_EDR_QOS_FREQ_MHZ = 23, + SLPC_PARAM_MEDIA_FF_RATIO_MODE = 24, + SLPC_PARAM_ENABLE_IA_FREQ_LIMITING = 25, + SLPC_PARAM_STRATEGIES = 26, + SLPC_PARAM_POWER_PROFILE = 27, + SLPC_PARAM_IGNORE_EFFICIENT_FREQUENCY = 28, + SLPC_MAX_PARAM = 32, +}; + +enum slpc_media_ratio_mode { + SLPC_MEDIA_RATIO_MODE_DYNAMIC_CONTROL = 0, + SLPC_MEDIA_RATIO_MODE_FIXED_ONE_TO_ONE = 1, + SLPC_MEDIA_RATIO_MODE_FIXED_ONE_TO_TWO = 2, +}; + +enum slpc_gucrc_mode { + SLPC_GUCRC_MODE_HW = 0, + SLPC_GUCRC_MODE_GUCRC_NO_RC6 = 1, + SLPC_GUCRC_MODE_GUCRC_STATIC_TIMEOUT = 2, + SLPC_GUCRC_MODE_GUCRC_DYNAMIC_HYSTERESIS = 3, + + SLPC_GUCRC_MODE_MAX, +}; + +enum slpc_event_id { + SLPC_EVENT_RESET = 0, + SLPC_EVENT_SHUTDOWN = 1, + SLPC_EVENT_PLATFORM_INFO_CHANGE = 2, + 
SLPC_EVENT_DISPLAY_MODE_CHANGE = 3, + SLPC_EVENT_FLIP_COMPLETE = 4, + SLPC_EVENT_QUERY_TASK_STATE = 5, + SLPC_EVENT_PARAMETER_SET = 6, + SLPC_EVENT_PARAMETER_UNSET = 7, +}; + +struct slpc_task_state_data { + union { + u32 task_status_padding; + struct { + u32 status; +#define SLPC_GTPERF_TASK_ENABLED REG_BIT(0) +#define SLPC_DCC_TASK_ENABLED REG_BIT(11) +#define SLPC_IN_DCC REG_BIT(12) +#define SLPC_BALANCER_ENABLED REG_BIT(15) +#define SLPC_IBC_TASK_ENABLED REG_BIT(16) +#define SLPC_BALANCER_IA_LMT_ENABLED REG_BIT(17) +#define SLPC_BALANCER_IA_LMT_ACTIVE REG_BIT(18) + }; + }; + union { + u32 freq_padding; + struct { +#define SLPC_MAX_UNSLICE_FREQ_MASK REG_GENMASK(7, 0) +#define SLPC_MIN_UNSLICE_FREQ_MASK REG_GENMASK(15, 8) +#define SLPC_MAX_SLICE_FREQ_MASK REG_GENMASK(23, 16) +#define SLPC_MIN_SLICE_FREQ_MASK REG_GENMASK(31, 24) + u32 freq; + }; + }; +} __packed; + +struct slpc_shared_data_header { + /* Total size in bytes of this shared buffer. */ + u32 size; + u32 global_state; + u32 display_data_addr; +} __packed; + +struct slpc_override_params { + u32 bits[SLPC_OVERRIDE_BITFIELD_SIZE]; + u32 values[SLPC_MAX_OVERRIDE_PARAMETERS]; +} __packed; + +struct slpc_shared_data { + struct slpc_shared_data_header header; + u8 shared_data_header_pad[SLPC_SHARED_DATA_SIZE_BYTE_HEADER - + sizeof(struct slpc_shared_data_header)]; + + u8 platform_info_pad[SLPC_SHARED_DATA_SIZE_BYTE_PLATFORM_INFO]; + + struct slpc_task_state_data task_state_data; + u8 task_state_data_pad[SLPC_SHARED_DATA_SIZE_BYTE_TASK_STATE - + sizeof(struct slpc_task_state_data)]; + + struct slpc_override_params override_params; + u8 override_params_pad[SLPC_OVERRIDE_PARAMS_TOTAL_BYTES - + sizeof(struct slpc_override_params)]; + + u8 shared_data_pad[SLPC_SHARED_DATA_SIZE_BYTE_OTHER]; + + /* PAGE 2 (4096 bytes), mode based parameter will be removed soon */ + u8 reserved_mode_definition[4096]; +} __packed; + +/** + * DOC: SLPC H2G MESSAGE FORMAT + * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | 0 | 31 | ORIGIN = GUC_HXG_ORIGIN_HOST_ | + * | +-------+--------------------------------------------------------------+ + * | | 30:28 | TYPE = GUC_HXG_TYPE_REQUEST_ | + * | +-------+--------------------------------------------------------------+ + * | | 27:16 | DATA0 = MBZ | + * | +-------+--------------------------------------------------------------+ + * | | 15:0 | ACTION = _`GUC_ACTION_HOST2GUC_PC_SLPM_REQUEST` = 0x3003 | + * +---+-------+--------------------------------------------------------------+ + * | 1 | 31:8 | **EVENT_ID** | + * + +-------+--------------------------------------------------------------+ + * | | 7:0 | **EVENT_ARGC** - number of data arguments | + * +---+-------+--------------------------------------------------------------+ + * | 2 | 31:0 | **EVENT_DATA1** | + * +---+-------+--------------------------------------------------------------+ + * |...| 31:0 | ... 
| + * +---+-------+--------------------------------------------------------------+ + * |2+n| 31:0 | **EVENT_DATAn** | + * +---+-------+--------------------------------------------------------------+ + */ + +#define GUC_ACTION_HOST2GUC_PC_SLPC_REQUEST 0x3003 + +#define HOST2GUC_PC_SLPC_REQUEST_MSG_MIN_LEN \ + (GUC_HXG_REQUEST_MSG_MIN_LEN + 1u) +#define HOST2GUC_PC_SLPC_EVENT_MAX_INPUT_ARGS 9 +#define HOST2GUC_PC_SLPC_REQUEST_MSG_MAX_LEN \ + (HOST2GUC_PC_SLPC_REQUEST_REQUEST_MSG_MIN_LEN + \ + HOST2GUC_PC_SLPC_EVENT_MAX_INPUT_ARGS) +#define HOST2GUC_PC_SLPC_REQUEST_MSG_0_MBZ GUC_HXG_REQUEST_MSG_0_DATA0 +#define HOST2GUC_PC_SLPC_REQUEST_MSG_1_EVENT_ID (0xff << 8) +#define HOST2GUC_PC_SLPC_REQUEST_MSG_1_EVENT_ARGC (0xff << 0) +#define HOST2GUC_PC_SLPC_REQUEST_MSG_N_EVENT_DATA_N GUC_HXG_REQUEST_MSG_n_DATAn + +#endif diff --git a/drivers/gpu/drm/xe/abi/guc_communication_ctb_abi.h b/drivers/gpu/drm/xe/abi/guc_communication_ctb_abi.h new file mode 100644 index 000000000000..3b83f907ece4 --- /dev/null +++ b/drivers/gpu/drm/xe/abi/guc_communication_ctb_abi.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2014-2021 Intel Corporation + */ + +#ifndef _ABI_GUC_COMMUNICATION_CTB_ABI_H +#define _ABI_GUC_COMMUNICATION_CTB_ABI_H + +#include <linux/types.h> +#include <linux/build_bug.h> + +#include "guc_messages_abi.h" + +/** + * DOC: CT Buffer + * + * Circular buffer used to send `CTB Message`_ + */ + +/** + * DOC: CTB Descriptor + * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | 0 | 31:0 | **HEAD** - offset (in dwords) to the last dword that was | + * | | | read from the `CT Buffer`_. | + * | | | It can only be updated by the receiver. | + * +---+-------+--------------------------------------------------------------+ + * | 1 | 31:0 | **TAIL** - offset (in dwords) to the last dword that was | + * | | | written to the `CT Buffer`_. | + * | | | It can only be updated by the sender. 
| + * +---+-------+--------------------------------------------------------------+ + * | 2 | 31:0 | **STATUS** - status of the CTB | + * | | | | + * | | | - _`GUC_CTB_STATUS_NO_ERROR` = 0 (normal operation) | + * | | | - _`GUC_CTB_STATUS_OVERFLOW` = 1 (head/tail too large) | + * | | | - _`GUC_CTB_STATUS_UNDERFLOW` = 2 (truncated message) | + * | | | - _`GUC_CTB_STATUS_MISMATCH` = 4 (head/tail modified) | + * +---+-------+--------------------------------------------------------------+ + * |...| | RESERVED = MBZ | + * +---+-------+--------------------------------------------------------------+ + * | 15| 31:0 | RESERVED = MBZ | + * +---+-------+--------------------------------------------------------------+ + */ + +struct guc_ct_buffer_desc { + u32 head; + u32 tail; + u32 status; +#define GUC_CTB_STATUS_NO_ERROR 0 +#define GUC_CTB_STATUS_OVERFLOW (1 << 0) +#define GUC_CTB_STATUS_UNDERFLOW (1 << 1) +#define GUC_CTB_STATUS_MISMATCH (1 << 2) + u32 reserved[13]; +} __packed; +static_assert(sizeof(struct guc_ct_buffer_desc) == 64); + +/** + * DOC: CTB Message + * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | 0 | 31:16 | **FENCE** - message identifier | + * | +-------+--------------------------------------------------------------+ + * | | 15:12 | **FORMAT** - format of the CTB message | + * | | | - _`GUC_CTB_FORMAT_HXG` = 0 - see `CTB HXG Message`_ | + * | +-------+--------------------------------------------------------------+ + * | | 11:8 | **RESERVED** | + * | +-------+--------------------------------------------------------------+ + * | | 7:0 | **NUM_DWORDS** - length of the CTB message (w/o header) | + * +---+-------+--------------------------------------------------------------+ + * | 1 | 31:0 | optional (depends on FORMAT) | + * +---+-------+ | + * |...| | | + * +---+-------+ | + * | n | 31:0 | | + * +---+-------+--------------------------------------------------------------+ + */ + +#define GUC_CTB_HDR_LEN 1u +#define GUC_CTB_MSG_MIN_LEN GUC_CTB_HDR_LEN +#define GUC_CTB_MSG_MAX_LEN 256u +#define GUC_CTB_MSG_0_FENCE (0xffff << 16) +#define GUC_CTB_MSG_0_FORMAT (0xf << 12) +#define GUC_CTB_FORMAT_HXG 0u +#define GUC_CTB_MSG_0_RESERVED (0xf << 8) +#define GUC_CTB_MSG_0_NUM_DWORDS (0xff << 0) + +/** + * DOC: CTB HXG Message + * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | 0 | 31:16 | FENCE | + * | +-------+--------------------------------------------------------------+ + * | | 15:12 | FORMAT = GUC_CTB_FORMAT_HXG_ | + * | +-------+--------------------------------------------------------------+ + * | | 11:8 | RESERVED = MBZ | + * | +-------+--------------------------------------------------------------+ + * | | 7:0 | NUM_DWORDS = length (in dwords) of the embedded HXG message | + * +---+-------+--------------------------------------------------------------+ + * | 1 | 31:0 | | + * +---+-------+ | + * |...| | [Embedded `HXG Message`_] | + * +---+-------+ | + * | n | 31:0 | | + * +---+-------+--------------------------------------------------------------+ + */ + +#define GUC_CTB_HXG_MSG_MIN_LEN (GUC_CTB_MSG_MIN_LEN + GUC_HXG_MSG_MIN_LEN) +#define GUC_CTB_HXG_MSG_MAX_LEN GUC_CTB_MSG_MAX_LEN + +/** + * DOC: CTB based communication + * + * The CTB (command transport buffer) communication between Host and 
GuC + * is based on u32 data stream written to the shared buffer. One buffer can + * be used to transmit data only in one direction (one-directional channel). + * + * Current status of the each buffer is maintained in the `CTB Descriptor`_. + * Each message in data stream is encoded as `CTB HXG Message`_. + */ + +#endif diff --git a/drivers/gpu/drm/xe/abi/guc_communication_mmio_abi.h b/drivers/gpu/drm/xe/abi/guc_communication_mmio_abi.h new file mode 100644 index 000000000000..ef538e34f894 --- /dev/null +++ b/drivers/gpu/drm/xe/abi/guc_communication_mmio_abi.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2014-2021 Intel Corporation + */ + +#ifndef _ABI_GUC_COMMUNICATION_MMIO_ABI_H +#define _ABI_GUC_COMMUNICATION_MMIO_ABI_H + +/** + * DOC: GuC MMIO based communication + * + * The MMIO based communication between Host and GuC relies on special + * hardware registers which format could be defined by the software + * (so called scratch registers). + * + * Each MMIO based message, both Host to GuC (H2G) and GuC to Host (G2H) + * messages, which maximum length depends on number of available scratch + * registers, is directly written into those scratch registers. + * + * For Gen9+, there are 16 software scratch registers 0xC180-0xC1B8, + * but no H2G command takes more than 4 parameters and the GuC firmware + * itself uses an 4-element array to store the H2G message. + * + * For Gen11+, there are additional 4 registers 0x190240-0x19024C, which + * are, regardless on lower count, preferred over legacy ones. + * + * The MMIO based communication is mainly used during driver initialization + * phase to setup the `CTB based communication`_ that will be used afterwards. + */ + +#define GUC_MAX_MMIO_MSG_LEN 4 + +/** + * DOC: MMIO HXG Message + * + * Format of the MMIO messages follows definitions of `HXG Message`_. 
+ * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | 0 | 31:0 | | + * +---+-------+ | + * |...| | [Embedded `HXG Message`_] | + * +---+-------+ | + * | n | 31:0 | | + * +---+-------+--------------------------------------------------------------+ + */ + +#endif diff --git a/drivers/gpu/drm/xe/abi/guc_errors_abi.h b/drivers/gpu/drm/xe/abi/guc_errors_abi.h new file mode 100644 index 000000000000..ec83551bf9c0 --- /dev/null +++ b/drivers/gpu/drm/xe/abi/guc_errors_abi.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2014-2021 Intel Corporation + */ + +#ifndef _ABI_GUC_ERRORS_ABI_H +#define _ABI_GUC_ERRORS_ABI_H + +enum xe_guc_response_status { + XE_GUC_RESPONSE_STATUS_SUCCESS = 0x0, + XE_GUC_RESPONSE_STATUS_GENERIC_FAIL = 0xF000, +}; + +enum xe_guc_load_status { + XE_GUC_LOAD_STATUS_DEFAULT = 0x00, + XE_GUC_LOAD_STATUS_START = 0x01, + XE_GUC_LOAD_STATUS_ERROR_DEVID_BUILD_MISMATCH = 0x02, + XE_GUC_LOAD_STATUS_GUC_PREPROD_BUILD_MISMATCH = 0x03, + XE_GUC_LOAD_STATUS_ERROR_DEVID_INVALID_GUCTYPE = 0x04, + XE_GUC_LOAD_STATUS_GDT_DONE = 0x10, + XE_GUC_LOAD_STATUS_IDT_DONE = 0x20, + XE_GUC_LOAD_STATUS_LAPIC_DONE = 0x30, + XE_GUC_LOAD_STATUS_GUCINT_DONE = 0x40, + XE_GUC_LOAD_STATUS_DPC_READY = 0x50, + XE_GUC_LOAD_STATUS_DPC_ERROR = 0x60, + XE_GUC_LOAD_STATUS_EXCEPTION = 0x70, + XE_GUC_LOAD_STATUS_INIT_DATA_INVALID = 0x71, + XE_GUC_LOAD_STATUS_PXP_TEARDOWN_CTRL_ENABLED = 0x72, + XE_GUC_LOAD_STATUS_INVALID_INIT_DATA_RANGE_START, + XE_GUC_LOAD_STATUS_MPU_DATA_INVALID = 0x73, + XE_GUC_LOAD_STATUS_INIT_MMIO_SAVE_RESTORE_INVALID = 0x74, + XE_GUC_LOAD_STATUS_INVALID_INIT_DATA_RANGE_END, + + XE_GUC_LOAD_STATUS_READY = 0xF0, +}; + +#endif diff --git a/drivers/gpu/drm/xe/abi/guc_klvs_abi.h b/drivers/gpu/drm/xe/abi/guc_klvs_abi.h new file mode 100644 index 000000000000..47094b9b044c --- /dev/null +++ b/drivers/gpu/drm/xe/abi/guc_klvs_abi.h @@ -0,0 +1,322 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef _ABI_GUC_KLVS_ABI_H +#define _ABI_GUC_KLVS_ABI_H + +#include <linux/types.h> + +/** + * DOC: GuC KLV + * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | 0 | 31:16 | **KEY** - KLV key identifier | + * | | | - `GuC Self Config KLVs`_ | + * | | | - `GuC VGT Policy KLVs`_ | + * | | | - `GuC VF Configuration KLVs`_ | + * | | | | + * | +-------+--------------------------------------------------------------+ + * | | 15:0 | **LEN** - length of VALUE (in 32bit dwords) | + * +---+-------+--------------------------------------------------------------+ + * | 1 | 31:0 | **VALUE** - actual value of the KLV (format depends on KEY) | + * +---+-------+ | + * |...| | | + * +---+-------+ | + * | n | 31:0 | | + * +---+-------+--------------------------------------------------------------+ + */ + +#define GUC_KLV_LEN_MIN 1u +#define GUC_KLV_0_KEY (0xffff << 16) +#define GUC_KLV_0_LEN (0xffff << 0) +#define GUC_KLV_n_VALUE (0xffffffff << 0) + +/** + * DOC: GuC Self Config KLVs + * + * `GuC KLV`_ keys available for use with HOST2GUC_SELF_CFG_. + * + * _`GUC_KLV_SELF_CFG_MEMIRQ_STATUS_ADDR` : 0x0900 + * Refers to 64 bit Global Gfx address (in bytes) of memory based interrupts + * status vector for use by the GuC. 
+ * + * _`GUC_KLV_SELF_CFG_MEMIRQ_SOURCE_ADDR` : 0x0901 + * Refers to 64 bit Global Gfx address (in bytes) of memory based interrupts + * source vector for use by the GuC. + * + * _`GUC_KLV_SELF_CFG_H2G_CTB_ADDR` : 0x0902 + * Refers to 64 bit Global Gfx address of H2G `CT Buffer`_. + * Should be above WOPCM address but below APIC base address for native mode. + * + * _`GUC_KLV_SELF_CFG_H2G_CTB_DESCRIPTOR_ADDR : 0x0903 + * Refers to 64 bit Global Gfx address of H2G `CTB Descriptor`_. + * Should be above WOPCM address but below APIC base address for native mode. + * + * _`GUC_KLV_SELF_CFG_H2G_CTB_SIZE : 0x0904 + * Refers to size of H2G `CT Buffer`_ in bytes. + * Should be a multiple of 4K. + * + * _`GUC_KLV_SELF_CFG_G2H_CTB_ADDR : 0x0905 + * Refers to 64 bit Global Gfx address of G2H `CT Buffer`_. + * Should be above WOPCM address but below APIC base address for native mode. + * + * _GUC_KLV_SELF_CFG_G2H_CTB_DESCRIPTOR_ADDR : 0x0906 + * Refers to 64 bit Global Gfx address of G2H `CTB Descriptor`_. + * Should be above WOPCM address but below APIC base address for native mode. + * + * _GUC_KLV_SELF_CFG_G2H_CTB_SIZE : 0x0907 + * Refers to size of G2H `CT Buffer`_ in bytes. + * Should be a multiple of 4K. + */ + +#define GUC_KLV_SELF_CFG_MEMIRQ_STATUS_ADDR_KEY 0x0900 +#define GUC_KLV_SELF_CFG_MEMIRQ_STATUS_ADDR_LEN 2u + +#define GUC_KLV_SELF_CFG_MEMIRQ_SOURCE_ADDR_KEY 0x0901 +#define GUC_KLV_SELF_CFG_MEMIRQ_SOURCE_ADDR_LEN 2u + +#define GUC_KLV_SELF_CFG_H2G_CTB_ADDR_KEY 0x0902 +#define GUC_KLV_SELF_CFG_H2G_CTB_ADDR_LEN 2u + +#define GUC_KLV_SELF_CFG_H2G_CTB_DESCRIPTOR_ADDR_KEY 0x0903 +#define GUC_KLV_SELF_CFG_H2G_CTB_DESCRIPTOR_ADDR_LEN 2u + +#define GUC_KLV_SELF_CFG_H2G_CTB_SIZE_KEY 0x0904 +#define GUC_KLV_SELF_CFG_H2G_CTB_SIZE_LEN 1u + +#define GUC_KLV_SELF_CFG_G2H_CTB_ADDR_KEY 0x0905 +#define GUC_KLV_SELF_CFG_G2H_CTB_ADDR_LEN 2u + +#define GUC_KLV_SELF_CFG_G2H_CTB_DESCRIPTOR_ADDR_KEY 0x0906 +#define GUC_KLV_SELF_CFG_G2H_CTB_DESCRIPTOR_ADDR_LEN 2u + +#define GUC_KLV_SELF_CFG_G2H_CTB_SIZE_KEY 0x0907 +#define GUC_KLV_SELF_CFG_G2H_CTB_SIZE_LEN 1u + +/* + * Per context scheduling policy update keys. + */ +enum { + GUC_CONTEXT_POLICIES_KLV_ID_EXECUTION_QUANTUM = 0x2001, + GUC_CONTEXT_POLICIES_KLV_ID_PREEMPTION_TIMEOUT = 0x2002, + GUC_CONTEXT_POLICIES_KLV_ID_SCHEDULING_PRIORITY = 0x2003, + GUC_CONTEXT_POLICIES_KLV_ID_PREEMPT_TO_IDLE_ON_QUANTUM_EXPIRY = 0x2004, + GUC_CONTEXT_POLICIES_KLV_ID_SLPM_GT_FREQUENCY = 0x2005, + + GUC_CONTEXT_POLICIES_KLV_NUM_IDS = 5, +}; + +/** + * DOC: GuC VGT Policy KLVs + * + * `GuC KLV`_ keys available for use with PF2GUC_UPDATE_VGT_POLICY. + * + * _`GUC_KLV_VGT_POLICY_SCHED_IF_IDLE` : 0x8001 + * This config sets whether strict scheduling is enabled whereby any VF + * that doesn’t have work to submit is still allocated a fixed execution + * time-slice to ensure active VFs execution is always consitent even + * during other VF reprovisiong / rebooting events. Changing this KLV + * impacts all VFs and takes effect on the next VF-Switch event. + * + * :0: don't schedule idle (default) + * :1: schedule if idle + * + * _`GUC_KLV_VGT_POLICY_ADVERSE_SAMPLE_PERIOD` : 0x8002 + * This config sets the sample period for tracking adverse event counters. + * A sample period is the period in millisecs during which events are counted. + * This is applicable for all the VFs. 
+ * + * :0: adverse events are not counted (default) + * :n: sample period in milliseconds + * + * _`GUC_KLV_VGT_POLICY_RESET_AFTER_VF_SWITCH` : 0x8D00 + * This enum is to reset utilized HW engine after VF Switch (i.e to clean + * up Stale HW register left behind by previous VF) + * + * :0: don't reset (default) + * :1: reset + */ + +#define GUC_KLV_VGT_POLICY_SCHED_IF_IDLE_KEY 0x8001 +#define GUC_KLV_VGT_POLICY_SCHED_IF_IDLE_LEN 1u + +#define GUC_KLV_VGT_POLICY_ADVERSE_SAMPLE_PERIOD_KEY 0x8002 +#define GUC_KLV_VGT_POLICY_ADVERSE_SAMPLE_PERIOD_LEN 1u + +#define GUC_KLV_VGT_POLICY_RESET_AFTER_VF_SWITCH_KEY 0x8D00 +#define GUC_KLV_VGT_POLICY_RESET_AFTER_VF_SWITCH_LEN 1u + +/** + * DOC: GuC VF Configuration KLVs + * + * `GuC KLV`_ keys available for use with PF2GUC_UPDATE_VF_CFG. + * + * _`GUC_KLV_VF_CFG_GGTT_START` : 0x0001 + * A 4K aligned start GTT address/offset assigned to VF. + * Value is 64 bits. + * + * _`GUC_KLV_VF_CFG_GGTT_SIZE` : 0x0002 + * A 4K aligned size of GGTT assigned to VF. + * Value is 64 bits. + * + * _`GUC_KLV_VF_CFG_LMEM_SIZE` : 0x0003 + * A 2M aligned size of local memory assigned to VF. + * Value is 64 bits. + * + * _`GUC_KLV_VF_CFG_NUM_CONTEXTS` : 0x0004 + * Refers to the number of contexts allocated to this VF. + * + * :0: no contexts (default) + * :1-65535: number of contexts (Gen12) + * + * _`GUC_KLV_VF_CFG_TILE_MASK` : 0x0005 + * For multi-tiled products, this field contains the bitwise-OR of tiles + * assigned to the VF. Bit-0-set means VF has access to Tile-0, + * Bit-31-set means VF has access to Tile-31, and etc. + * At least one tile will always be allocated. + * If all bits are zero, VF KMD should treat this as a fatal error. + * For, single-tile products this KLV config is ignored. + * + * _`GUC_KLV_VF_CFG_NUM_DOORBELLS` : 0x0006 + * Refers to the number of doorbells allocated to this VF. + * + * :0: no doorbells (default) + * :1-255: number of doorbells (Gen12) + * + * _`GUC_KLV_VF_CFG_EXEC_QUANTUM` : 0x8A01 + * This config sets the VFs-execution-quantum in milliseconds. + * GUC will attempt to obey the maximum values as much as HW is capable + * of and this will never be perfectly-exact (accumulated nano-second + * granularity) since the GPUs clock time runs off a different crystal + * from the CPUs clock. Changing this KLV on a VF that is currently + * running a context wont take effect until a new context is scheduled in. + * That said, when the PF is changing this value from 0xFFFFFFFF to + * something else, it might never take effect if the VF is running an + * inifinitely long compute or shader kernel. In such a scenario, the + * PF would need to trigger a VM PAUSE and then change the KLV to force + * it to take effect. Such cases might typically happen on a 1PF+1VF + * Virtualization config enabled for heavier workloads like AI/ML. + * + * :0: infinite exec quantum (default) + * + * _`GUC_KLV_VF_CFG_PREEMPT_TIMEOUT` : 0x8A02 + * This config sets the VF-preemption-timeout in microseconds. + * GUC will attempt to obey the minimum and maximum values as much as + * HW is capable and this will never be perfectly-exact (accumulated + * nano-second granularity) since the GPUs clock time runs off a + * different crystal from the CPUs clock. Changing this KLV on a VF + * that is currently running a context wont take effect until a new + * context is scheduled in. + * That said, when the PF is changing this value from 0xFFFFFFFF to + * something else, it might never take effect if the VF is running an + * inifinitely long compute or shader kernel. 
+ * In this case, the PF would need to trigger a VM PAUSE and then change + * the KLV to force it to take effect. Such cases might typically happen + * on a 1PF+1VF Virtualization config enabled for heavier workloads like + * AI/ML. + * + * :0: no preemption timeout (default) + * + * _`GUC_KLV_VF_CFG_THRESHOLD_CAT_ERR` : 0x8A03 + * This config sets the threshold for CAT errors caused by the VF. + * + * :0: adverse events or errors will not be reported (default) + * :n: event occurrence count per sampling interval + * + * _`GUC_KLV_VF_CFG_THRESHOLD_ENGINE_RESET` : 0x8A04 + * This config sets the threshold for engine resets caused by the VF. + * + * :0: adverse events or errors will not be reported (default) + * :n: event occurrence count per sampling interval + * + * _`GUC_KLV_VF_CFG_THRESHOLD_PAGE_FAULT` : 0x8A05 + * This config sets the threshold for page fault errors caused by the VF. + * + * :0: adverse events or errors will not be reported (default) + * :n: event occurrence count per sampling interval + * + * _`GUC_KLV_VF_CFG_THRESHOLD_H2G_STORM` : 0x8A06 + * This config sets the threshold for H2G interrupts triggered by the VF. + * + * :0: adverse events or errors will not be reported (default) + * :n: time (us) per sampling interval + * + * _`GUC_KLV_VF_CFG_THRESHOLD_IRQ_STORM` : 0x8A07 + * This config sets the threshold for GT interrupts triggered by the VF's + * workloads. + * + * :0: adverse events or errors will not be reported (default) + * :n: time (us) per sampling interval + * + * _`GUC_KLV_VF_CFG_THRESHOLD_DOORBELL_STORM` : 0x8A08 + * This config sets the threshold for doorbell rings triggered by the VF. + * + * :0: adverse events or errors will not be reported (default) + * :n: time (us) per sampling interval + * + * _`GUC_KLV_VF_CFG_BEGIN_DOORBELL_ID` : 0x8A0A + * Refers to the start index of the doorbells assigned to this VF. + * + * :0: (default) + * :1-255: number of doorbells (Gen12) + * + * _`GUC_KLV_VF_CFG_BEGIN_CONTEXT_ID` : 0x8A0B + * Refers to the start index in the context array allocated for this VF's use.
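As a minimal sketch of the KLV wire format used by the VF configuration keys documented above (illustration only, not taken from this header, and assuming the generic KLV header layout defined earlier in this file: a 16-bit key in the upper half of the first dword and a 16-bit length, counted in dwords, in the lower half), a 64-bit value such as the GGTT start or size could be emitted as follows; the example_* helpers are hypothetical names:

#include <linux/bitfield.h>
#include <linux/kernel.h>
#include <linux/types.h>

/* Assumed to mirror the generic KLV header masks defined earlier in this file. */
#define EXAMPLE_KLV_0_KEY        (0xffffu << 16)
#define EXAMPLE_KLV_0_LEN        (0xffffu << 0)

/* Append one 64-bit (two-dword) KLV and return the number of dwords written. */
static u32 example_emit_klv64(u32 *cfg, u16 key, u64 value)
{
        cfg[0] = FIELD_PREP(EXAMPLE_KLV_0_KEY, key) |
                 FIELD_PREP(EXAMPLE_KLV_0_LEN, 2u);
        cfg[1] = lower_32_bits(value);
        cfg[2] = upper_32_bits(value);

        return 3; /* one header dword plus two value dwords */
}

/* Usage sketch: describe a VF's GGTT range with the keys defined below. */
static u32 example_fill_vf_ggtt(u32 *cfg, u64 ggtt_start, u64 ggtt_size)
{
        u32 n = 0;

        n += example_emit_klv64(cfg + n, GUC_KLV_VF_CFG_GGTT_START_KEY, ggtt_start);
        n += example_emit_klv64(cfg + n, GUC_KLV_VF_CFG_GGTT_SIZE_KEY, ggtt_size);

        return n;
}

How such a KLV stream is actually handed to the GuC (for example via PF2GUC_UPDATE_VF_CFG) is outside the scope of this header.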
+ * + * :0: (default) + * :1-65535: number of contexts (Gen12) + */ + +#define GUC_KLV_VF_CFG_GGTT_START_KEY 0x0001 +#define GUC_KLV_VF_CFG_GGTT_START_LEN 2u + +#define GUC_KLV_VF_CFG_GGTT_SIZE_KEY 0x0002 +#define GUC_KLV_VF_CFG_GGTT_SIZE_LEN 2u + +#define GUC_KLV_VF_CFG_LMEM_SIZE_KEY 0x0003 +#define GUC_KLV_VF_CFG_LMEM_SIZE_LEN 2u + +#define GUC_KLV_VF_CFG_NUM_CONTEXTS_KEY 0x0004 +#define GUC_KLV_VF_CFG_NUM_CONTEXTS_LEN 1u + +#define GUC_KLV_VF_CFG_TILE_MASK_KEY 0x0005 +#define GUC_KLV_VF_CFG_TILE_MASK_LEN 1u + +#define GUC_KLV_VF_CFG_NUM_DOORBELLS_KEY 0x0006 +#define GUC_KLV_VF_CFG_NUM_DOORBELLS_LEN 1u + +#define GUC_KLV_VF_CFG_EXEC_QUANTUM_KEY 0x8a01 +#define GUC_KLV_VF_CFG_EXEC_QUANTUM_LEN 1u + +#define GUC_KLV_VF_CFG_PREEMPT_TIMEOUT_KEY 0x8a02 +#define GUC_KLV_VF_CFG_PREEMPT_TIMEOUT_LEN 1u + +#define GUC_KLV_VF_CFG_THRESHOLD_CAT_ERR_KEY 0x8a03 +#define GUC_KLV_VF_CFG_THRESHOLD_CAT_ERR_LEN 1u + +#define GUC_KLV_VF_CFG_THRESHOLD_ENGINE_RESET_KEY 0x8a04 +#define GUC_KLV_VF_CFG_THRESHOLD_ENGINE_RESET_LEN 1u + +#define GUC_KLV_VF_CFG_THRESHOLD_PAGE_FAULT_KEY 0x8a05 +#define GUC_KLV_VF_CFG_THRESHOLD_PAGE_FAULT_LEN 1u + +#define GUC_KLV_VF_CFG_THRESHOLD_H2G_STORM_KEY 0x8a06 +#define GUC_KLV_VF_CFG_THRESHOLD_H2G_STORM_LEN 1u + +#define GUC_KLV_VF_CFG_THRESHOLD_IRQ_STORM_KEY 0x8a07 +#define GUC_KLV_VF_CFG_THRESHOLD_IRQ_STORM_LEN 1u + +#define GUC_KLV_VF_CFG_THRESHOLD_DOORBELL_STORM_KEY 0x8a08 +#define GUC_KLV_VF_CFG_THRESHOLD_DOORBELL_STORM_LEN 1u + +#define GUC_KLV_VF_CFG_BEGIN_DOORBELL_ID_KEY 0x8a0a +#define GUC_KLV_VF_CFG_BEGIN_DOORBELL_ID_LEN 1u + +#define GUC_KLV_VF_CFG_BEGIN_CONTEXT_ID_KEY 0x8a0b +#define GUC_KLV_VF_CFG_BEGIN_CONTEXT_ID_LEN 1u + +#endif diff --git a/drivers/gpu/drm/xe/abi/guc_messages_abi.h b/drivers/gpu/drm/xe/abi/guc_messages_abi.h new file mode 100644 index 000000000000..3d199016cf88 --- /dev/null +++ b/drivers/gpu/drm/xe/abi/guc_messages_abi.h @@ -0,0 +1,234 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2014-2021 Intel Corporation + */ + +#ifndef _ABI_GUC_MESSAGES_ABI_H +#define _ABI_GUC_MESSAGES_ABI_H + +/** + * DOC: HXG Message + * + * All messages exchanged with GuC are defined using 32 bit dwords. + * First dword is treated as a message header. Remaining dwords are optional. 
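As a minimal sketch (illustration only, not taken from this header), a host-originated request header can be packed from the ORIGIN/TYPE masks and the `HXG Request`_ field masks defined below using the standard <linux/bitfield.h> helpers; the example_* name is hypothetical and the caller must keep data0 within its 12-bit field:

#include <linux/bitfield.h>
#include <linux/types.h>

/* Dword 0 of a host-to-GuC request: ORIGIN = host, TYPE = request. */
static u32 example_hxg_request_header(u32 action, u32 data0)
{
        return FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
               FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
               FIELD_PREP(GUC_HXG_REQUEST_MSG_0_DATA0, data0) |
               FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, action);
}

Any further DATAn dwords follow this header verbatim, with their meaning defined by the ACTION code.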
+ * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | | | | + * | 0 | 31 | **ORIGIN** - originator of the message | + * | | | - _`GUC_HXG_ORIGIN_HOST` = 0 | + * | | | - _`GUC_HXG_ORIGIN_GUC` = 1 | + * | | | | + * | +-------+--------------------------------------------------------------+ + * | | 30:28 | **TYPE** - message type | + * | | | - _`GUC_HXG_TYPE_REQUEST` = 0 | + * | | | - _`GUC_HXG_TYPE_EVENT` = 1 | + * | | | - _`GUC_HXG_TYPE_NO_RESPONSE_BUSY` = 3 | + * | | | - _`GUC_HXG_TYPE_NO_RESPONSE_RETRY` = 5 | + * | | | - _`GUC_HXG_TYPE_RESPONSE_FAILURE` = 6 | + * | | | - _`GUC_HXG_TYPE_RESPONSE_SUCCESS` = 7 | + * | +-------+--------------------------------------------------------------+ + * | | 27:0 | **AUX** - auxiliary data (depends on TYPE) | + * +---+-------+--------------------------------------------------------------+ + * | 1 | 31:0 | | + * +---+-------+ | + * |...| | **PAYLOAD** - optional payload (depends on TYPE) | + * +---+-------+ | + * | n | 31:0 | | + * +---+-------+--------------------------------------------------------------+ + */ + +#define GUC_HXG_MSG_MIN_LEN 1u +#define GUC_HXG_MSG_0_ORIGIN (0x1 << 31) +#define GUC_HXG_ORIGIN_HOST 0u +#define GUC_HXG_ORIGIN_GUC 1u +#define GUC_HXG_MSG_0_TYPE (0x7 << 28) +#define GUC_HXG_TYPE_REQUEST 0u +#define GUC_HXG_TYPE_EVENT 1u +#define GUC_HXG_TYPE_NO_RESPONSE_BUSY 3u +#define GUC_HXG_TYPE_NO_RESPONSE_RETRY 5u +#define GUC_HXG_TYPE_RESPONSE_FAILURE 6u +#define GUC_HXG_TYPE_RESPONSE_SUCCESS 7u +#define GUC_HXG_MSG_0_AUX (0xfffffff << 0) +#define GUC_HXG_MSG_n_PAYLOAD (0xffffffff << 0) + +/** + * DOC: HXG Request + * + * The `HXG Request`_ message should be used to initiate synchronous activity + * for which confirmation or return data is expected. + * + * The recipient of this message shall use `HXG Response`_, `HXG Failure`_ + * or `HXG Retry`_ message as a definite reply, and may use `HXG Busy`_ + * message as a intermediate reply. + * + * Format of @DATA0 and all @DATAn fields depends on the @ACTION code. + * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | 0 | 31 | ORIGIN | + * | +-------+--------------------------------------------------------------+ + * | | 30:28 | TYPE = GUC_HXG_TYPE_REQUEST_ | + * | +-------+--------------------------------------------------------------+ + * | | 27:16 | **DATA0** - request data (depends on ACTION) | + * | +-------+--------------------------------------------------------------+ + * | | 15:0 | **ACTION** - requested action code | + * +---+-------+--------------------------------------------------------------+ + * | 1 | 31:0 | | + * +---+-------+ | + * |...| | **DATAn** - optional data (depends on ACTION) | + * +---+-------+ | + * | n | 31:0 | | + * +---+-------+--------------------------------------------------------------+ + */ + +#define GUC_HXG_REQUEST_MSG_MIN_LEN GUC_HXG_MSG_MIN_LEN +#define GUC_HXG_REQUEST_MSG_0_DATA0 (0xfff << 16) +#define GUC_HXG_REQUEST_MSG_0_ACTION (0xffff << 0) +#define GUC_HXG_REQUEST_MSG_n_DATAn GUC_HXG_MSG_n_PAYLOAD + +/** + * DOC: HXG Event + * + * The `HXG Event`_ message should be used to initiate asynchronous activity + * that does not involves immediate confirmation nor data. + * + * Format of @DATA0 and all @DATAn fields depends on the @ACTION code. 
+ * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | 0 | 31 | ORIGIN | + * | +-------+--------------------------------------------------------------+ + * | | 30:28 | TYPE = GUC_HXG_TYPE_EVENT_ | + * | +-------+--------------------------------------------------------------+ + * | | 27:16 | **DATA0** - event data (depends on ACTION) | + * | +-------+--------------------------------------------------------------+ + * | | 15:0 | **ACTION** - event action code | + * +---+-------+--------------------------------------------------------------+ + * | 1 | 31:0 | | + * +---+-------+ | + * |...| | **DATAn** - optional event data (depends on ACTION) | + * +---+-------+ | + * | n | 31:0 | | + * +---+-------+--------------------------------------------------------------+ + */ + +#define GUC_HXG_EVENT_MSG_MIN_LEN GUC_HXG_MSG_MIN_LEN +#define GUC_HXG_EVENT_MSG_0_DATA0 (0xfff << 16) +#define GUC_HXG_EVENT_MSG_0_ACTION (0xffff << 0) +#define GUC_HXG_EVENT_MSG_n_DATAn GUC_HXG_MSG_n_PAYLOAD + +/** + * DOC: HXG Busy + * + * The `HXG Busy`_ message may be used to acknowledge reception of the `HXG Request`_ + * message if the recipient expects that it processing will be longer than default + * timeout. + * + * The @COUNTER field may be used as a progress indicator. + * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | 0 | 31 | ORIGIN | + * | +-------+--------------------------------------------------------------+ + * | | 30:28 | TYPE = GUC_HXG_TYPE_NO_RESPONSE_BUSY_ | + * | +-------+--------------------------------------------------------------+ + * | | 27:0 | **COUNTER** - progress indicator | + * +---+-------+--------------------------------------------------------------+ + */ + +#define GUC_HXG_BUSY_MSG_LEN GUC_HXG_MSG_MIN_LEN +#define GUC_HXG_BUSY_MSG_0_COUNTER GUC_HXG_MSG_0_AUX + +/** + * DOC: HXG Retry + * + * The `HXG Retry`_ message should be used by recipient to indicate that the + * `HXG Request`_ message was dropped and it should be resent again. + * + * The @REASON field may be used to provide additional information. + * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | 0 | 31 | ORIGIN | + * | +-------+--------------------------------------------------------------+ + * | | 30:28 | TYPE = GUC_HXG_TYPE_NO_RESPONSE_RETRY_ | + * | +-------+--------------------------------------------------------------+ + * | | 27:0 | **REASON** - reason for retry | + * | | | - _`GUC_HXG_RETRY_REASON_UNSPECIFIED` = 0 | + * +---+-------+--------------------------------------------------------------+ + */ + +#define GUC_HXG_RETRY_MSG_LEN GUC_HXG_MSG_MIN_LEN +#define GUC_HXG_RETRY_MSG_0_REASON GUC_HXG_MSG_0_AUX +#define GUC_HXG_RETRY_REASON_UNSPECIFIED 0u + +/** + * DOC: HXG Failure + * + * The `HXG Failure`_ message shall be used as a reply to the `HXG Request`_ + * message that could not be processed due to an error. 
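As a minimal sketch of consuming the reply types defined earlier in this header (illustration only, not taken from this header), the first dword of a reply can be decoded with the same masks; the errno mapping below is just one plausible convention, not the policy of any particular transport code, and the example_* name is hypothetical:

#include <linux/bitfield.h>
#include <linux/errno.h>
#include <linux/types.h>

/* Classify reply dword 0: definite replies vs. "not done yet". */
static int example_hxg_classify_reply(u32 header)
{
        if (FIELD_GET(GUC_HXG_MSG_0_ORIGIN, header) != GUC_HXG_ORIGIN_GUC)
                return -EPROTO;

        switch (FIELD_GET(GUC_HXG_MSG_0_TYPE, header)) {
        case GUC_HXG_TYPE_RESPONSE_SUCCESS:
                return 0;        /* DATA0 carries any return data */
        case GUC_HXG_TYPE_RESPONSE_FAILURE:
                return -EIO;     /* ERROR and HINT describe the failure */
        case GUC_HXG_TYPE_NO_RESPONSE_BUSY:
        case GUC_HXG_TYPE_NO_RESPONSE_RETRY:
                return -EAGAIN;  /* intermediate reply: wait or resend */
        default:
                return -EPROTO;
        }
}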
+ * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | 0 | 31 | ORIGIN | + * | +-------+--------------------------------------------------------------+ + * | | 30:28 | TYPE = GUC_HXG_TYPE_RESPONSE_FAILURE_ | + * | +-------+--------------------------------------------------------------+ + * | | 27:16 | **HINT** - additional error hint | + * | +-------+--------------------------------------------------------------+ + * | | 15:0 | **ERROR** - error/result code | + * +---+-------+--------------------------------------------------------------+ + */ + +#define GUC_HXG_FAILURE_MSG_LEN GUC_HXG_MSG_MIN_LEN +#define GUC_HXG_FAILURE_MSG_0_HINT (0xfff << 16) +#define GUC_HXG_FAILURE_MSG_0_ERROR (0xffff << 0) + +/** + * DOC: HXG Response + * + * The `HXG Response`_ message shall be used as a reply to the `HXG Request`_ + * message that was successfully processed without an error. + * + * +---+-------+--------------------------------------------------------------+ + * | | Bits | Description | + * +===+=======+==============================================================+ + * | 0 | 31 | ORIGIN | + * | +-------+--------------------------------------------------------------+ + * | | 30:28 | TYPE = GUC_HXG_TYPE_RESPONSE_SUCCESS_ | + * | +-------+--------------------------------------------------------------+ + * | | 27:0 | **DATA0** - data (depends on ACTION from `HXG Request`_) | + * +---+-------+--------------------------------------------------------------+ + * | 1 | 31:0 | | + * +---+-------+ | + * |...| | **DATAn** - data (depends on ACTION from `HXG Request`_) | + * +---+-------+ | + * | n | 31:0 | | + * +---+-------+--------------------------------------------------------------+ + */ + +#define GUC_HXG_RESPONSE_MSG_MIN_LEN GUC_HXG_MSG_MIN_LEN +#define GUC_HXG_RESPONSE_MSG_0_DATA0 GUC_HXG_MSG_0_AUX +#define GUC_HXG_RESPONSE_MSG_n_DATAn GUC_HXG_MSG_n_PAYLOAD + +/* deprecated */ +#define INTEL_GUC_MSG_TYPE_SHIFT 28 +#define INTEL_GUC_MSG_TYPE_MASK (0xF << INTEL_GUC_MSG_TYPE_SHIFT) +#define INTEL_GUC_MSG_DATA_SHIFT 16 +#define INTEL_GUC_MSG_DATA_MASK (0xFFF << INTEL_GUC_MSG_DATA_SHIFT) +#define INTEL_GUC_MSG_CODE_SHIFT 0 +#define INTEL_GUC_MSG_CODE_MASK (0xFFFF << INTEL_GUC_MSG_CODE_SHIFT) + +enum intel_guc_msg_type { + INTEL_GUC_MSG_TYPE_REQUEST = 0x0, + INTEL_GUC_MSG_TYPE_RESPONSE = 0xF, +}; + +#endif diff --git a/drivers/gpu/drm/xe/compat-i915-headers/gem/i915_gem_lmem.h b/drivers/gpu/drm/xe/compat-i915-headers/gem/i915_gem_lmem.h new file mode 100644 index 000000000000..710cecca972d --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/gem/i915_gem_lmem.h @@ -0,0 +1 @@ +/* Empty */ diff --git a/drivers/gpu/drm/xe/compat-i915-headers/gem/i915_gem_mman.h b/drivers/gpu/drm/xe/compat-i915-headers/gem/i915_gem_mman.h new file mode 100644 index 000000000000..650ea2803a97 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/gem/i915_gem_mman.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _I915_GEM_MMAN_H_ +#define _I915_GEM_MMAN_H_ + +#include "xe_bo_types.h" +#include <drm/drm_prime.h> + +static inline int i915_gem_fb_mmap(struct xe_bo *bo, struct vm_area_struct *vma) +{ + return drm_gem_prime_mmap(&bo->ttm.base, vma); +} + +#endif diff --git a/drivers/gpu/drm/xe/compat-i915-headers/gem/i915_gem_object.h b/drivers/gpu/drm/xe/compat-i915-headers/gem/i915_gem_object.h new file mode 
100644 index 000000000000..5f19550cc845 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/gem/i915_gem_object.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _I915_GEM_OBJECT_H_ +#define _I915_GEM_OBJECT_H_ + +#include <linux/types.h> + +#include "xe_bo.h" + +#define i915_gem_object_is_shmem(obj) ((obj)->flags & XE_BO_CREATE_SYSTEM_BIT) + +static inline dma_addr_t i915_gem_object_get_dma_address(const struct xe_bo *bo, pgoff_t n) +{ + /* Should never be called */ + WARN_ON(1); + return n; +} + +static inline bool i915_gem_object_is_tiled(const struct xe_bo *bo) +{ + /* legacy tiling is unused */ + return false; +} + +static inline bool i915_gem_object_is_userptr(const struct xe_bo *bo) +{ + /* legacy tiling is unused */ + return false; +} + +static inline int i915_gem_object_read_from_page(struct xe_bo *bo, + u32 ofs, u64 *ptr, u32 size) +{ + struct ttm_bo_kmap_obj map; + void *virtual; + bool is_iomem; + int ret; + + XE_WARN_ON(size != 8); + + ret = xe_bo_lock(bo, true); + if (ret) + return ret; + + ret = ttm_bo_kmap(&bo->ttm, ofs >> PAGE_SHIFT, 1, &map); + if (ret) + goto out_unlock; + + ofs &= ~PAGE_MASK; + virtual = ttm_kmap_obj_virtual(&map, &is_iomem); + if (is_iomem) + *ptr = readq((void __iomem *)(virtual + ofs)); + else + *ptr = *(u64 *)(virtual + ofs); + + ttm_bo_kunmap(&map); +out_unlock: + xe_bo_unlock(bo); + return ret; +} + +#endif diff --git a/drivers/gpu/drm/xe/compat-i915-headers/gem/i915_gem_object_frontbuffer.h b/drivers/gpu/drm/xe/compat-i915-headers/gem/i915_gem_object_frontbuffer.h new file mode 100644 index 000000000000..2a3f12d2978c --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/gem/i915_gem_object_frontbuffer.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _I915_GEM_OBJECT_FRONTBUFFER_H_ +#define _I915_GEM_OBJECT_FRONTBUFFER_H_ + +#define i915_gem_object_get_frontbuffer(obj) NULL +#define i915_gem_object_set_frontbuffer(obj, front) (front) + +#endif diff --git a/drivers/gpu/drm/xe/compat-i915-headers/gt/intel_rps.h b/drivers/gpu/drm/xe/compat-i915-headers/gt/intel_rps.h new file mode 100644 index 000000000000..21fec9cc837c --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/gt/intel_rps.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef __INTEL_RPS_H__ +#define __INTEL_RPS_H__ + +#define gen5_rps_irq_handler(x) ({}) + +#endif /* __INTEL_RPS_H__ */ diff --git a/drivers/gpu/drm/xe/compat-i915-headers/i915_active.h b/drivers/gpu/drm/xe/compat-i915-headers/i915_active.h new file mode 100644 index 000000000000..6f0ab3753563 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/i915_active.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _I915_ACTIVE_H_ +#define _I915_ACTIVE_H_ + +#include "i915_active_types.h" + +static inline void i915_active_init(struct i915_active *ref, + int (*active)(struct i915_active *ref), + void (*retire)(struct i915_active *ref), + unsigned long flags) +{ + (void) active; + (void) retire; +} + +#define i915_active_fini(active) do { } while (0) + +#endif diff --git a/drivers/gpu/drm/xe/compat-i915-headers/i915_active_types.h b/drivers/gpu/drm/xe/compat-i915-headers/i915_active_types.h new file mode 100644 index 000000000000..8c31f9a8b168 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/i915_active_types.h @@ -0,0 +1,13 @@ +/* + * 
SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef _I915_ACTIVE_TYPES_H_ +#define _I915_ACTIVE_TYPES_H_ + +struct i915_active {}; +#define I915_ACTIVE_RETIRE_SLEEPS 0 + +#endif /* _I915_ACTIVE_TYPES_H_ */ diff --git a/drivers/gpu/drm/xe/compat-i915-headers/i915_config.h b/drivers/gpu/drm/xe/compat-i915-headers/i915_config.h new file mode 100644 index 000000000000..e835bea08d1b --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/i915_config.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef __I915_CONFIG_H__ +#define __I915_CONFIG_H__ + +#include <linux/sched.h> + +struct drm_i915_private; + +static inline unsigned long +i915_fence_timeout(const struct drm_i915_private *i915) +{ + return MAX_SCHEDULE_TIMEOUT; +} + +#endif /* __I915_CONFIG_H__ */ diff --git a/drivers/gpu/drm/xe/compat-i915-headers/i915_debugfs.h b/drivers/gpu/drm/xe/compat-i915-headers/i915_debugfs.h new file mode 100644 index 000000000000..b4c47617b64b --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/i915_debugfs.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef __I915_DEBUGFS_H__ +#define __I915_DEBUGFS_H__ + +struct drm_i915_gem_object; +struct seq_file; + +static inline void i915_debugfs_describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj) {} + +#endif /* __I915_DEBUGFS_H__ */ diff --git a/drivers/gpu/drm/xe/compat-i915-headers/i915_drv.h b/drivers/gpu/drm/xe/compat-i915-headers/i915_drv.h new file mode 100644 index 000000000000..5d2a77b52db4 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/i915_drv.h @@ -0,0 +1,233 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ +#ifndef _XE_I915_DRV_H_ +#define _XE_I915_DRV_H_ + +/* + * "Adaptation header" to allow i915 display to also build for xe driver. 
+ * TODO: refactor i915 and xe so this can cease to exist + */ + +#include <drm/drm_drv.h> + +#include "gem/i915_gem_object.h" + +#include "soc/intel_pch.h" +#include "xe_device.h" +#include "xe_bo.h" +#include "xe_pm.h" +#include "xe_step.h" +#include "i915_gem.h" +#include "i915_gem_stolen.h" +#include "i915_gpu_error.h" +#include "i915_reg_defs.h" +#include "i915_utils.h" +#include "intel_gt_types.h" +#include "intel_step.h" +#include "intel_uc_fw.h" +#include "intel_uncore.h" +#include "intel_runtime_pm.h" +#include <linux/pm_runtime.h> + +static inline struct drm_i915_private *to_i915(const struct drm_device *dev) +{ + return container_of(dev, struct drm_i915_private, drm); +} + +static inline struct drm_i915_private *kdev_to_i915(struct device *kdev) +{ + return dev_get_drvdata(kdev); +} + + +#define INTEL_JASPERLAKE 0 +#define INTEL_ELKHARTLAKE 0 +#define IS_PLATFORM(xe, x) ((xe)->info.platform == x) +#define INTEL_INFO(dev_priv) (&((dev_priv)->info)) +#define INTEL_DEVID(dev_priv) ((dev_priv)->info.devid) +#define IS_I830(dev_priv) (dev_priv && 0) +#define IS_I845G(dev_priv) (dev_priv && 0) +#define IS_I85X(dev_priv) (dev_priv && 0) +#define IS_I865G(dev_priv) (dev_priv && 0) +#define IS_I915G(dev_priv) (dev_priv && 0) +#define IS_I915GM(dev_priv) (dev_priv && 0) +#define IS_I945G(dev_priv) (dev_priv && 0) +#define IS_I945GM(dev_priv) (dev_priv && 0) +#define IS_I965G(dev_priv) (dev_priv && 0) +#define IS_I965GM(dev_priv) (dev_priv && 0) +#define IS_G45(dev_priv) (dev_priv && 0) +#define IS_GM45(dev_priv) (dev_priv && 0) +#define IS_G4X(dev_priv) (dev_priv && 0) +#define IS_PINEVIEW(dev_priv) (dev_priv && 0) +#define IS_G33(dev_priv) (dev_priv && 0) +#define IS_IRONLAKE(dev_priv) (dev_priv && 0) +#define IS_IRONLAKE_M(dev_priv) (dev_priv && 0) +#define IS_SANDYBRIDGE(dev_priv) (dev_priv && 0) +#define IS_IVYBRIDGE(dev_priv) (dev_priv && 0) +#define IS_IVB_GT1(dev_priv) (dev_priv && 0) +#define IS_VALLEYVIEW(dev_priv) (dev_priv && 0) +#define IS_CHERRYVIEW(dev_priv) (dev_priv && 0) +#define IS_HASWELL(dev_priv) (dev_priv && 0) +#define IS_BROADWELL(dev_priv) (dev_priv && 0) +#define IS_SKYLAKE(dev_priv) (dev_priv && 0) +#define IS_BROXTON(dev_priv) (dev_priv && 0) +#define IS_KABYLAKE(dev_priv) (dev_priv && 0) +#define IS_GEMINILAKE(dev_priv) (dev_priv && 0) +#define IS_COFFEELAKE(dev_priv) (dev_priv && 0) +#define IS_COMETLAKE(dev_priv) (dev_priv && 0) +#define IS_ICELAKE(dev_priv) (dev_priv && 0) +#define IS_JASPERLAKE(dev_priv) (dev_priv && 0) +#define IS_ELKHARTLAKE(dev_priv) (dev_priv && 0) +#define IS_TIGERLAKE(dev_priv) IS_PLATFORM(dev_priv, XE_TIGERLAKE) +#define IS_ROCKETLAKE(dev_priv) IS_PLATFORM(dev_priv, XE_ROCKETLAKE) +#define IS_DG1(dev_priv) IS_PLATFORM(dev_priv, XE_DG1) +#define IS_ALDERLAKE_S(dev_priv) IS_PLATFORM(dev_priv, XE_ALDERLAKE_S) +#define IS_ALDERLAKE_P(dev_priv) IS_PLATFORM(dev_priv, XE_ALDERLAKE_P) +#define IS_XEHPSDV(dev_priv) (dev_priv && 0) +#define IS_DG2(dev_priv) IS_PLATFORM(dev_priv, XE_DG2) +#define IS_PONTEVECCHIO(dev_priv) IS_PLATFORM(dev_priv, XE_PVC) +#define IS_METEORLAKE(dev_priv) IS_PLATFORM(dev_priv, XE_METEORLAKE) +#define IS_LUNARLAKE(dev_priv) IS_PLATFORM(dev_priv, XE_LUNARLAKE) + +#define IS_HASWELL_ULT(dev_priv) (dev_priv && 0) +#define IS_BROADWELL_ULT(dev_priv) (dev_priv && 0) +#define IS_BROADWELL_ULX(dev_priv) (dev_priv && 0) + +#define IP_VER(ver, rel) ((ver) << 8 | (rel)) + +#define INTEL_DISPLAY_ENABLED(xe) (HAS_DISPLAY((xe)) && !intel_opregion_headless_sku((xe))) + +#define IS_GRAPHICS_VER(xe, first, last) \ + 
((xe)->info.graphics_verx100 >= first * 100 && \ + (xe)->info.graphics_verx100 <= (last*100 + 99)) +#define IS_MOBILE(xe) (xe && 0) +#define HAS_LLC(xe) (!IS_DGFX((xe))) + +#define HAS_GMD_ID(xe) GRAPHICS_VERx100(xe) >= 1270 + +/* Workarounds not handled yet */ +#define IS_DISPLAY_STEP(xe, first, last) ({u8 __step = (xe)->info.step.display; first <= __step && __step <= last; }) +#define IS_GRAPHICS_STEP(xe, first, last) ({u8 __step = (xe)->info.step.graphics; first <= __step && __step <= last; }) + +#define IS_LP(xe) (0) +#define IS_GEN9_LP(xe) (0) +#define IS_GEN9_BC(xe) (0) + +#define IS_TIGERLAKE_UY(xe) (xe && 0) +#define IS_COMETLAKE_ULX(xe) (xe && 0) +#define IS_COFFEELAKE_ULX(xe) (xe && 0) +#define IS_KABYLAKE_ULX(xe) (xe && 0) +#define IS_SKYLAKE_ULX(xe) (xe && 0) +#define IS_HASWELL_ULX(xe) (xe && 0) +#define IS_COMETLAKE_ULT(xe) (xe && 0) +#define IS_COFFEELAKE_ULT(xe) (xe && 0) +#define IS_KABYLAKE_ULT(xe) (xe && 0) +#define IS_SKYLAKE_ULT(xe) (xe && 0) + +#define IS_DG1_GRAPHICS_STEP(xe, first, last) (IS_DG1(xe) && IS_GRAPHICS_STEP(xe, first, last)) +#define IS_DG2_GRAPHICS_STEP(xe, variant, first, last) \ + ((xe)->info.subplatform == XE_SUBPLATFORM_DG2_ ## variant && \ + IS_GRAPHICS_STEP(xe, first, last)) +#define IS_XEHPSDV_GRAPHICS_STEP(xe, first, last) (IS_XEHPSDV(xe) && IS_GRAPHICS_STEP(xe, first, last)) + +/* XXX: No basedie stepping support yet */ +#define IS_PVC_BD_STEP(xe, first, last) (!WARN_ON(1) && IS_PONTEVECCHIO(xe)) + +#define IS_TIGERLAKE_DISPLAY_STEP(xe, first, last) (IS_TIGERLAKE(xe) && IS_DISPLAY_STEP(xe, first, last)) +#define IS_ROCKETLAKE_DISPLAY_STEP(xe, first, last) (IS_ROCKETLAKE(xe) && IS_DISPLAY_STEP(xe, first, last)) +#define IS_DG1_DISPLAY_STEP(xe, first, last) (IS_DG1(xe) && IS_DISPLAY_STEP(xe, first, last)) +#define IS_DG2_DISPLAY_STEP(xe, first, last) (IS_DG2(xe) && IS_DISPLAY_STEP(xe, first, last)) +#define IS_ADLP_DISPLAY_STEP(xe, first, last) (IS_ALDERLAKE_P(xe) && IS_DISPLAY_STEP(xe, first, last)) +#define IS_ADLS_DISPLAY_STEP(xe, first, last) (IS_ALDERLAKE_S(xe) && IS_DISPLAY_STEP(xe, first, last)) +#define IS_JSL_EHL_DISPLAY_STEP(xe, first, last) (IS_JSL_EHL(xe) && IS_DISPLAY_STEP(xe, first, last)) +#define IS_MTL_DISPLAY_STEP(xe, first, last) (IS_METEORLAKE(xe) && IS_DISPLAY_STEP(xe, first, last)) + +/* FIXME: Add subplatform here */ +#define IS_MTL_GRAPHICS_STEP(xe, sub, first, last) (IS_METEORLAKE(xe) && IS_DISPLAY_STEP(xe, first, last)) + +#define IS_DG2_G10(xe) ((xe)->info.subplatform == XE_SUBPLATFORM_DG2_G10) +#define IS_DG2_G11(xe) ((xe)->info.subplatform == XE_SUBPLATFORM_DG2_G11) +#define IS_DG2_G12(xe) ((xe)->info.subplatform == XE_SUBPLATFORM_DG2_G12) +#define IS_RAPTORLAKE_U(xe) ((xe)->info.subplatform == XE_SUBPLATFORM_ALDERLAKE_P_RPLU) +#define IS_ICL_WITH_PORT_F(xe) (xe && 0) +#define HAS_FLAT_CCS(xe) (xe_device_has_flat_ccs(xe)) +#define to_intel_bo(x) gem_to_xe_bo((x)) +#define mkwrite_device_info(xe) (INTEL_INFO(xe)) + +#define HAS_128_BYTE_Y_TILING(xe) (xe || 1) + +#define intel_has_gpu_reset(a) (a && 0) + +#include "intel_wakeref.h" + +static inline bool intel_runtime_pm_get(struct xe_runtime_pm *pm) +{ + struct xe_device *xe = container_of(pm, struct xe_device, runtime_pm); + + if (xe_pm_runtime_get(xe) < 0) { + xe_pm_runtime_put(xe); + return false; + } + return true; +} + +static inline bool intel_runtime_pm_get_if_in_use(struct xe_runtime_pm *pm) +{ + struct xe_device *xe = container_of(pm, struct xe_device, runtime_pm); + + return xe_pm_runtime_get_if_active(xe); +} + +static inline void 
intel_runtime_pm_put_unchecked(struct xe_runtime_pm *pm) +{ + struct xe_device *xe = container_of(pm, struct xe_device, runtime_pm); + + xe_pm_runtime_put(xe); +} + +static inline void intel_runtime_pm_put(struct xe_runtime_pm *pm, bool wakeref) +{ + if (wakeref) + intel_runtime_pm_put_unchecked(pm); +} + +#define intel_runtime_pm_get_raw intel_runtime_pm_get +#define intel_runtime_pm_put_raw intel_runtime_pm_put +#define assert_rpm_wakelock_held(x) do { } while (0) +#define assert_rpm_raw_wakeref_held(x) do { } while (0) + +#define intel_uncore_forcewake_get(x, y) do { } while (0) +#define intel_uncore_forcewake_put(x, y) do { } while (0) + +#define intel_uncore_arm_unclaimed_mmio_detection(x) do { } while (0) + +#define I915_PRIORITY_DISPLAY 0 +struct i915_sched_attr { + int priority; +}; +#define i915_gem_fence_wait_priority(fence, attr) do { (void) attr; } while (0) + +#define with_intel_runtime_pm(rpm, wf) \ + for ((wf) = intel_runtime_pm_get(rpm); (wf); \ + intel_runtime_pm_put((rpm), (wf)), (wf) = 0) + +#define pdev_to_i915 pdev_to_xe_device +#define RUNTIME_INFO(xe) (&(xe)->info.i915_runtime) + +#define FORCEWAKE_ALL XE_FORCEWAKE_ALL +#define HPD_STORM_DEFAULT_THRESHOLD 50 + +#ifdef CONFIG_ARM64 +/* + * arm64 indirectly includes linux/rtc.h, + * which defines a irq_lock, so include it + * here before #define-ing it + */ +#include <linux/rtc.h> +#endif + +#define irq_lock irq.lock + +#endif diff --git a/drivers/gpu/drm/xe/compat-i915-headers/i915_fixed.h b/drivers/gpu/drm/xe/compat-i915-headers/i915_fixed.h new file mode 100644 index 000000000000..12c671fd5235 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/i915_fixed.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#include "../../i915/i915_fixed.h" diff --git a/drivers/gpu/drm/xe/compat-i915-headers/i915_gem.h b/drivers/gpu/drm/xe/compat-i915-headers/i915_gem.h new file mode 100644 index 000000000000..06b723a479c5 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/i915_gem.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef __I915_GEM_H__ +#define __I915_GEM_H__ +#define GEM_BUG_ON +#endif diff --git a/drivers/gpu/drm/xe/compat-i915-headers/i915_gem_stolen.h b/drivers/gpu/drm/xe/compat-i915-headers/i915_gem_stolen.h new file mode 100644 index 000000000000..888e7a87a925 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/i915_gem_stolen.h @@ -0,0 +1,79 @@ +#ifndef _I915_GEM_STOLEN_H_ +#define _I915_GEM_STOLEN_H_ + +#include "xe_ttm_stolen_mgr.h" +#include "xe_res_cursor.h" + +struct xe_bo; + +struct i915_stolen_fb { + struct xe_bo *bo; +}; + +static inline int i915_gem_stolen_insert_node_in_range(struct xe_device *xe, + struct i915_stolen_fb *fb, + u32 size, u32 align, + u32 start, u32 end) +{ + struct xe_bo *bo; + int err; + u32 flags = XE_BO_CREATE_PINNED_BIT | XE_BO_CREATE_STOLEN_BIT; + + bo = xe_bo_create_locked_range(xe, xe_device_get_root_tile(xe), + NULL, size, start, end, + ttm_bo_type_kernel, flags); + if (IS_ERR(bo)) { + err = PTR_ERR(bo); + bo = NULL; + return err; + } + err = xe_bo_pin(bo); + xe_bo_unlock_vm_held(bo); + + if (err) { + xe_bo_put(fb->bo); + bo = NULL; + } + + fb->bo = bo; + + return err; +} + +static inline int i915_gem_stolen_insert_node(struct xe_device *xe, + struct i915_stolen_fb *fb, + u32 size, u32 align) +{ + /* Not used on xe */ + BUG_ON(1); + return -ENODEV; +} + +static inline void i915_gem_stolen_remove_node(struct xe_device *xe, + struct i915_stolen_fb 
*fb) +{ + xe_bo_unpin_map_no_vm(fb->bo); + fb->bo = NULL; +} + +#define i915_gem_stolen_initialized(xe) (!!ttm_manager_type(&(xe)->ttm, XE_PL_STOLEN)) +#define i915_gem_stolen_node_allocated(fb) (!!((fb)->bo)) + +static inline u32 i915_gem_stolen_node_offset(struct i915_stolen_fb *fb) +{ + struct xe_res_cursor res; + + xe_res_first(fb->bo->ttm.resource, 0, 4096, &res); + return res.start; +} + +/* Used for < gen4. These are not supported by Xe */ +#define i915_gem_stolen_area_address(xe) (!WARN_ON(1)) +/* Used for gen9 specific WA. Gen9 is not supported by Xe */ +#define i915_gem_stolen_area_size(xe) (!WARN_ON(1)) + +#define i915_gem_stolen_node_address(xe, fb) (xe_ttm_stolen_gpu_offset(xe) + \ + i915_gem_stolen_node_offset(fb)) +#define i915_gem_stolen_node_size(fb) ((u64)((fb)->bo->ttm.base.size)) + +#endif diff --git a/drivers/gpu/drm/xe/compat-i915-headers/i915_gpu_error.h b/drivers/gpu/drm/xe/compat-i915-headers/i915_gpu_error.h new file mode 100644 index 000000000000..98e9dd78f670 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/i915_gpu_error.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _I915_GPU_ERROR_H_ +#define _I915_GPU_ERROR_H_ + +struct drm_i915_error_state_buf; + +__printf(2, 3) +static inline void +i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...) +{ +} + +#endif diff --git a/drivers/gpu/drm/xe/compat-i915-headers/i915_irq.h b/drivers/gpu/drm/xe/compat-i915-headers/i915_irq.h new file mode 100644 index 000000000000..61707a07f91f --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/i915_irq.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#include "../../i915/i915_irq.h" diff --git a/drivers/gpu/drm/xe/compat-i915-headers/i915_reg.h b/drivers/gpu/drm/xe/compat-i915-headers/i915_reg.h new file mode 100644 index 000000000000..8619ec015ad4 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/i915_reg.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#include "../../i915/i915_reg.h" diff --git a/drivers/gpu/drm/xe/compat-i915-headers/i915_reg_defs.h b/drivers/gpu/drm/xe/compat-i915-headers/i915_reg_defs.h new file mode 100644 index 000000000000..723279c975b1 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/i915_reg_defs.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#include "../../i915/i915_reg_defs.h" diff --git a/drivers/gpu/drm/xe/compat-i915-headers/i915_trace.h b/drivers/gpu/drm/xe/compat-i915-headers/i915_trace.h new file mode 100644 index 000000000000..d429d421ac70 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/i915_trace.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#define trace_i915_reg_rw(a...) 
do { } while (0) diff --git a/drivers/gpu/drm/xe/compat-i915-headers/i915_utils.h b/drivers/gpu/drm/xe/compat-i915-headers/i915_utils.h new file mode 100644 index 000000000000..1d7c4360e5c0 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/i915_utils.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#include "../../i915/i915_utils.h" diff --git a/drivers/gpu/drm/xe/compat-i915-headers/i915_vgpu.h b/drivers/gpu/drm/xe/compat-i915-headers/i915_vgpu.h new file mode 100644 index 000000000000..80b024d435dc --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/i915_vgpu.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _I915_VGPU_H_ +#define _I915_VGPU_H_ + +#include <linux/types.h> + +struct drm_i915_private; +struct i915_ggtt; + +static inline void intel_vgpu_detect(struct drm_i915_private *i915) +{ +} +static inline bool intel_vgpu_active(struct drm_i915_private *i915) +{ + return false; +} +static inline void intel_vgpu_register(struct drm_i915_private *i915) +{ +} +static inline bool intel_vgpu_has_full_ppgtt(struct drm_i915_private *i915) +{ + return false; +} +static inline bool intel_vgpu_has_hwsp_emulation(struct drm_i915_private *i915) +{ + return false; +} +static inline bool intel_vgpu_has_huge_gtt(struct drm_i915_private *i915) +{ + return false; +} +static inline int intel_vgt_balloon(struct i915_ggtt *ggtt) +{ + return 0; +} +static inline void intel_vgt_deballoon(struct i915_ggtt *ggtt) +{ +} + +#endif /* _I915_VGPU_H_ */ diff --git a/drivers/gpu/drm/xe/compat-i915-headers/i915_vma.h b/drivers/gpu/drm/xe/compat-i915-headers/i915_vma.h new file mode 100644 index 000000000000..a20d2638ea7a --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/i915_vma.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef I915_VMA_H +#define I915_VMA_H + +#include <uapi/drm/i915_drm.h> +#include <drm/drm_mm.h> + +/* We don't want these from i915_drm.h in case of Xe */ +#undef I915_TILING_X +#undef I915_TILING_Y +#define I915_TILING_X 0 +#define I915_TILING_Y 0 + +struct xe_bo; + +struct i915_vma { + struct xe_bo *bo, *dpt; + struct drm_mm_node node; +}; + +#define i915_ggtt_clear_scanout(bo) do { } while (0) + +#define i915_vma_fence_id(vma) -1 + +static inline u32 i915_ggtt_offset(const struct i915_vma *vma) +{ + return vma->node.start; +} + +#endif diff --git a/drivers/gpu/drm/xe/compat-i915-headers/i915_vma_types.h b/drivers/gpu/drm/xe/compat-i915-headers/i915_vma_types.h new file mode 100644 index 000000000000..e7aaf50f5485 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/i915_vma_types.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#include <linux/types.h> +#include <linux/build_bug.h> + +/* XX: Figure out how to handle this vma mapping in xe */ +struct intel_remapped_plane_info { + /* in gtt pages */ + u32 offset:31; + u32 linear:1; + union { + /* in gtt pages for !linear */ + struct { + u16 width; + u16 height; + u16 src_stride; + u16 dst_stride; + }; + + /* in gtt pages for linear */ + u32 size; + }; +} __packed; + +struct intel_remapped_info { + struct intel_remapped_plane_info plane[4]; + /* in gtt pages */ + u32 plane_alignment; +} __packed; + +struct intel_rotation_info { + struct intel_remapped_plane_info plane[2]; +} __packed; + +enum i915_gtt_view_type { + I915_GTT_VIEW_NORMAL = 0, + I915_GTT_VIEW_ROTATED = sizeof(struct 
intel_rotation_info), + I915_GTT_VIEW_REMAPPED = sizeof(struct intel_remapped_info), +}; + +static inline void assert_i915_gem_gtt_types(void) +{ + BUILD_BUG_ON(sizeof(struct intel_rotation_info) != 2 * sizeof(u32) + 8 * sizeof(u16)); + BUILD_BUG_ON(sizeof(struct intel_remapped_info) != 5 * sizeof(u32) + 16 * sizeof(u16)); + + /* Check that rotation/remapped shares offsets for simplicity */ + BUILD_BUG_ON(offsetof(struct intel_remapped_info, plane[0]) != + offsetof(struct intel_rotation_info, plane[0])); + BUILD_BUG_ON(offsetofend(struct intel_remapped_info, plane[1]) != + offsetofend(struct intel_rotation_info, plane[1])); + + /* As we encode the size of each branch inside the union into its type, + * we have to be careful that each branch has a unique size. + */ + switch ((enum i915_gtt_view_type)0) { + case I915_GTT_VIEW_NORMAL: + case I915_GTT_VIEW_ROTATED: + case I915_GTT_VIEW_REMAPPED: + /* gcc complains if these are identical cases */ + break; + } +} + +struct i915_gtt_view { + enum i915_gtt_view_type type; + union { + /* Members need to contain no holes/padding */ + struct intel_rotation_info rotated; + struct intel_remapped_info remapped; + }; +}; diff --git a/drivers/gpu/drm/xe/compat-i915-headers/intel_clock_gating.h b/drivers/gpu/drm/xe/compat-i915-headers/intel_clock_gating.h new file mode 100644 index 000000000000..ce986f0e8f38 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/intel_clock_gating.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#include "../../i915/intel_clock_gating.h" diff --git a/drivers/gpu/drm/xe/compat-i915-headers/intel_gt_types.h b/drivers/gpu/drm/xe/compat-i915-headers/intel_gt_types.h new file mode 100644 index 000000000000..c15806d6c4f7 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/intel_gt_types.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef __INTEL_GT_TYPES__ +#define __INTEL_GT_TYPES__ + +#define intel_gt_support_legacy_fencing(gt) 0 + +#endif diff --git a/drivers/gpu/drm/xe/compat-i915-headers/intel_mchbar_regs.h b/drivers/gpu/drm/xe/compat-i915-headers/intel_mchbar_regs.h new file mode 100644 index 000000000000..55b316985340 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/intel_mchbar_regs.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#include "../../i915/intel_mchbar_regs.h" diff --git a/drivers/gpu/drm/xe/compat-i915-headers/intel_pci_config.h b/drivers/gpu/drm/xe/compat-i915-headers/intel_pci_config.h new file mode 100644 index 000000000000..8c15867fd613 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/intel_pci_config.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#include "../../i915/intel_pci_config.h" diff --git a/drivers/gpu/drm/xe/compat-i915-headers/intel_pcode.h b/drivers/gpu/drm/xe/compat-i915-headers/intel_pcode.h new file mode 100644 index 000000000000..0c47661bdc6a --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/intel_pcode.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef __INTEL_PCODE_H__ +#define __INTEL_PCODE_H__ + +#include "intel_uncore.h" +#include "xe_pcode.h" + +static inline int +snb_pcode_write_timeout(struct intel_uncore *uncore, u32 mbox, u32 val, + int fast_timeout_us, int slow_timeout_ms) +{ + return xe_pcode_write_timeout(__compat_uncore_to_gt(uncore), mbox, val, 
+ slow_timeout_ms ?: 1); +} + +static inline int +snb_pcode_write(struct intel_uncore *uncore, u32 mbox, u32 val) +{ + + return xe_pcode_write(__compat_uncore_to_gt(uncore), mbox, val); +} + +static inline int +snb_pcode_read(struct intel_uncore *uncore, u32 mbox, u32 *val, u32 *val1) +{ + return xe_pcode_read(__compat_uncore_to_gt(uncore), mbox, val, val1); +} + +static inline int +skl_pcode_request(struct intel_uncore *uncore, u32 mbox, + u32 request, u32 reply_mask, u32 reply, + int timeout_base_ms) +{ + return xe_pcode_request(__compat_uncore_to_gt(uncore), mbox, request, reply_mask, reply, + timeout_base_ms); +} + +#endif /* __INTEL_PCODE_H__ */ diff --git a/drivers/gpu/drm/xe/compat-i915-headers/intel_runtime_pm.h b/drivers/gpu/drm/xe/compat-i915-headers/intel_runtime_pm.h new file mode 100644 index 000000000000..89da3cc62f39 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/intel_runtime_pm.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#include "intel_wakeref.h" + +#define intel_runtime_pm xe_runtime_pm + +static inline void disable_rpm_wakeref_asserts(void *rpm) +{ +} + +static inline void enable_rpm_wakeref_asserts(void *rpm) +{ +} diff --git a/drivers/gpu/drm/xe/compat-i915-headers/intel_step.h b/drivers/gpu/drm/xe/compat-i915-headers/intel_step.h new file mode 100644 index 000000000000..0006ef812346 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/intel_step.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef __INTEL_STEP_H__ +#define __INTEL_STEP_H__ + +#include "xe_device_types.h" +#include "xe_step.h" + +#define intel_display_step_name xe_display_step_name + +static inline +const char *xe_display_step_name(struct xe_device *xe) +{ + return xe_step_name(xe->info.step.display); +} + +#endif /* __INTEL_STEP_H__ */ diff --git a/drivers/gpu/drm/xe/compat-i915-headers/intel_uc_fw.h b/drivers/gpu/drm/xe/compat-i915-headers/intel_uc_fw.h new file mode 100644 index 000000000000..009745328992 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/intel_uc_fw.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _INTEL_UC_FW_H_ +#define _INTEL_UC_FW_H_ + +#define INTEL_UC_FIRMWARE_URL "https://git.kernel.org/pub/scm/linux/kernel/git/firmware/linux-firmware.git" + +#endif diff --git a/drivers/gpu/drm/xe/compat-i915-headers/intel_uncore.h b/drivers/gpu/drm/xe/compat-i915-headers/intel_uncore.h new file mode 100644 index 000000000000..cd26ddc0f69e --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/intel_uncore.h @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef __INTEL_UNCORE_H__ +#define __INTEL_UNCORE_H__ + +#include "xe_device.h" +#include "xe_device_types.h" +#include "xe_mmio.h" + +static inline struct xe_gt *__compat_uncore_to_gt(struct intel_uncore *uncore) +{ + struct xe_device *xe = container_of(uncore, struct xe_device, uncore); + + return xe_root_mmio_gt(xe); +} + +static inline u32 intel_uncore_read(struct intel_uncore *uncore, + i915_reg_t i915_reg) +{ + struct xe_reg reg = XE_REG(i915_mmio_reg_offset(i915_reg)); + + return xe_mmio_read32(__compat_uncore_to_gt(uncore), reg); +} + +static inline u32 intel_uncore_read8(struct intel_uncore *uncore, + i915_reg_t i915_reg) +{ + struct xe_reg reg = XE_REG(i915_mmio_reg_offset(i915_reg)); + + return xe_mmio_read8(__compat_uncore_to_gt(uncore), reg); +} + +static 
inline u32 intel_uncore_read16(struct intel_uncore *uncore, + i915_reg_t i915_reg) +{ + struct xe_reg reg = XE_REG(i915_mmio_reg_offset(i915_reg)); + + return xe_mmio_read16(__compat_uncore_to_gt(uncore), reg); +} + +static inline u64 +intel_uncore_read64_2x32(struct intel_uncore *uncore, + i915_reg_t i915_lower_reg, i915_reg_t i915_upper_reg) +{ + struct xe_reg lower_reg = XE_REG(i915_mmio_reg_offset(i915_lower_reg)); + struct xe_reg upper_reg = XE_REG(i915_mmio_reg_offset(i915_upper_reg)); + u32 upper, lower, old_upper; + int loop = 0; + + upper = xe_mmio_read32(__compat_uncore_to_gt(uncore), upper_reg); + do { + old_upper = upper; + lower = xe_mmio_read32(__compat_uncore_to_gt(uncore), lower_reg); + upper = xe_mmio_read32(__compat_uncore_to_gt(uncore), upper_reg); + } while (upper != old_upper && loop++ < 2); + + return (u64)upper << 32 | lower; +} + +static inline void intel_uncore_posting_read(struct intel_uncore *uncore, + i915_reg_t i915_reg) +{ + struct xe_reg reg = XE_REG(i915_mmio_reg_offset(i915_reg)); + + xe_mmio_read32(__compat_uncore_to_gt(uncore), reg); +} + +static inline void intel_uncore_write(struct intel_uncore *uncore, + i915_reg_t i915_reg, u32 val) +{ + struct xe_reg reg = XE_REG(i915_mmio_reg_offset(i915_reg)); + + xe_mmio_write32(__compat_uncore_to_gt(uncore), reg, val); +} + +static inline u32 intel_uncore_rmw(struct intel_uncore *uncore, + i915_reg_t i915_reg, u32 clear, u32 set) +{ + struct xe_reg reg = XE_REG(i915_mmio_reg_offset(i915_reg)); + + return xe_mmio_rmw32(__compat_uncore_to_gt(uncore), reg, clear, set); +} + +static inline int intel_wait_for_register(struct intel_uncore *uncore, + i915_reg_t i915_reg, u32 mask, + u32 value, unsigned int timeout) +{ + struct xe_reg reg = XE_REG(i915_mmio_reg_offset(i915_reg)); + + return xe_mmio_wait32(__compat_uncore_to_gt(uncore), reg, mask, value, + timeout * USEC_PER_MSEC, NULL, false); +} + +static inline int intel_wait_for_register_fw(struct intel_uncore *uncore, + i915_reg_t i915_reg, u32 mask, + u32 value, unsigned int timeout) +{ + struct xe_reg reg = XE_REG(i915_mmio_reg_offset(i915_reg)); + + return xe_mmio_wait32(__compat_uncore_to_gt(uncore), reg, mask, value, + timeout * USEC_PER_MSEC, NULL, false); +} + +static inline int +__intel_wait_for_register(struct intel_uncore *uncore, i915_reg_t i915_reg, + u32 mask, u32 value, unsigned int fast_timeout_us, + unsigned int slow_timeout_ms, u32 *out_value) +{ + struct xe_reg reg = XE_REG(i915_mmio_reg_offset(i915_reg)); + + return xe_mmio_wait32(__compat_uncore_to_gt(uncore), reg, mask, value, + fast_timeout_us + 1000 * slow_timeout_ms, + out_value, false); +} + +static inline u32 intel_uncore_read_fw(struct intel_uncore *uncore, + i915_reg_t i915_reg) +{ + struct xe_reg reg = XE_REG(i915_mmio_reg_offset(i915_reg)); + + return xe_mmio_read32(__compat_uncore_to_gt(uncore), reg); +} + +static inline void intel_uncore_write_fw(struct intel_uncore *uncore, + i915_reg_t i915_reg, u32 val) +{ + struct xe_reg reg = XE_REG(i915_mmio_reg_offset(i915_reg)); + + xe_mmio_write32(__compat_uncore_to_gt(uncore), reg, val); +} + +static inline u32 intel_uncore_read_notrace(struct intel_uncore *uncore, + i915_reg_t i915_reg) +{ + struct xe_reg reg = XE_REG(i915_mmio_reg_offset(i915_reg)); + + return xe_mmio_read32(__compat_uncore_to_gt(uncore), reg); +} + +static inline void intel_uncore_write_notrace(struct intel_uncore *uncore, + i915_reg_t i915_reg, u32 val) +{ + struct xe_reg reg = XE_REG(i915_mmio_reg_offset(i915_reg)); + + xe_mmio_write32(__compat_uncore_to_gt(uncore), reg, 
val); +} + +static inline void __iomem *intel_uncore_regs(struct intel_uncore *uncore) +{ + struct xe_device *xe = container_of(uncore, struct xe_device, uncore); + + return xe_device_get_root_tile(xe)->mmio.regs; +} + +/* + * The raw_reg_{read,write} macros are intended as a micro-optimization for + * interrupt handlers so that the pointer indirection on uncore->regs can + * be computed once (and presumably cached in a register) instead of generating + * extra load instructions for each MMIO access. + * + * Given that these macros are only intended for non-GSI interrupt registers + * (and the goal is to avoid extra instructions generated by the compiler), + * these macros do not account for uncore->gsi_offset. Any caller that needs + * to use these macros on a GSI register is responsible for adding the + * appropriate GSI offset to the 'base' parameter. + */ +#define raw_reg_read(base, reg) \ + readl(base + i915_mmio_reg_offset(reg)) +#define raw_reg_write(base, reg, value) \ + writel(value, base + i915_mmio_reg_offset(reg)) + +#endif /* __INTEL_UNCORE_H__ */ diff --git a/drivers/gpu/drm/xe/compat-i915-headers/intel_wakeref.h b/drivers/gpu/drm/xe/compat-i915-headers/intel_wakeref.h new file mode 100644 index 000000000000..ecb1c0707706 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/intel_wakeref.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#include <linux/types.h> + +typedef unsigned long intel_wakeref_t; diff --git a/drivers/gpu/drm/xe/compat-i915-headers/pxp/intel_pxp.h b/drivers/gpu/drm/xe/compat-i915-headers/pxp/intel_pxp.h new file mode 100644 index 000000000000..c2c30ece8f77 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/pxp/intel_pxp.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef __INTEL_PXP_H__ +#define __INTEL_PXP_H__ + +#include <linux/errno.h> +#include <linux/types.h> + +struct drm_i915_gem_object; +struct intel_pxp; + +static inline int intel_pxp_key_check(struct intel_pxp *pxp, + struct drm_i915_gem_object *obj, + bool assign) +{ + return -ENODEV; +} + +static inline bool +i915_gem_object_is_protected(const struct drm_i915_gem_object *obj) +{ + return false; +} + +#endif diff --git a/drivers/gpu/drm/xe/compat-i915-headers/soc/intel_dram.h b/drivers/gpu/drm/xe/compat-i915-headers/soc/intel_dram.h new file mode 100644 index 000000000000..65707e20c557 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/soc/intel_dram.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#include "../../../i915/soc/intel_dram.h" diff --git a/drivers/gpu/drm/xe/compat-i915-headers/soc/intel_gmch.h b/drivers/gpu/drm/xe/compat-i915-headers/soc/intel_gmch.h new file mode 100644 index 000000000000..33c5257b3a71 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/soc/intel_gmch.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#include "../../../i915/soc/intel_gmch.h" diff --git a/drivers/gpu/drm/xe/compat-i915-headers/soc/intel_pch.h b/drivers/gpu/drm/xe/compat-i915-headers/soc/intel_pch.h new file mode 100644 index 000000000000..9c46556d33a4 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/soc/intel_pch.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#include "../../../i915/soc/intel_pch.h" diff --git a/drivers/gpu/drm/xe/compat-i915-headers/vlv_sideband.h 
b/drivers/gpu/drm/xe/compat-i915-headers/vlv_sideband.h new file mode 100644 index 000000000000..ec6f12de5727 --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/vlv_sideband.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2013-2021 Intel Corporation + */ + +#ifndef _VLV_SIDEBAND_H_ +#define _VLV_SIDEBAND_H_ + +#include <linux/types.h> + +#include "vlv_sideband_reg.h" + +enum pipe; +struct drm_i915_private; + +enum { + VLV_IOSF_SB_BUNIT, + VLV_IOSF_SB_CCK, + VLV_IOSF_SB_CCU, + VLV_IOSF_SB_DPIO, + VLV_IOSF_SB_FLISDSI, + VLV_IOSF_SB_GPIO, + VLV_IOSF_SB_NC, + VLV_IOSF_SB_PUNIT, +}; + +static inline void vlv_iosf_sb_get(struct drm_i915_private *i915, unsigned long ports) +{ +} +static inline u32 vlv_iosf_sb_read(struct drm_i915_private *i915, u8 port, u32 reg) +{ + return 0; +} +static inline void vlv_iosf_sb_write(struct drm_i915_private *i915, + u8 port, u32 reg, u32 val) +{ +} +static inline void vlv_iosf_sb_put(struct drm_i915_private *i915, unsigned long ports) +{ +} +static inline void vlv_bunit_get(struct drm_i915_private *i915) +{ +} +static inline u32 vlv_bunit_read(struct drm_i915_private *i915, u32 reg) +{ + return 0; +} +static inline void vlv_bunit_write(struct drm_i915_private *i915, u32 reg, u32 val) +{ +} +static inline void vlv_bunit_put(struct drm_i915_private *i915) +{ +} +static inline void vlv_cck_get(struct drm_i915_private *i915) +{ +} +static inline u32 vlv_cck_read(struct drm_i915_private *i915, u32 reg) +{ + return 0; +} +static inline void vlv_cck_write(struct drm_i915_private *i915, u32 reg, u32 val) +{ +} +static inline void vlv_cck_put(struct drm_i915_private *i915) +{ +} +static inline void vlv_ccu_get(struct drm_i915_private *i915) +{ +} +static inline u32 vlv_ccu_read(struct drm_i915_private *i915, u32 reg) +{ + return 0; +} +static inline void vlv_ccu_write(struct drm_i915_private *i915, u32 reg, u32 val) +{ +} +static inline void vlv_ccu_put(struct drm_i915_private *i915) +{ +} +static inline void vlv_dpio_get(struct drm_i915_private *i915) +{ +} +static inline u32 vlv_dpio_read(struct drm_i915_private *i915, int pipe, int reg) +{ + return 0; +} +static inline void vlv_dpio_write(struct drm_i915_private *i915, + int pipe, int reg, u32 val) +{ +} +static inline void vlv_dpio_put(struct drm_i915_private *i915) +{ +} +static inline void vlv_flisdsi_get(struct drm_i915_private *i915) +{ +} +static inline u32 vlv_flisdsi_read(struct drm_i915_private *i915, u32 reg) +{ + return 0; +} +static inline void vlv_flisdsi_write(struct drm_i915_private *i915, u32 reg, u32 val) +{ +} +static inline void vlv_flisdsi_put(struct drm_i915_private *i915) +{ +} +static inline void vlv_nc_get(struct drm_i915_private *i915) +{ +} +static inline u32 vlv_nc_read(struct drm_i915_private *i915, u8 addr) +{ + return 0; +} +static inline void vlv_nc_put(struct drm_i915_private *i915) +{ +} +static inline void vlv_punit_get(struct drm_i915_private *i915) +{ +} +static inline u32 vlv_punit_read(struct drm_i915_private *i915, u32 addr) +{ + return 0; +} +static inline int vlv_punit_write(struct drm_i915_private *i915, u32 addr, u32 val) +{ + return 0; +} +static inline void vlv_punit_put(struct drm_i915_private *i915) +{ +} + +#endif /* _VLV_SIDEBAND_H_ */ diff --git a/drivers/gpu/drm/xe/compat-i915-headers/vlv_sideband_reg.h b/drivers/gpu/drm/xe/compat-i915-headers/vlv_sideband_reg.h new file mode 100644 index 000000000000..949f134ce3cf --- /dev/null +++ b/drivers/gpu/drm/xe/compat-i915-headers/vlv_sideband_reg.h @@ -0,0 +1,6 @@ +/* 
SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#include "../../i915/vlv_sideband_reg.h" diff --git a/drivers/gpu/drm/xe/display/ext/i915_irq.c b/drivers/gpu/drm/xe/display/ext/i915_irq.c new file mode 100644 index 000000000000..bee191a4a97d --- /dev/null +++ b/drivers/gpu/drm/xe/display/ext/i915_irq.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include "i915_drv.h" +#include "i915_irq.h" +#include "i915_reg.h" +#include "intel_uncore.h" + +void gen3_irq_reset(struct intel_uncore *uncore, i915_reg_t imr, + i915_reg_t iir, i915_reg_t ier) +{ + intel_uncore_write(uncore, imr, 0xffffffff); + intel_uncore_posting_read(uncore, imr); + + intel_uncore_write(uncore, ier, 0); + + /* IIR can theoretically queue up two events. Be paranoid. */ + intel_uncore_write(uncore, iir, 0xffffffff); + intel_uncore_posting_read(uncore, iir); + intel_uncore_write(uncore, iir, 0xffffffff); + intel_uncore_posting_read(uncore, iir); +} + +/* + * We should clear IMR at preinstall/uninstall, and just check at postinstall. + */ +void gen3_assert_iir_is_zero(struct intel_uncore *uncore, i915_reg_t reg) +{ + struct xe_device *xe = container_of(uncore, struct xe_device, uncore); + u32 val = intel_uncore_read(uncore, reg); + + if (val == 0) + return; + + drm_WARN(&xe->drm, 1, + "Interrupt register 0x%x is not zero: 0x%08x\n", + i915_mmio_reg_offset(reg), val); + intel_uncore_write(uncore, reg, 0xffffffff); + intel_uncore_posting_read(uncore, reg); + intel_uncore_write(uncore, reg, 0xffffffff); + intel_uncore_posting_read(uncore, reg); +} + +void gen3_irq_init(struct intel_uncore *uncore, + i915_reg_t imr, u32 imr_val, + i915_reg_t ier, u32 ier_val, + i915_reg_t iir) +{ + gen3_assert_iir_is_zero(uncore, iir); + + intel_uncore_write(uncore, ier, ier_val); + intel_uncore_write(uncore, imr, imr_val); + intel_uncore_posting_read(uncore, imr); +} + +bool intel_irqs_enabled(struct xe_device *xe) +{ + /* + * XXX: i915 has a racy handling of the irq.enabled, since it doesn't + * lock its transitions. Because of that, the irq.enabled sometimes + * is not read with the irq.lock in place. + * However, the most critical cases like vblank and page flips are + * properly using the locks. + * We cannot take the lock in here or run any kind of assert because + * of i915 inconsistency. + * But at this point the xe irq is better protected against races, + * although the full solution would be protecting the i915 side. 
+ */ + return xe->irq.enabled; +} + +void intel_synchronize_irq(struct xe_device *xe) +{ + synchronize_irq(to_pci_dev(xe->drm.dev)->irq); +} diff --git a/drivers/gpu/drm/xe/display/ext/i915_utils.c b/drivers/gpu/drm/xe/display/ext/i915_utils.c new file mode 100644 index 000000000000..43b10a2cc508 --- /dev/null +++ b/drivers/gpu/drm/xe/display/ext/i915_utils.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include "i915_drv.h" + +bool i915_vtd_active(struct drm_i915_private *i915) +{ + if (device_iommu_mapped(i915->drm.dev)) + return true; + + /* Running as a guest, we assume the host is enforcing VT-d */ + return i915_run_as_guest(); +} + +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG) + +/* i915 specific, just put here for shutting it up */ +int __i915_inject_probe_error(struct drm_i915_private *i915, int err, + const char *func, int line) +{ + return 0; +} + +#endif diff --git a/drivers/gpu/drm/xe/display/intel_fb_bo.c b/drivers/gpu/drm/xe/display/intel_fb_bo.c new file mode 100644 index 000000000000..b21da7b745a5 --- /dev/null +++ b/drivers/gpu/drm/xe/display/intel_fb_bo.c @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#include <drm/drm_modeset_helper.h> + +#include "i915_drv.h" +#include "intel_display_types.h" +#include "intel_fb_bo.h" + +void intel_fb_bo_framebuffer_fini(struct xe_bo *bo) +{ + if (bo->flags & XE_BO_CREATE_PINNED_BIT) { + /* Unpin our kernel fb first */ + xe_bo_lock(bo, false); + xe_bo_unpin(bo); + xe_bo_unlock(bo); + } + xe_bo_put(bo); +} + +int intel_fb_bo_framebuffer_init(struct intel_framebuffer *intel_fb, + struct xe_bo *bo, + struct drm_mode_fb_cmd2 *mode_cmd) +{ + struct drm_i915_private *i915 = to_i915(bo->ttm.base.dev); + int ret; + + xe_bo_get(bo); + + ret = ttm_bo_reserve(&bo->ttm, true, false, NULL); + if (ret) + return ret; + + if (!(bo->flags & XE_BO_SCANOUT_BIT)) { + /* + * XE_BO_SCANOUT_BIT should ideally be set at creation, or is + * automatically set when creating FB. We cannot change caching + * mode when the object is VM_BINDed, so we can only set + * coherency with display when unbound.
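+ *
+ * For illustration, a kernel-internal allocation that knows up front it
+ * will be scanned out can simply pass the flag at creation time instead
+ * of relying on this fixup, e.g. (rough sketch, the exact placement
+ * flags depend on the platform):
+ *
+ *	bo = xe_bo_create_pin_map(xe, tile, NULL, size, ttm_bo_type_kernel,
+ *				  XE_BO_SCANOUT_BIT |
+ *				  XE_BO_CREATE_VRAM_IF_DGFX(tile) |
+ *				  XE_BO_CREATE_GGTT_BIT);
+ *
+ * The check below instead handles buffers created without the flag and
+ * has to refuse anything that is already VM_BINDed.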
+ */ + if (XE_IOCTL_DBG(i915, !list_empty(&bo->ttm.base.gpuva.list))) { + ttm_bo_unreserve(&bo->ttm); + return -EINVAL; + } + bo->flags |= XE_BO_SCANOUT_BIT; + } + ttm_bo_unreserve(&bo->ttm); + + return ret; +} + +struct xe_bo *intel_fb_bo_lookup_valid_bo(struct drm_i915_private *i915, + struct drm_file *filp, + const struct drm_mode_fb_cmd2 *mode_cmd) +{ + struct drm_i915_gem_object *bo; + struct drm_gem_object *gem = drm_gem_object_lookup(filp, mode_cmd->handles[0]); + + if (!gem) + return ERR_PTR(-ENOENT); + + bo = gem_to_xe_bo(gem); + /* Require vram placement or dma-buf import */ + if (IS_DGFX(i915) && + !xe_bo_can_migrate(gem_to_xe_bo(gem), XE_PL_VRAM0) && + bo->ttm.type != ttm_bo_type_sg) { + drm_gem_object_put(gem); + return ERR_PTR(-EREMOTE); + } + + return bo; +} diff --git a/drivers/gpu/drm/xe/display/intel_fb_bo.h b/drivers/gpu/drm/xe/display/intel_fb_bo.h new file mode 100644 index 000000000000..5d365b925b7a --- /dev/null +++ b/drivers/gpu/drm/xe/display/intel_fb_bo.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef __INTEL_FB_BO_H__ +#define __INTEL_FB_BO_H__ + +struct drm_file; +struct drm_mode_fb_cmd2; +struct drm_i915_private; +struct intel_framebuffer; +struct xe_bo; + +void intel_fb_bo_framebuffer_fini(struct xe_bo *bo); +int intel_fb_bo_framebuffer_init(struct intel_framebuffer *intel_fb, + struct xe_bo *bo, + struct drm_mode_fb_cmd2 *mode_cmd); + +struct xe_bo *intel_fb_bo_lookup_valid_bo(struct drm_i915_private *i915, + struct drm_file *filp, + const struct drm_mode_fb_cmd2 *mode_cmd); + +#endif diff --git a/drivers/gpu/drm/xe/display/intel_fbdev_fb.c b/drivers/gpu/drm/xe/display/intel_fbdev_fb.c new file mode 100644 index 000000000000..51ae3561fd0d --- /dev/null +++ b/drivers/gpu/drm/xe/display/intel_fbdev_fb.c @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#include "intel_fbdev_fb.h" + +#include <drm/drm_fb_helper.h> + +#include "xe_gt.h" +#include "xe_ttm_stolen_mgr.h" + +#include "i915_drv.h" +#include "intel_display_types.h" + +struct drm_framebuffer *intel_fbdev_fb_alloc(struct drm_fb_helper *helper, + struct drm_fb_helper_surface_size *sizes) +{ + struct drm_framebuffer *fb; + struct drm_device *dev = helper->dev; + struct drm_i915_private *dev_priv = to_i915(dev); + struct drm_mode_fb_cmd2 mode_cmd = {}; + struct drm_i915_gem_object *obj; + int size; + + /* we don't do packed 24bpp */ + if (sizes->surface_bpp == 24) + sizes->surface_bpp = 32; + + mode_cmd.width = sizes->surface_width; + mode_cmd.height = sizes->surface_height; + + mode_cmd.pitches[0] = ALIGN(mode_cmd.width * + DIV_ROUND_UP(sizes->surface_bpp, 8), XE_PAGE_SIZE); + mode_cmd.pixel_format = drm_mode_legacy_fb_format(sizes->surface_bpp, + sizes->surface_depth); + + size = mode_cmd.pitches[0] * mode_cmd.height; + size = PAGE_ALIGN(size); + obj = ERR_PTR(-ENODEV); + + if (!IS_DGFX(dev_priv)) { + obj = xe_bo_create_pin_map(dev_priv, xe_device_get_root_tile(dev_priv), + NULL, size, + ttm_bo_type_kernel, XE_BO_SCANOUT_BIT | + XE_BO_CREATE_STOLEN_BIT | + XE_BO_CREATE_PINNED_BIT); + if (!IS_ERR(obj)) + drm_info(&dev_priv->drm, "Allocated fbdev into stolen\n"); + else + drm_info(&dev_priv->drm, "Allocated fbdev into stolen failed: %li\n", PTR_ERR(obj)); + } + if (IS_ERR(obj)) { + obj = xe_bo_create_pin_map(dev_priv, xe_device_get_root_tile(dev_priv), NULL, size, + ttm_bo_type_kernel, XE_BO_SCANOUT_BIT | + XE_BO_CREATE_VRAM_IF_DGFX(xe_device_get_root_tile(dev_priv)) | + 
XE_BO_CREATE_PINNED_BIT); + } + + if (IS_ERR(obj)) { + drm_err(&dev_priv->drm, "failed to allocate framebuffer (%pe)\n", obj); + fb = ERR_PTR(-ENOMEM); + goto err; + } + + fb = intel_framebuffer_create(obj, &mode_cmd); + if (IS_ERR(fb)) { + xe_bo_unpin_map_no_vm(obj); + goto err; + } + + drm_gem_object_put(intel_bo_to_drm_bo(obj)); + return fb; + +err: + return fb; +} + +int intel_fbdev_fb_fill_info(struct drm_i915_private *i915, struct fb_info *info, + struct drm_i915_gem_object *obj, struct i915_vma *vma) +{ + struct pci_dev *pdev = to_pci_dev(i915->drm.dev); + + if (!(obj->flags & XE_BO_CREATE_SYSTEM_BIT)) { + if (obj->flags & XE_BO_CREATE_STOLEN_BIT) + info->fix.smem_start = xe_ttm_stolen_io_offset(obj, 0); + else + info->fix.smem_start = + pci_resource_start(pdev, 2) + + xe_bo_addr(obj, 0, XE_PAGE_SIZE); + + info->fix.smem_len = obj->ttm.base.size; + } else { + /* XXX: Pure fiction, as the BO may not be physically accessible.. */ + info->fix.smem_start = 0; + info->fix.smem_len = obj->ttm.base.size; + } + XE_WARN_ON(iosys_map_is_null(&obj->vmap)); + + info->screen_base = obj->vmap.vaddr_iomem; + info->screen_size = intel_bo_to_drm_bo(obj)->size; + + return 0; +} diff --git a/drivers/gpu/drm/xe/display/intel_fbdev_fb.h b/drivers/gpu/drm/xe/display/intel_fbdev_fb.h new file mode 100644 index 000000000000..ea186772e0bb --- /dev/null +++ b/drivers/gpu/drm/xe/display/intel_fbdev_fb.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef __INTEL_FBDEV_FB_H__ +#define __INTEL_FBDEV_FB_H__ + +struct drm_fb_helper; +struct drm_fb_helper_surface_size; +struct drm_i915_gem_object; +struct drm_i915_private; +struct fb_info; +struct i915_vma; + +struct drm_framebuffer *intel_fbdev_fb_alloc(struct drm_fb_helper *helper, + struct drm_fb_helper_surface_size *sizes); +int intel_fbdev_fb_fill_info(struct drm_i915_private *i915, struct fb_info *info, + struct drm_i915_gem_object *obj, struct i915_vma *vma); + +#endif diff --git a/drivers/gpu/drm/xe/display/xe_display_misc.c b/drivers/gpu/drm/xe/display/xe_display_misc.c new file mode 100644 index 000000000000..242c2ef4ca93 --- /dev/null +++ b/drivers/gpu/drm/xe/display/xe_display_misc.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include "intel_display_types.h" + +struct pci_dev; + +unsigned int intel_gmch_vga_set_decode(struct pci_dev *pdev, bool enable_decode); + +unsigned int intel_gmch_vga_set_decode(struct pci_dev *pdev, bool enable_decode) +{ + /* ToDo: Implement the actual handling of vga decode */ + return 0; +} diff --git a/drivers/gpu/drm/xe/display/xe_display_rps.c b/drivers/gpu/drm/xe/display/xe_display_rps.c new file mode 100644 index 000000000000..ab21c581c192 --- /dev/null +++ b/drivers/gpu/drm/xe/display/xe_display_rps.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include "intel_display_rps.h" + +void intel_display_rps_boost_after_vblank(struct drm_crtc *crtc, + struct dma_fence *fence) +{ +} + +void intel_display_rps_mark_interactive(struct drm_i915_private *i915, + struct intel_atomic_state *state, + bool interactive) +{ +} diff --git a/drivers/gpu/drm/xe/display/xe_dsb_buffer.c b/drivers/gpu/drm/xe/display/xe_dsb_buffer.c new file mode 100644 index 000000000000..27c2fb1c002a --- /dev/null +++ b/drivers/gpu/drm/xe/display/xe_dsb_buffer.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2023, Intel Corporation. 
+ */ + +#include "i915_drv.h" +#include "i915_vma.h" +#include "intel_display_types.h" +#include "intel_dsb_buffer.h" +#include "xe_bo.h" +#include "xe_gt.h" + +u32 intel_dsb_buffer_ggtt_offset(struct intel_dsb_buffer *dsb_buf) +{ + return xe_bo_ggtt_addr(dsb_buf->vma->bo); +} + +void intel_dsb_buffer_write(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val) +{ + iosys_map_wr(&dsb_buf->vma->bo->vmap, idx * 4, u32, val); +} + +u32 intel_dsb_buffer_read(struct intel_dsb_buffer *dsb_buf, u32 idx) +{ + return iosys_map_rd(&dsb_buf->vma->bo->vmap, idx * 4, u32); +} + +void intel_dsb_buffer_memset(struct intel_dsb_buffer *dsb_buf, u32 idx, u32 val, size_t size) +{ + WARN_ON(idx > (dsb_buf->buf_size - size) / sizeof(*dsb_buf->cmd_buf)); + + iosys_map_memset(&dsb_buf->vma->bo->vmap, idx * 4, val, size); +} + +bool intel_dsb_buffer_create(struct intel_crtc *crtc, struct intel_dsb_buffer *dsb_buf, size_t size) +{ + struct drm_i915_private *i915 = to_i915(crtc->base.dev); + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + + vma = kzalloc(sizeof(*vma), GFP_KERNEL); + if (!vma) + return false; + + obj = xe_bo_create_pin_map(i915, xe_device_get_root_tile(i915), + NULL, PAGE_ALIGN(size), + ttm_bo_type_kernel, + XE_BO_CREATE_VRAM_IF_DGFX(xe_device_get_root_tile(i915)) | + XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(obj)) { + kfree(vma); + return false; + } + + vma->bo = obj; + dsb_buf->vma = vma; + dsb_buf->buf_size = size; + + return true; +} + +void intel_dsb_buffer_cleanup(struct intel_dsb_buffer *dsb_buf) +{ + xe_bo_unpin_map_no_vm(dsb_buf->vma->bo); + kfree(dsb_buf->vma); +} + +void intel_dsb_buffer_flush_map(struct intel_dsb_buffer *dsb_buf) +{ + /* TODO: add xe specific flush_map() for dsb buffer object. */ +} diff --git a/drivers/gpu/drm/xe/display/xe_fb_pin.c b/drivers/gpu/drm/xe/display/xe_fb_pin.c new file mode 100644 index 000000000000..722c84a56607 --- /dev/null +++ b/drivers/gpu/drm/xe/display/xe_fb_pin.c @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include "i915_drv.h" +#include "intel_display_types.h" +#include "intel_dpt.h" +#include "intel_fb.h" +#include "intel_fb_pin.h" +#include "xe_ggtt.h" +#include "xe_gt.h" + +#include <drm/ttm/ttm_bo.h> + +static void +write_dpt_rotated(struct xe_bo *bo, struct iosys_map *map, u32 *dpt_ofs, u32 bo_ofs, + u32 width, u32 height, u32 src_stride, u32 dst_stride) +{ + struct xe_device *xe = xe_bo_device(bo); + struct xe_ggtt *ggtt = xe_device_get_root_tile(xe)->mem.ggtt; + u32 column, row; + + /* TODO: Maybe rewrite so we can traverse the bo addresses sequentially, + * by writing dpt/ggtt in a different order? 
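+ *
+ * As written, the walk below goes column by column, bottom row first:
+ * e.g. with src_stride = 4, height = 3 and bo_ofs = 0, column 0 emits
+ * PTEs for bo pages 8, 4, 0, column 1 for pages 9, 5, 1, and so on,
+ * so the DPT is filled linearly while the bo is read with a stride.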
+ */ + + for (column = 0; column < width; column++) { + u32 src_idx = src_stride * (height - 1) + column + bo_ofs; + + for (row = 0; row < height; row++) { + u64 pte = ggtt->pt_ops->pte_encode_bo(bo, src_idx * XE_PAGE_SIZE, + xe->pat.idx[XE_CACHE_WB]); + + iosys_map_wr(map, *dpt_ofs, u64, pte); + *dpt_ofs += 8; + src_idx -= src_stride; + } + + /* The DE ignores the PTEs for the padding tiles */ + *dpt_ofs += (dst_stride - height) * 8; + } + + /* Align to next page */ + *dpt_ofs = ALIGN(*dpt_ofs, 4096); +} + +static void +write_dpt_remapped(struct xe_bo *bo, struct iosys_map *map, u32 *dpt_ofs, + u32 bo_ofs, u32 width, u32 height, u32 src_stride, + u32 dst_stride) +{ + struct xe_device *xe = xe_bo_device(bo); + struct xe_ggtt *ggtt = xe_device_get_root_tile(xe)->mem.ggtt; + u64 (*pte_encode_bo)(struct xe_bo *bo, u64 bo_offset, u16 pat_index) + = ggtt->pt_ops->pte_encode_bo; + u32 column, row; + + for (row = 0; row < height; row++) { + u32 src_idx = src_stride * row + bo_ofs; + + for (column = 0; column < width; column++) { + iosys_map_wr(map, *dpt_ofs, u64, + pte_encode_bo(bo, src_idx * XE_PAGE_SIZE, + xe->pat.idx[XE_CACHE_WB])); + + *dpt_ofs += 8; + src_idx++; + } + + /* The DE ignores the PTEs for the padding tiles */ + *dpt_ofs += (dst_stride - width) * 8; + } + + /* Align to next page */ + *dpt_ofs = ALIGN(*dpt_ofs, 4096); +} + +static int __xe_pin_fb_vma_dpt(struct intel_framebuffer *fb, + const struct i915_gtt_view *view, + struct i915_vma *vma) +{ + struct xe_device *xe = to_xe_device(fb->base.dev); + struct xe_tile *tile0 = xe_device_get_root_tile(xe); + struct xe_ggtt *ggtt = tile0->mem.ggtt; + struct xe_bo *bo = intel_fb_obj(&fb->base), *dpt; + u32 dpt_size, size = bo->ttm.base.size; + + if (view->type == I915_GTT_VIEW_NORMAL) + dpt_size = ALIGN(size / XE_PAGE_SIZE * 8, XE_PAGE_SIZE); + else if (view->type == I915_GTT_VIEW_REMAPPED) + dpt_size = ALIGN(intel_remapped_info_size(&fb->remapped_view.gtt.remapped) * 8, + XE_PAGE_SIZE); + else + /* display uses 4K tiles instead of bytes here, convert to entries.. 
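(each 4 KiB tile needs one 8-byte PTE, so e.g. a rotated view covering 4096 tiles takes 4096 * 8 = 32 KiB of DPT entries, which the ALIGN() below rounds up to whole pages)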
*/ + dpt_size = ALIGN(intel_rotation_info_size(&view->rotated) * 8, + XE_PAGE_SIZE); + + if (IS_DGFX(xe)) + dpt = xe_bo_create_pin_map(xe, tile0, NULL, dpt_size, + ttm_bo_type_kernel, + XE_BO_CREATE_VRAM0_BIT | + XE_BO_CREATE_GGTT_BIT); + else + dpt = xe_bo_create_pin_map(xe, tile0, NULL, dpt_size, + ttm_bo_type_kernel, + XE_BO_CREATE_STOLEN_BIT | + XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(dpt)) + dpt = xe_bo_create_pin_map(xe, tile0, NULL, dpt_size, + ttm_bo_type_kernel, + XE_BO_CREATE_SYSTEM_BIT | + XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(dpt)) + return PTR_ERR(dpt); + + if (view->type == I915_GTT_VIEW_NORMAL) { + u32 x; + + for (x = 0; x < size / XE_PAGE_SIZE; x++) { + u64 pte = ggtt->pt_ops->pte_encode_bo(bo, x * XE_PAGE_SIZE, + xe->pat.idx[XE_CACHE_WB]); + + iosys_map_wr(&dpt->vmap, x * 8, u64, pte); + } + } else if (view->type == I915_GTT_VIEW_REMAPPED) { + const struct intel_remapped_info *remap_info = &view->remapped; + u32 i, dpt_ofs = 0; + + for (i = 0; i < ARRAY_SIZE(remap_info->plane); i++) + write_dpt_remapped(bo, &dpt->vmap, &dpt_ofs, + remap_info->plane[i].offset, + remap_info->plane[i].width, + remap_info->plane[i].height, + remap_info->plane[i].src_stride, + remap_info->plane[i].dst_stride); + + } else { + const struct intel_rotation_info *rot_info = &view->rotated; + u32 i, dpt_ofs = 0; + + for (i = 0; i < ARRAY_SIZE(rot_info->plane); i++) + write_dpt_rotated(bo, &dpt->vmap, &dpt_ofs, + rot_info->plane[i].offset, + rot_info->plane[i].width, + rot_info->plane[i].height, + rot_info->plane[i].src_stride, + rot_info->plane[i].dst_stride); + } + + vma->dpt = dpt; + vma->node = dpt->ggtt_node; + return 0; +} + +static void +write_ggtt_rotated(struct xe_bo *bo, struct xe_ggtt *ggtt, u32 *ggtt_ofs, u32 bo_ofs, + u32 width, u32 height, u32 src_stride, u32 dst_stride) +{ + struct xe_device *xe = xe_bo_device(bo); + u32 column, row; + + for (column = 0; column < width; column++) { + u32 src_idx = src_stride * (height - 1) + column + bo_ofs; + + for (row = 0; row < height; row++) { + u64 pte = ggtt->pt_ops->pte_encode_bo(bo, src_idx * XE_PAGE_SIZE, + xe->pat.idx[XE_CACHE_WB]); + + xe_ggtt_set_pte(ggtt, *ggtt_ofs, pte); + *ggtt_ofs += XE_PAGE_SIZE; + src_idx -= src_stride; + } + + /* The DE ignores the PTEs for the padding tiles */ + *ggtt_ofs += (dst_stride - height) * XE_PAGE_SIZE; + } +} + +static int __xe_pin_fb_vma_ggtt(struct intel_framebuffer *fb, + const struct i915_gtt_view *view, + struct i915_vma *vma) +{ + struct xe_bo *bo = intel_fb_obj(&fb->base); + struct xe_device *xe = to_xe_device(fb->base.dev); + struct xe_ggtt *ggtt = xe_device_get_root_tile(xe)->mem.ggtt; + u32 align; + int ret; + + /* TODO: Consider sharing framebuffer mapping? 
+ * embed i915_vma inside intel_framebuffer + */ + xe_device_mem_access_get(tile_to_xe(ggtt->tile)); + ret = mutex_lock_interruptible(&ggtt->lock); + if (ret) + goto out; + + align = XE_PAGE_SIZE; + if (xe_bo_is_vram(bo) && ggtt->flags & XE_GGTT_FLAGS_64K) + align = max_t(u32, align, SZ_64K); + + if (bo->ggtt_node.size && view->type == I915_GTT_VIEW_NORMAL) { + vma->node = bo->ggtt_node; + } else if (view->type == I915_GTT_VIEW_NORMAL) { + u32 x, size = bo->ttm.base.size; + + ret = xe_ggtt_insert_special_node_locked(ggtt, &vma->node, size, + align, 0); + if (ret) + goto out_unlock; + + for (x = 0; x < size; x += XE_PAGE_SIZE) { + u64 pte = ggtt->pt_ops->pte_encode_bo(bo, x, + xe->pat.idx[XE_CACHE_WB]); + + xe_ggtt_set_pte(ggtt, vma->node.start + x, pte); + } + } else { + u32 i, ggtt_ofs; + const struct intel_rotation_info *rot_info = &view->rotated; + + /* display seems to use tiles instead of bytes here, so convert it back.. */ + u32 size = intel_rotation_info_size(rot_info) * XE_PAGE_SIZE; + + ret = xe_ggtt_insert_special_node_locked(ggtt, &vma->node, size, + align, 0); + if (ret) + goto out_unlock; + + ggtt_ofs = vma->node.start; + + for (i = 0; i < ARRAY_SIZE(rot_info->plane); i++) + write_ggtt_rotated(bo, ggtt, &ggtt_ofs, + rot_info->plane[i].offset, + rot_info->plane[i].width, + rot_info->plane[i].height, + rot_info->plane[i].src_stride, + rot_info->plane[i].dst_stride); + } + + xe_ggtt_invalidate(ggtt); +out_unlock: + mutex_unlock(&ggtt->lock); +out: + xe_device_mem_access_put(tile_to_xe(ggtt->tile)); + return ret; +} + +static struct i915_vma *__xe_pin_fb_vma(struct intel_framebuffer *fb, + const struct i915_gtt_view *view) +{ + struct drm_device *dev = fb->base.dev; + struct xe_device *xe = to_xe_device(dev); + struct i915_vma *vma = kzalloc(sizeof(*vma), GFP_KERNEL); + struct xe_bo *bo = intel_fb_obj(&fb->base); + int ret; + + if (!vma) + return ERR_PTR(-ENODEV); + + if (IS_DGFX(to_xe_device(bo->ttm.base.dev)) && + intel_fb_rc_ccs_cc_plane(&fb->base) >= 0 && + !(bo->flags & XE_BO_NEEDS_CPU_ACCESS)) { + struct xe_tile *tile = xe_device_get_root_tile(xe); + + /* + * If we need to able to access the clear-color value stored in + * the buffer, then we require that such buffers are also CPU + * accessible. This is important on small-bar systems where + * only some subset of VRAM is CPU accessible. 
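+ *
+ * Concretely: on a device exposing, say, 16 GiB of usable VRAM through
+ * a 256 MiB PCI BAR, io_size (256 MiB) is smaller than usable_size
+ * (16 GiB), so a CC surface placed at an arbitrary VRAM offset may not
+ * be CPU reachable; in that case the pin is refused below unless the bo
+ * was created with XE_BO_NEEDS_CPU_ACCESS.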
+ */ + if (tile->mem.vram.io_size < tile->mem.vram.usable_size) { + ret = -EINVAL; + goto err; + } + } + + /* + * Pin the framebuffer, we can't use xe_bo_(un)pin functions as the + * assumptions are incorrect for framebuffers + */ + ret = ttm_bo_reserve(&bo->ttm, false, false, NULL); + if (ret) + goto err; + + if (IS_DGFX(xe)) + ret = xe_bo_migrate(bo, XE_PL_VRAM0); + else + ret = xe_bo_validate(bo, NULL, true); + if (!ret) + ttm_bo_pin(&bo->ttm); + ttm_bo_unreserve(&bo->ttm); + if (ret) + goto err; + + vma->bo = bo; + if (intel_fb_uses_dpt(&fb->base)) + ret = __xe_pin_fb_vma_dpt(fb, view, vma); + else + ret = __xe_pin_fb_vma_ggtt(fb, view, vma); + if (ret) + goto err_unpin; + + return vma; + +err_unpin: + ttm_bo_reserve(&bo->ttm, false, false, NULL); + ttm_bo_unpin(&bo->ttm); + ttm_bo_unreserve(&bo->ttm); +err: + kfree(vma); + return ERR_PTR(ret); +} + +static void __xe_unpin_fb_vma(struct i915_vma *vma) +{ + struct xe_device *xe = to_xe_device(vma->bo->ttm.base.dev); + struct xe_ggtt *ggtt = xe_device_get_root_tile(xe)->mem.ggtt; + + if (vma->dpt) + xe_bo_unpin_map_no_vm(vma->dpt); + else if (!drm_mm_node_allocated(&vma->bo->ggtt_node) || + vma->bo->ggtt_node.start != vma->node.start) + xe_ggtt_remove_node(ggtt, &vma->node); + + ttm_bo_reserve(&vma->bo->ttm, false, false, NULL); + ttm_bo_unpin(&vma->bo->ttm); + ttm_bo_unreserve(&vma->bo->ttm); + kfree(vma); +} + +struct i915_vma * +intel_pin_and_fence_fb_obj(struct drm_framebuffer *fb, + bool phys_cursor, + const struct i915_gtt_view *view, + bool uses_fence, + unsigned long *out_flags) +{ + *out_flags = 0; + + return __xe_pin_fb_vma(to_intel_framebuffer(fb), view); +} + +void intel_unpin_fb_vma(struct i915_vma *vma, unsigned long flags) +{ + __xe_unpin_fb_vma(vma); +} + +int intel_plane_pin_fb(struct intel_plane_state *plane_state) +{ + struct drm_framebuffer *fb = plane_state->hw.fb; + struct xe_bo *bo = intel_fb_obj(fb); + struct i915_vma *vma; + + /* We reject creating !SCANOUT fb's, so this is weird.. */ + drm_WARN_ON(bo->ttm.base.dev, !(bo->flags & XE_BO_SCANOUT_BIT)); + + vma = __xe_pin_fb_vma(to_intel_framebuffer(fb), &plane_state->view.gtt); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + plane_state->ggtt_vma = vma; + return 0; +} + +void intel_plane_unpin_fb(struct intel_plane_state *old_plane_state) +{ + __xe_unpin_fb_vma(old_plane_state->ggtt_vma); + old_plane_state->ggtt_vma = NULL; +} + +/* + * For Xe introduce dummy intel_dpt_create which just return NULL and + * intel_dpt_destroy which does nothing. + */ +struct i915_address_space *intel_dpt_create(struct intel_framebuffer *fb) +{ + return NULL; +} + +void intel_dpt_destroy(struct i915_address_space *vm) +{ + return; +}
\ No newline at end of file diff --git a/drivers/gpu/drm/xe/display/xe_hdcp_gsc.c b/drivers/gpu/drm/xe/display/xe_hdcp_gsc.c new file mode 100644 index 000000000000..0f11a39333e2 --- /dev/null +++ b/drivers/gpu/drm/xe/display/xe_hdcp_gsc.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2023, Intel Corporation. + */ + +#include "i915_drv.h" +#include "intel_hdcp_gsc.h" + +bool intel_hdcp_gsc_cs_required(struct drm_i915_private *i915) +{ + return true; +} + +bool intel_hdcp_gsc_check_status(struct drm_i915_private *i915) +{ + return false; +} + +int intel_hdcp_gsc_init(struct drm_i915_private *i915) +{ + drm_info(&i915->drm, "HDCP support not yet implemented\n"); + return -ENODEV; +} + +void intel_hdcp_gsc_fini(struct drm_i915_private *i915) +{ +} + +ssize_t intel_hdcp_gsc_msg_send(struct drm_i915_private *i915, u8 *msg_in, + size_t msg_in_len, u8 *msg_out, + size_t msg_out_len) +{ + return -ENODEV; +} diff --git a/drivers/gpu/drm/xe/display/xe_plane_initial.c b/drivers/gpu/drm/xe/display/xe_plane_initial.c new file mode 100644 index 000000000000..ccf83c12b545 --- /dev/null +++ b/drivers/gpu/drm/xe/display/xe_plane_initial.c @@ -0,0 +1,291 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +/* for ioread64 */ +#include <linux/io-64-nonatomic-lo-hi.h> + +#include "xe_ggtt.h" + +#include "i915_drv.h" +#include "intel_atomic_plane.h" +#include "intel_display.h" +#include "intel_display_types.h" +#include "intel_fb.h" +#include "intel_fb_pin.h" +#include "intel_frontbuffer.h" +#include "intel_plane_initial.h" + +static bool +intel_reuse_initial_plane_obj(struct drm_i915_private *i915, + const struct intel_initial_plane_config *plane_config, + struct drm_framebuffer **fb) +{ + struct intel_crtc *crtc; + + for_each_intel_crtc(&i915->drm, crtc) { + struct intel_crtc_state *crtc_state = + to_intel_crtc_state(crtc->base.state); + struct intel_plane *plane = + to_intel_plane(crtc->base.primary); + struct intel_plane_state *plane_state = + to_intel_plane_state(plane->base.state); + + if (!crtc_state->uapi.active) + continue; + + if (!plane_state->ggtt_vma) + continue; + + if (intel_plane_ggtt_offset(plane_state) == plane_config->base) { + *fb = plane_state->hw.fb; + return true; + } + } + + return false; +} + +static struct xe_bo * +initial_plane_bo(struct xe_device *xe, + struct intel_initial_plane_config *plane_config) +{ + struct xe_tile *tile0 = xe_device_get_root_tile(xe); + struct xe_bo *bo; + resource_size_t phys_base; + u32 base, size, flags; + u64 page_size = xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K ? SZ_64K : SZ_4K; + + if (plane_config->size == 0) + return NULL; + + flags = XE_BO_CREATE_PINNED_BIT | XE_BO_SCANOUT_BIT | XE_BO_CREATE_GGTT_BIT; + + base = round_down(plane_config->base, page_size); + if (IS_DGFX(xe)) { + u64 __iomem *gte = tile0->mem.ggtt->gsm; + u64 pte; + + gte += base / XE_PAGE_SIZE; + + pte = ioread64(gte); + if (!(pte & XE_GGTT_PTE_DM)) { + drm_err(&xe->drm, + "Initial plane programming missing DM bit\n"); + return NULL; + } + + phys_base = pte & ~(page_size - 1); + flags |= XE_BO_CREATE_VRAM0_BIT; + + /* + * We don't currently expect this to ever be placed in the + * stolen portion. 
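+ *
+ * As a worked example of the decode above: with 4 KiB GGTT pages, a PTE
+ * value of 0x12345003 (valid and device-memory bits set in the low
+ * bits) gives phys_base = pte & ~(page_size - 1) = 0x12345000, which
+ * must still land below usable_size to be accepted here.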
+ */ + if (phys_base >= tile0->mem.vram.usable_size) { + drm_err(&xe->drm, + "Initial plane programming using invalid range, phys_base=%pa\n", + &phys_base); + return NULL; + } + + drm_dbg(&xe->drm, + "Using phys_base=%pa, based on initial plane programming\n", + &phys_base); + } else { + struct ttm_resource_manager *stolen = ttm_manager_type(&xe->ttm, XE_PL_STOLEN); + + if (!stolen) + return NULL; + phys_base = base; + flags |= XE_BO_CREATE_STOLEN_BIT; + + /* + * If the FB is too big, just don't use it since fbdev is not very + * important and we should probably use that space with FBC or other + * features. + */ + if (IS_ENABLED(CONFIG_FRAMEBUFFER_CONSOLE) && + plane_config->size * 2 >> PAGE_SHIFT >= stolen->size) + return NULL; + } + + size = round_up(plane_config->base + plane_config->size, + page_size); + size -= base; + + bo = xe_bo_create_pin_map_at(xe, tile0, NULL, size, phys_base, + ttm_bo_type_kernel, flags); + if (IS_ERR(bo)) { + drm_dbg(&xe->drm, + "Failed to create bo phys_base=%pa size %u with flags %x: %li\n", + &phys_base, size, flags, PTR_ERR(bo)); + return NULL; + } + + return bo; +} + +static bool +intel_alloc_initial_plane_obj(struct intel_crtc *crtc, + struct intel_initial_plane_config *plane_config) +{ + struct drm_device *dev = crtc->base.dev; + struct drm_i915_private *dev_priv = to_i915(dev); + struct drm_mode_fb_cmd2 mode_cmd = { 0 }; + struct drm_framebuffer *fb = &plane_config->fb->base; + struct xe_bo *bo; + + switch (fb->modifier) { + case DRM_FORMAT_MOD_LINEAR: + case I915_FORMAT_MOD_X_TILED: + case I915_FORMAT_MOD_Y_TILED: + case I915_FORMAT_MOD_4_TILED: + break; + default: + drm_dbg(&dev_priv->drm, + "Unsupported modifier for initial FB: 0x%llx\n", + fb->modifier); + return false; + } + + mode_cmd.pixel_format = fb->format->format; + mode_cmd.width = fb->width; + mode_cmd.height = fb->height; + mode_cmd.pitches[0] = fb->pitches[0]; + mode_cmd.modifier[0] = fb->modifier; + mode_cmd.flags = DRM_MODE_FB_MODIFIERS; + + bo = initial_plane_bo(dev_priv, plane_config); + if (!bo) + return false; + + if (intel_framebuffer_init(to_intel_framebuffer(fb), + bo, &mode_cmd)) { + drm_dbg_kms(&dev_priv->drm, "intel fb init failed\n"); + goto err_bo; + } + /* Reference handed over to fb */ + xe_bo_put(bo); + + return true; + +err_bo: + xe_bo_unpin_map_no_vm(bo); + return false; +} + +static void +intel_find_initial_plane_obj(struct intel_crtc *crtc, + struct intel_initial_plane_config *plane_config) +{ + struct drm_device *dev = crtc->base.dev; + struct drm_i915_private *dev_priv = to_i915(dev); + struct intel_plane *plane = + to_intel_plane(crtc->base.primary); + struct intel_plane_state *plane_state = + to_intel_plane_state(plane->base.state); + struct intel_crtc_state *crtc_state = + to_intel_crtc_state(crtc->base.state); + struct drm_framebuffer *fb; + struct i915_vma *vma; + + /* + * TODO: + * Disable planes if get_initial_plane_config() failed. + * Make sure things work if the surface base is not page aligned. 
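+ *
+ * For the alignment TODO, one possible (currently unimplemented)
+ * approach would be to keep mapping from the rounded-down base, as
+ * initial_plane_bo() already does, and carry the sub-page remainder
+ * (plane_config->base & (page_size - 1), with page_size as used there)
+ * as an additional source offset when filling in the plane view below.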
+ */ + if (!plane_config->fb) + return; + + if (intel_alloc_initial_plane_obj(crtc, plane_config)) + fb = &plane_config->fb->base; + else if (!intel_reuse_initial_plane_obj(dev_priv, plane_config, &fb)) + goto nofb; + + plane_state->uapi.rotation = plane_config->rotation; + intel_fb_fill_view(to_intel_framebuffer(fb), + plane_state->uapi.rotation, &plane_state->view); + + vma = intel_pin_and_fence_fb_obj(fb, false, &plane_state->view.gtt, + false, &plane_state->flags); + if (IS_ERR(vma)) + goto nofb; + + plane_state->ggtt_vma = vma; + plane_state->uapi.src_x = 0; + plane_state->uapi.src_y = 0; + plane_state->uapi.src_w = fb->width << 16; + plane_state->uapi.src_h = fb->height << 16; + + plane_state->uapi.crtc_x = 0; + plane_state->uapi.crtc_y = 0; + plane_state->uapi.crtc_w = fb->width; + plane_state->uapi.crtc_h = fb->height; + + plane_state->uapi.fb = fb; + drm_framebuffer_get(fb); + + plane_state->uapi.crtc = &crtc->base; + intel_plane_copy_uapi_to_hw_state(plane_state, plane_state, crtc); + + atomic_or(plane->frontbuffer_bit, &to_intel_frontbuffer(fb)->bits); + + plane_config->vma = vma; + + /* + * Flip to the newly created mapping ASAP, so we can re-use the + * first part of GGTT for WOPCM, prevent flickering, and prevent + * the lookup of sysmem scratch pages. + */ + plane->check_plane(crtc_state, plane_state); + plane->async_flip(plane, crtc_state, plane_state, true); + return; + +nofb: + /* + * We've failed to reconstruct the BIOS FB. Current display state + * indicates that the primary plane is visible, but has a NULL FB, + * which will lead to problems later if we don't fix it up. The + * simplest solution is to just disable the primary plane now and + * pretend the BIOS never had it enabled. + */ + intel_plane_disable_noatomic(crtc, plane); +} + +static void plane_config_fini(struct intel_initial_plane_config *plane_config) +{ + if (plane_config->fb) { + struct drm_framebuffer *fb = &plane_config->fb->base; + + /* We may only have the stub and not a full framebuffer */ + if (drm_framebuffer_read_refcount(fb)) + drm_framebuffer_put(fb); + else + kfree(fb); + } +} + +void intel_crtc_initial_plane_config(struct intel_crtc *crtc) +{ + struct xe_device *xe = to_xe_device(crtc->base.dev); + struct intel_initial_plane_config plane_config = {}; + + /* + * Note that reserving the BIOS fb up front prevents us + * from stuffing other stolen allocations like the ring + * on top. This prevents some ugliness at boot time, and + * can even allow for smooth boot transitions if the BIOS + * fb is large enough for the active pipe configuration. + */ + xe->display.funcs.display->get_initial_plane_config(crtc, &plane_config); + + /* + * If the fb is shared between multiple heads, we'll + * just get the first one. 
+ */ + intel_find_initial_plane_obj(crtc, &plane_config); + + plane_config_fini(&plane_config); +} diff --git a/drivers/gpu/drm/xe/instructions/xe_gfxpipe_commands.h b/drivers/gpu/drm/xe/instructions/xe_gfxpipe_commands.h new file mode 100644 index 000000000000..8e6dd061f2ae --- /dev/null +++ b/drivers/gpu/drm/xe/instructions/xe_gfxpipe_commands.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GFXPIPE_COMMANDS_H_ +#define _XE_GFXPIPE_COMMANDS_H_ + +#include "instructions/xe_instr_defs.h" + +#define GFXPIPE_PIPELINE REG_GENMASK(28, 27) +#define PIPELINE_COMMON REG_FIELD_PREP(GFXPIPE_PIPELINE, 0x0) +#define PIPELINE_SINGLE_DW REG_FIELD_PREP(GFXPIPE_PIPELINE, 0x1) +#define PIPELINE_COMPUTE REG_FIELD_PREP(GFXPIPE_PIPELINE, 0x2) +#define PIPELINE_3D REG_FIELD_PREP(GFXPIPE_PIPELINE, 0x3) + +#define GFXPIPE_OPCODE REG_GENMASK(26, 24) +#define GFXPIPE_SUBOPCODE REG_GENMASK(23, 16) + +#define GFXPIPE_MATCH_MASK (XE_INSTR_CMD_TYPE | \ + GFXPIPE_PIPELINE | \ + GFXPIPE_OPCODE | \ + GFXPIPE_SUBOPCODE) + +#define GFXPIPE_COMMON_CMD(opcode, subopcode) \ + (XE_INSTR_GFXPIPE | PIPELINE_COMMON | \ + REG_FIELD_PREP(GFXPIPE_OPCODE, opcode) | \ + REG_FIELD_PREP(GFXPIPE_SUBOPCODE, subopcode)) + +#define GFXPIPE_SINGLE_DW_CMD(opcode, subopcode) \ + (XE_INSTR_GFXPIPE | PIPELINE_SINGLE_DW | \ + REG_FIELD_PREP(GFXPIPE_OPCODE, opcode) | \ + REG_FIELD_PREP(GFXPIPE_SUBOPCODE, subopcode)) + +#define GFXPIPE_3D_CMD(opcode, subopcode) \ + (XE_INSTR_GFXPIPE | PIPELINE_3D | \ + REG_FIELD_PREP(GFXPIPE_OPCODE, opcode) | \ + REG_FIELD_PREP(GFXPIPE_SUBOPCODE, subopcode)) + +#define GFXPIPE_COMPUTE_CMD(opcode, subopcode) \ + (XE_INSTR_GFXPIPE | PIPELINE_COMPUTE | \ + REG_FIELD_PREP(GFXPIPE_OPCODE, opcode) | \ + REG_FIELD_PREP(GFXPIPE_SUBOPCODE, subopcode)) + +#define STATE_BASE_ADDRESS GFXPIPE_COMMON_CMD(0x1, 0x1) +#define STATE_SIP GFXPIPE_COMMON_CMD(0x1, 0x2) +#define GPGPU_CSR_BASE_ADDRESS GFXPIPE_COMMON_CMD(0x1, 0x4) +#define STATE_COMPUTE_MODE GFXPIPE_COMMON_CMD(0x1, 0x5) +#define CMD_3DSTATE_BTD GFXPIPE_COMMON_CMD(0x1, 0x6) + +#define CMD_3DSTATE_VF_STATISTICS GFXPIPE_SINGLE_DW_CMD(0x0, 0xB) + +#define PIPELINE_SELECT GFXPIPE_SINGLE_DW_CMD(0x1, 0x4) + +#define CMD_3DSTATE_DRAWING_RECTANGLE_FAST GFXPIPE_3D_CMD(0x0, 0x0) +#define CMD_3DSTATE_CLEAR_PARAMS GFXPIPE_3D_CMD(0x0, 0x4) +#define CMD_3DSTATE_DEPTH_BUFFER GFXPIPE_3D_CMD(0x0, 0x5) +#define CMD_3DSTATE_STENCIL_BUFFER GFXPIPE_3D_CMD(0x0, 0x6) +#define CMD_3DSTATE_HIER_DEPTH_BUFFER GFXPIPE_3D_CMD(0x0, 0x7) +#define CMD_3DSTATE_VERTEX_BUFFERS GFXPIPE_3D_CMD(0x0, 0x8) +#define CMD_3DSTATE_VERTEX_ELEMENTS GFXPIPE_3D_CMD(0x0, 0x9) +#define CMD_3DSTATE_INDEX_BUFFER GFXPIPE_3D_CMD(0x0, 0xA) +#define CMD_3DSTATE_VF GFXPIPE_3D_CMD(0x0, 0xC) +#define CMD_3DSTATE_MULTISAMPLE GFXPIPE_3D_CMD(0x0, 0xD) +#define CMD_3DSTATE_CC_STATE_POINTERS GFXPIPE_3D_CMD(0x0, 0xE) +#define CMD_3DSTATE_SCISSOR_STATE_POINTERS GFXPIPE_3D_CMD(0x0, 0xF) +#define CMD_3DSTATE_VS GFXPIPE_3D_CMD(0x0, 0x10) +#define CMD_3DSTATE_GS GFXPIPE_3D_CMD(0x0, 0x11) +#define CMD_3DSTATE_CLIP GFXPIPE_3D_CMD(0x0, 0x12) +#define CMD_3DSTATE_SF GFXPIPE_3D_CMD(0x0, 0x13) +#define CMD_3DSTATE_WM GFXPIPE_3D_CMD(0x0, 0x14) +#define CMD_3DSTATE_CONSTANT_VS GFXPIPE_3D_CMD(0x0, 0x15) +#define CMD_3DSTATE_CONSTANT_GS GFXPIPE_3D_CMD(0x0, 0x16) +#define CMD_3DSTATE_SAMPLE_MASK GFXPIPE_3D_CMD(0x0, 0x18) +#define CMD_3DSTATE_CONSTANT_HS GFXPIPE_3D_CMD(0x0, 0x19) +#define CMD_3DSTATE_CONSTANT_DS GFXPIPE_3D_CMD(0x0, 0x1A) +#define CMD_3DSTATE_HS GFXPIPE_3D_CMD(0x0, 
0x1B) +#define CMD_3DSTATE_TE GFXPIPE_3D_CMD(0x0, 0x1C) +#define CMD_3DSTATE_DS GFXPIPE_3D_CMD(0x0, 0x1D) +#define CMD_3DSTATE_STREAMOUT GFXPIPE_3D_CMD(0x0, 0x1E) +#define CMD_3DSTATE_SBE GFXPIPE_3D_CMD(0x0, 0x1F) +#define CMD_3DSTATE_PS GFXPIPE_3D_CMD(0x0, 0x20) +#define CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP GFXPIPE_3D_CMD(0x0, 0x21) +#define CMD_3DSTATE_CPS_POINTERS GFXPIPE_3D_CMD(0x0, 0x22) +#define CMD_3DSTATE_VIEWPORT_STATE_POINTERS_CC GFXPIPE_3D_CMD(0x0, 0x23) +#define CMD_3DSTATE_BLEND_STATE_POINTERS GFXPIPE_3D_CMD(0x0, 0x24) +#define CMD_3DSTATE_BINDING_TABLE_POINTERS_VS GFXPIPE_3D_CMD(0x0, 0x26) +#define CMD_3DSTATE_BINDING_TABLE_POINTERS_HS GFXPIPE_3D_CMD(0x0, 0x27) +#define CMD_3DSTATE_BINDING_TABLE_POINTERS_DS GFXPIPE_3D_CMD(0x0, 0x28) +#define CMD_3DSTATE_BINDING_TABLE_POINTERS_GS GFXPIPE_3D_CMD(0x0, 0x29) +#define CMD_3DSTATE_BINDING_TABLE_POINTERS_PS GFXPIPE_3D_CMD(0x0, 0x2A) +#define CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS GFXPIPE_3D_CMD(0x0, 0x2B) +#define CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS GFXPIPE_3D_CMD(0x0, 0x2C) +#define CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS GFXPIPE_3D_CMD(0x0, 0x2D) +#define CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS GFXPIPE_3D_CMD(0x0, 0x2E) +#define CMD_3DSTATE_SAMPLER_STATE_POINTERS_PS GFXPIPE_3D_CMD(0x0, 0x2F) +#define CMD_3DSTATE_VF_INSTANCING GFXPIPE_3D_CMD(0x0, 0x49) +#define CMD_3DSTATE_VF_SGVS GFXPIPE_3D_CMD(0x0, 0x4A) +#define CMD_3DSTATE_VF_TOPOLOGY GFXPIPE_3D_CMD(0x0, 0x4B) +#define CMD_3DSTATE_WM_CHROMAKEY GFXPIPE_3D_CMD(0x0, 0x4C) +#define CMD_3DSTATE_PS_BLEND GFXPIPE_3D_CMD(0x0, 0x4D) +#define CMD_3DSTATE_WM_DEPTH_STENCIL GFXPIPE_3D_CMD(0x0, 0x4E) +#define CMD_3DSTATE_PS_EXTRA GFXPIPE_3D_CMD(0x0, 0x4F) +#define CMD_3DSTATE_RASTER GFXPIPE_3D_CMD(0x0, 0x50) +#define CMD_3DSTATE_SBE_SWIZ GFXPIPE_3D_CMD(0x0, 0x51) +#define CMD_3DSTATE_WM_HZ_OP GFXPIPE_3D_CMD(0x0, 0x52) +#define CMD_3DSTATE_VF_COMPONENT_PACKING GFXPIPE_3D_CMD(0x0, 0x55) +#define CMD_3DSTATE_VF_SGVS_2 GFXPIPE_3D_CMD(0x0, 0x56) +#define CMD_3DSTATE_VFG GFXPIPE_3D_CMD(0x0, 0x57) +#define CMD_3DSTATE_URB_ALLOC_VS GFXPIPE_3D_CMD(0x0, 0x58) +#define CMD_3DSTATE_URB_ALLOC_HS GFXPIPE_3D_CMD(0x0, 0x59) +#define CMD_3DSTATE_URB_ALLOC_DS GFXPIPE_3D_CMD(0x0, 0x5A) +#define CMD_3DSTATE_URB_ALLOC_GS GFXPIPE_3D_CMD(0x0, 0x5B) +#define CMD_3DSTATE_SO_BUFFER_INDEX_0 GFXPIPE_3D_CMD(0x0, 0x60) +#define CMD_3DSTATE_SO_BUFFER_INDEX_1 GFXPIPE_3D_CMD(0x0, 0x61) +#define CMD_3DSTATE_SO_BUFFER_INDEX_2 GFXPIPE_3D_CMD(0x0, 0x62) +#define CMD_3DSTATE_SO_BUFFER_INDEX_3 GFXPIPE_3D_CMD(0x0, 0x63) +#define CMD_3DSTATE_PRIMITIVE_REPLICATION GFXPIPE_3D_CMD(0x0, 0x6C) +#define CMD_3DSTATE_TBIMR_TILE_PASS_INFO GFXPIPE_3D_CMD(0x0, 0x6E) +#define CMD_3DSTATE_AMFS GFXPIPE_3D_CMD(0x0, 0x6F) +#define CMD_3DSTATE_DEPTH_BOUNDS GFXPIPE_3D_CMD(0x0, 0x71) +#define CMD_3DSTATE_AMFS_TEXTURE_POINTERS GFXPIPE_3D_CMD(0x0, 0x72) +#define CMD_3DSTATE_CONSTANT_TS_POINTER GFXPIPE_3D_CMD(0x0, 0x73) +#define CMD_3DSTATE_MESH_CONTROL GFXPIPE_3D_CMD(0x0, 0x77) +#define CMD_3DSTATE_MESH_DISTRIB GFXPIPE_3D_CMD(0x0, 0x78) +#define CMD_3DSTATE_TASK_REDISTRIB GFXPIPE_3D_CMD(0x0, 0x79) +#define CMD_3DSTATE_MESH_SHADER GFXPIPE_3D_CMD(0x0, 0x7A) +#define CMD_3DSTATE_MESH_SHADER_DATA GFXPIPE_3D_CMD(0x0, 0x7B) +#define CMD_3DSTATE_TASK_CONTROL GFXPIPE_3D_CMD(0x0, 0x7C) +#define CMD_3DSTATE_TASK_SHADER GFXPIPE_3D_CMD(0x0, 0x7D) +#define CMD_3DSTATE_TASK_SHADER_DATA GFXPIPE_3D_CMD(0x0, 0x7E) +#define CMD_3DSTATE_URB_ALLOC_MESH GFXPIPE_3D_CMD(0x0, 0x7F) +#define CMD_3DSTATE_URB_ALLOC_TASK GFXPIPE_3D_CMD(0x0, 0x80) +#define 
CMD_3DSTATE_CLIP_MESH GFXPIPE_3D_CMD(0x0, 0x81) +#define CMD_3DSTATE_SBE_MESH GFXPIPE_3D_CMD(0x0, 0x82) +#define CMD_3DSTATE_CPSIZE_CONTROL_BUFFER GFXPIPE_3D_CMD(0x0, 0x83) + +#define CMD_3DSTATE_DRAWING_RECTANGLE GFXPIPE_3D_CMD(0x1, 0x0) +#define CMD_3DSTATE_CHROMA_KEY GFXPIPE_3D_CMD(0x1, 0x4) +#define CMD_3DSTATE_POLY_STIPPLE_OFFSET GFXPIPE_3D_CMD(0x1, 0x6) +#define CMD_3DSTATE_POLY_STIPPLE_PATTERN GFXPIPE_3D_CMD(0x1, 0x7) +#define CMD_3DSTATE_LINE_STIPPLE GFXPIPE_3D_CMD(0x1, 0x8) +#define CMD_3DSTATE_AA_LINE_PARAMETERS GFXPIPE_3D_CMD(0x1, 0xA) +#define CMD_3DSTATE_MONOFILTER_SIZE GFXPIPE_3D_CMD(0x1, 0x11) +#define CMD_3DSTATE_PUSH_CONSTANT_ALLOC_VS GFXPIPE_3D_CMD(0x1, 0x12) +#define CMD_3DSTATE_PUSH_CONSTANT_ALLOC_HS GFXPIPE_3D_CMD(0x1, 0x13) +#define CMD_3DSTATE_PUSH_CONSTANT_ALLOC_DS GFXPIPE_3D_CMD(0x1, 0x14) +#define CMD_3DSTATE_PUSH_CONSTANT_ALLOC_GS GFXPIPE_3D_CMD(0x1, 0x15) +#define CMD_3DSTATE_PUSH_CONSTANT_ALLOC_PS GFXPIPE_3D_CMD(0x1, 0x16) +#define CMD_3DSTATE_SO_DECL_LIST GFXPIPE_3D_CMD(0x1, 0x17) +#define CMD_3DSTATE_SO_DECL_LIST_DW_LEN REG_GENMASK(8, 0) +#define CMD_3DSTATE_SO_BUFFER GFXPIPE_3D_CMD(0x1, 0x18) +#define CMD_3DSTATE_BINDING_TABLE_POOL_ALLOC GFXPIPE_3D_CMD(0x1, 0x19) +#define CMD_3DSTATE_SAMPLE_PATTERN GFXPIPE_3D_CMD(0x1, 0x1C) +#define CMD_3DSTATE_3D_MODE GFXPIPE_3D_CMD(0x1, 0x1E) +#define CMD_3DSTATE_SUBSLICE_HASH_TABLE GFXPIPE_3D_CMD(0x1, 0x1F) +#define CMD_3DSTATE_SLICE_TABLE_STATE_POINTERS GFXPIPE_3D_CMD(0x1, 0x20) +#define CMD_3DSTATE_PTBR_TILE_PASS_INFO GFXPIPE_3D_CMD(0x1, 0x22) + +#endif diff --git a/drivers/gpu/drm/xe/instructions/xe_gsc_commands.h b/drivers/gpu/drm/xe/instructions/xe_gsc_commands.h new file mode 100644 index 000000000000..f8949cad9d0f --- /dev/null +++ b/drivers/gpu/drm/xe/instructions/xe_gsc_commands.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GSC_COMMANDS_H_ +#define _XE_GSC_COMMANDS_H_ + +#include "instructions/xe_instr_defs.h" + +/* + * All GSCCS-specific commands have fixed length, so we can include it in the + * defines. Note that the generic GSC command header structure includes an + * optional data field in bits 9-21, but there are no commands that actually use + * it; some of the commands are instead defined as having an extended length + * field spanning bits 0-15, even if the extra bits are not required because the + * longest GSCCS command is only 8 dwords. To handle this, the defines below use + * a single field for both data and len. If we ever get a commands that does + * actually have data and this approach doesn't work for it we can re-work it + * at that point. + */ + +#define GSC_OPCODE REG_GENMASK(28, 22) +#define GSC_CMD_DATA_AND_LEN REG_GENMASK(21, 0) + +#define __GSC_INSTR(op, dl) \ + (XE_INSTR_GSC | \ + REG_FIELD_PREP(GSC_OPCODE, op) | \ + REG_FIELD_PREP(GSC_CMD_DATA_AND_LEN, dl)) + +#define GSC_HECI_CMD_PKT __GSC_INSTR(0, 6) + +#define GSC_FW_LOAD __GSC_INSTR(1, 2) +#define GSC_FW_LOAD_LIMIT_VALID REG_BIT(31) + +#endif diff --git a/drivers/gpu/drm/xe/instructions/xe_instr_defs.h b/drivers/gpu/drm/xe/instructions/xe_instr_defs.h new file mode 100644 index 000000000000..04179b2a48e1 --- /dev/null +++ b/drivers/gpu/drm/xe/instructions/xe_instr_defs.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_INSTR_DEFS_H_ +#define _XE_INSTR_DEFS_H_ + +#include "regs/xe_reg_defs.h" + +/* + * The first dword of any GPU instruction is the "instruction header." 
Bits + * 31:29 identify the general type of the command and determine how exact + * opcodes and sub-opcodes will be encoded in the remaining bits. + */ +#define XE_INSTR_CMD_TYPE GENMASK(31, 29) +#define XE_INSTR_MI REG_FIELD_PREP(XE_INSTR_CMD_TYPE, 0x0) +#define XE_INSTR_GSC REG_FIELD_PREP(XE_INSTR_CMD_TYPE, 0x2) +#define XE_INSTR_GFXPIPE REG_FIELD_PREP(XE_INSTR_CMD_TYPE, 0x3) + +/* + * Most (but not all) instructions have a "length" field in the instruction + * header. The value expected is the total number of dwords for the + * instruction, minus two. + * + * Some instructions have length fields longer or shorter than 8 bits, but + * those are rare. This definition can be used for the common case where + * the length field is from 7:0. + */ +#define XE_INSTR_LEN_MASK GENMASK(7, 0) +#define XE_INSTR_NUM_DW(x) REG_FIELD_PREP(XE_INSTR_LEN_MASK, (x) - 2) + +#endif diff --git a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h new file mode 100644 index 000000000000..1cfa96167fde --- /dev/null +++ b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_MI_COMMANDS_H_ +#define _XE_MI_COMMANDS_H_ + +#include "instructions/xe_instr_defs.h" + +/* + * MI (Memory Interface) commands are supported by all GT engines. They + * provide general memory operations and command streamer control. MI commands + * have a command type of 0x0 (MI_COMMAND) in bits 31:29 of the instruction + * header dword and a specific MI opcode in bits 28:23. + */ + +#define MI_OPCODE REG_GENMASK(28, 23) +#define MI_SUBOPCODE REG_GENMASK(22, 17) /* used with MI_EXPANSION */ + +#define __MI_INSTR(opcode) \ + (XE_INSTR_MI | REG_FIELD_PREP(MI_OPCODE, opcode)) + +#define MI_NOOP __MI_INSTR(0x0) +#define MI_USER_INTERRUPT __MI_INSTR(0x2) +#define MI_ARB_CHECK __MI_INSTR(0x5) + +#define MI_ARB_ON_OFF __MI_INSTR(0x8) +#define MI_ARB_ENABLE REG_BIT(0) +#define MI_ARB_DISABLE 0x0 + +#define MI_BATCH_BUFFER_END __MI_INSTR(0xA) +#define MI_TOPOLOGY_FILTER __MI_INSTR(0xD) +#define MI_FORCE_WAKEUP __MI_INSTR(0x1D) + +#define MI_STORE_DATA_IMM __MI_INSTR(0x20) +#define MI_SDI_GGTT REG_BIT(22) +#define MI_SDI_LEN_DW GENMASK(9, 0) +#define MI_SDI_NUM_DW(x) REG_FIELD_PREP(MI_SDI_LEN_DW, (x) + 3 - 2) +#define MI_SDI_NUM_QW(x) (REG_FIELD_PREP(MI_SDI_LEN_DW, 2 * (x) + 3 - 2) | \ + REG_BIT(21)) + +#define MI_LOAD_REGISTER_IMM __MI_INSTR(0x22) +#define MI_LRI_LRM_CS_MMIO REG_BIT(19) +#define MI_LRI_MMIO_REMAP_EN REG_BIT(17) +#define MI_LRI_NUM_REGS(x) XE_INSTR_NUM_DW(2 * (x) + 1) +#define MI_LRI_FORCE_POSTED REG_BIT(12) + +#define MI_FLUSH_DW __MI_INSTR(0x26) +#define MI_FLUSH_DW_STORE_INDEX REG_BIT(21) +#define MI_INVALIDATE_TLB REG_BIT(18) +#define MI_FLUSH_DW_CCS REG_BIT(16) +#define MI_FLUSH_DW_OP_STOREDW REG_BIT(14) +#define MI_FLUSH_DW_LEN_DW REG_GENMASK(5, 0) +#define MI_FLUSH_IMM_DW REG_FIELD_PREP(MI_FLUSH_DW_LEN_DW, 4 - 2) +#define MI_FLUSH_IMM_QW REG_FIELD_PREP(MI_FLUSH_DW_LEN_DW, 5 - 2) +#define MI_FLUSH_DW_USE_GTT REG_BIT(2) + +#define MI_BATCH_BUFFER_START __MI_INSTR(0x31) + +#endif diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h new file mode 100644 index 000000000000..5592774fc690 --- /dev/null +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_ENGINE_REGS_H_ +#define _XE_ENGINE_REGS_H_ + +#include <asm/page.h> + 
+#include "regs/xe_reg_defs.h" + +/* + * These *_BASE values represent the MMIO offset where each hardware engine's + * registers start. The other definitions in this header are parameterized + * macros that will take one of these values as a parameter. + */ +#define RENDER_RING_BASE 0x02000 +#define BSD_RING_BASE 0x1c0000 +#define BSD2_RING_BASE 0x1c4000 +#define BSD3_RING_BASE 0x1d0000 +#define BSD4_RING_BASE 0x1d4000 +#define XEHP_BSD5_RING_BASE 0x1e0000 +#define XEHP_BSD6_RING_BASE 0x1e4000 +#define XEHP_BSD7_RING_BASE 0x1f0000 +#define XEHP_BSD8_RING_BASE 0x1f4000 +#define VEBOX_RING_BASE 0x1c8000 +#define VEBOX2_RING_BASE 0x1d8000 +#define XEHP_VEBOX3_RING_BASE 0x1e8000 +#define XEHP_VEBOX4_RING_BASE 0x1f8000 +#define COMPUTE0_RING_BASE 0x1a000 +#define COMPUTE1_RING_BASE 0x1c000 +#define COMPUTE2_RING_BASE 0x1e000 +#define COMPUTE3_RING_BASE 0x26000 +#define BLT_RING_BASE 0x22000 +#define XEHPC_BCS1_RING_BASE 0x3e0000 +#define XEHPC_BCS2_RING_BASE 0x3e2000 +#define XEHPC_BCS3_RING_BASE 0x3e4000 +#define XEHPC_BCS4_RING_BASE 0x3e6000 +#define XEHPC_BCS5_RING_BASE 0x3e8000 +#define XEHPC_BCS6_RING_BASE 0x3ea000 +#define XEHPC_BCS7_RING_BASE 0x3ec000 +#define XEHPC_BCS8_RING_BASE 0x3ee000 +#define GSCCS_RING_BASE 0x11a000 + +#define RING_TAIL(base) XE_REG((base) + 0x30) + +#define RING_HEAD(base) XE_REG((base) + 0x34) +#define HEAD_ADDR 0x001FFFFC + +#define RING_START(base) XE_REG((base) + 0x38) + +#define RING_CTL(base) XE_REG((base) + 0x3c) +#define RING_CTL_SIZE(size) ((size) - PAGE_SIZE) /* in bytes -> pages */ +#define RING_CTL_SIZE(size) ((size) - PAGE_SIZE) /* in bytes -> pages */ + +#define RING_PSMI_CTL(base) XE_REG((base) + 0x50, XE_REG_OPTION_MASKED) +#define RC_SEMA_IDLE_MSG_DISABLE REG_BIT(12) +#define WAIT_FOR_EVENT_POWER_DOWN_DISABLE REG_BIT(7) +#define IDLE_MSG_DISABLE REG_BIT(0) + +#define RING_PWRCTX_MAXCNT(base) XE_REG((base) + 0x54) +#define IDLE_WAIT_TIME REG_GENMASK(19, 0) + +#define RING_ACTHD_UDW(base) XE_REG((base) + 0x5c) +#define RING_DMA_FADD_UDW(base) XE_REG((base) + 0x60) +#define RING_IPEHR(base) XE_REG((base) + 0x68) +#define RING_ACTHD(base) XE_REG((base) + 0x74) +#define RING_DMA_FADD(base) XE_REG((base) + 0x78) +#define RING_HWS_PGA(base) XE_REG((base) + 0x80) +#define RING_HWSTAM(base) XE_REG((base) + 0x98) +#define RING_MI_MODE(base) XE_REG((base) + 0x9c) +#define RING_NOPID(base) XE_REG((base) + 0x94) + +#define FF_THREAD_MODE(base) XE_REG((base) + 0xa0) +#define FF_TESSELATION_DOP_GATE_DISABLE BIT(19) + +#define RING_IMR(base) XE_REG((base) + 0xa8) + +#define RING_EIR(base) XE_REG((base) + 0xb0) +#define RING_EMR(base) XE_REG((base) + 0xb4) +#define RING_ESR(base) XE_REG((base) + 0xb8) + +#define RING_CMD_CCTL(base) XE_REG((base) + 0xc4, XE_REG_OPTION_MASKED) +/* + * CMD_CCTL read/write fields take a MOCS value and _not_ a table index. + * The lsb of each can be considered a separate enabling bit for encryption. + * 6:0 == default MOCS value for reads => 6:1 == table index for reads. + * 13:7 == default MOCS value for writes => 13:8 == table index for writes. + * 15:14 == Reserved => 31:30 are set to 0. 
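+ *
+ * For example, forcing reads to MOCS table index 5 is programmed as
+ * REG_FIELD_PREP(CMD_CCTL_READ_OVERRIDE_MASK, 5); since the mask starts
+ * at bit 1, the encryption-enable lsb of the read field stays clear.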
+ */ +#define CMD_CCTL_WRITE_OVERRIDE_MASK REG_GENMASK(13, 8) +#define CMD_CCTL_READ_OVERRIDE_MASK REG_GENMASK(6, 1) + +#define CSFE_CHICKEN1(base) XE_REG((base) + 0xd4, XE_REG_OPTION_MASKED) +#define GHWSP_CSB_REPORT_DIS REG_BIT(15) +#define PPHWSP_CSB_AND_TIMESTAMP_REPORT_DIS REG_BIT(14) + +#define FF_SLICE_CS_CHICKEN1(base) XE_REG((base) + 0xe0, XE_REG_OPTION_MASKED) +#define FFSC_PERCTX_PREEMPT_CTRL REG_BIT(14) + +#define FF_SLICE_CS_CHICKEN2(base) XE_REG((base) + 0xe4, XE_REG_OPTION_MASKED) +#define PERF_FIX_BALANCING_CFE_DISABLE REG_BIT(15) + +#define CS_DEBUG_MODE1(base) XE_REG((base) + 0xec, XE_REG_OPTION_MASKED) +#define FF_DOP_CLOCK_GATE_DISABLE REG_BIT(1) +#define REPLAY_MODE_GRANULARITY REG_BIT(0) + +#define RING_BBADDR(base) XE_REG((base) + 0x140) +#define RING_BBADDR_UDW(base) XE_REG((base) + 0x168) + +#define BCS_SWCTRL(base) XE_REG((base) + 0x200, XE_REG_OPTION_MASKED) +#define BCS_SWCTRL_DISABLE_256B REG_BIT(2) + +/* Handling MOCS value in BLIT_CCTL like it was done CMD_CCTL */ +#define BLIT_CCTL(base) XE_REG((base) + 0x204) +#define BLIT_CCTL_DST_MOCS_MASK REG_GENMASK(14, 9) +#define BLIT_CCTL_SRC_MOCS_MASK REG_GENMASK(6, 1) + +#define RING_EXECLIST_STATUS_LO(base) XE_REG((base) + 0x234) +#define RING_EXECLIST_STATUS_HI(base) XE_REG((base) + 0x234 + 4) + +#define RING_CONTEXT_CONTROL(base) XE_REG((base) + 0x244) +#define CTX_CTRL_INHIBIT_SYN_CTX_SWITCH REG_BIT(3) +#define CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT REG_BIT(0) + +#define RING_MODE(base) XE_REG((base) + 0x29c) +#define GFX_DISABLE_LEGACY_MODE REG_BIT(3) + +#define RING_TIMESTAMP(base) XE_REG((base) + 0x358) + +#define RING_TIMESTAMP_UDW(base) XE_REG((base) + 0x358 + 4) +#define RING_VALID_MASK 0x00000001 +#define RING_VALID 0x00000001 +#define STOP_RING REG_BIT(8) +#define TAIL_ADDR 0x001FFFF8 + +#define RING_CTX_TIMESTAMP(base) XE_REG((base) + 0x3a8) + +#define RING_FORCE_TO_NONPRIV(base, i) XE_REG(((base) + 0x4d0) + (i) * 4) +#define RING_FORCE_TO_NONPRIV_DENY REG_BIT(30) +#define RING_FORCE_TO_NONPRIV_ACCESS_MASK REG_GENMASK(29, 28) +#define RING_FORCE_TO_NONPRIV_ACCESS_RW REG_FIELD_PREP(RING_FORCE_TO_NONPRIV_ACCESS_MASK, 0) +#define RING_FORCE_TO_NONPRIV_ACCESS_RD REG_FIELD_PREP(RING_FORCE_TO_NONPRIV_ACCESS_MASK, 1) +#define RING_FORCE_TO_NONPRIV_ACCESS_WR REG_FIELD_PREP(RING_FORCE_TO_NONPRIV_ACCESS_MASK, 2) +#define RING_FORCE_TO_NONPRIV_ACCESS_INVALID REG_FIELD_PREP(RING_FORCE_TO_NONPRIV_ACCESS_MASK, 3) +#define RING_FORCE_TO_NONPRIV_ADDRESS_MASK REG_GENMASK(25, 2) +#define RING_FORCE_TO_NONPRIV_RANGE_MASK REG_GENMASK(1, 0) +#define RING_FORCE_TO_NONPRIV_RANGE_1 REG_FIELD_PREP(RING_FORCE_TO_NONPRIV_RANGE_MASK, 0) +#define RING_FORCE_TO_NONPRIV_RANGE_4 REG_FIELD_PREP(RING_FORCE_TO_NONPRIV_RANGE_MASK, 1) +#define RING_FORCE_TO_NONPRIV_RANGE_16 REG_FIELD_PREP(RING_FORCE_TO_NONPRIV_RANGE_MASK, 2) +#define RING_FORCE_TO_NONPRIV_RANGE_64 REG_FIELD_PREP(RING_FORCE_TO_NONPRIV_RANGE_MASK, 3) +#define RING_FORCE_TO_NONPRIV_MASK_VALID (RING_FORCE_TO_NONPRIV_RANGE_MASK | \ + RING_FORCE_TO_NONPRIV_ACCESS_MASK | \ + RING_FORCE_TO_NONPRIV_DENY) +#define RING_MAX_NONPRIV_SLOTS 12 + +#define RING_EXECLIST_SQ_CONTENTS_LO(base) XE_REG((base) + 0x510) +#define RING_EXECLIST_SQ_CONTENTS_HI(base) XE_REG((base) + 0x510 + 4) + +#define RING_EXECLIST_CONTROL(base) XE_REG((base) + 0x550) +#define EL_CTRL_LOAD REG_BIT(0) + +#define CS_CHICKEN1(base) XE_REG((base) + 0x580, XE_REG_OPTION_MASKED) +#define PREEMPT_GPGPU_LEVEL(hi, lo) (((hi) << 2) | ((lo) << 1)) +#define PREEMPT_GPGPU_MID_THREAD_LEVEL PREEMPT_GPGPU_LEVEL(0, 0) +#define 
PREEMPT_GPGPU_THREAD_GROUP_LEVEL PREEMPT_GPGPU_LEVEL(0, 1) +#define PREEMPT_GPGPU_COMMAND_LEVEL PREEMPT_GPGPU_LEVEL(1, 0) +#define PREEMPT_GPGPU_LEVEL_MASK PREEMPT_GPGPU_LEVEL(1, 1) +#define PREEMPT_3D_OBJECT_LEVEL REG_BIT(0) + +#define VDBOX_CGCTL3F08(base) XE_REG((base) + 0x3f08) +#define CG3DDISHRS_CLKGATE_DIS REG_BIT(5) + +#define VDBOX_CGCTL3F10(base) XE_REG((base) + 0x3f10) +#define IECPUNIT_CLKGATE_DIS REG_BIT(22) + +#define VDBOX_CGCTL3F18(base) XE_REG((base) + 0x3f18) +#define ALNUNIT_CLKGATE_DIS REG_BIT(13) + +#define VDBOX_CGCTL3F1C(base) XE_REG((base) + 0x3f1c) +#define MFXPIPE_CLKGATE_DIS REG_BIT(3) + +#endif diff --git a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h new file mode 100644 index 000000000000..a255946b6f77 --- /dev/null +++ b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GPU_COMMANDS_H_ +#define _XE_GPU_COMMANDS_H_ + +#include "regs/xe_reg_defs.h" + +#define XY_CTRL_SURF_COPY_BLT ((2 << 29) | (0x48 << 22) | 3) +#define SRC_ACCESS_TYPE_SHIFT 21 +#define DST_ACCESS_TYPE_SHIFT 20 +#define CCS_SIZE_MASK GENMASK(17, 8) +#define XE2_CCS_SIZE_MASK GENMASK(18, 9) +#define XY_CTRL_SURF_MOCS_MASK GENMASK(31, 26) +#define XE2_XY_CTRL_SURF_MOCS_INDEX_MASK GENMASK(31, 28) +#define NUM_CCS_BYTES_PER_BLOCK 256 +#define NUM_BYTES_PER_CCS_BYTE(_xe) (GRAPHICS_VER(_xe) >= 20 ? 512 : 256) + +#define XY_FAST_COLOR_BLT_CMD (2 << 29 | 0x44 << 22) +#define XY_FAST_COLOR_BLT_DEPTH_32 (2 << 19) +#define XY_FAST_COLOR_BLT_DW 16 +#define XY_FAST_COLOR_BLT_MOCS_MASK GENMASK(27, 22) +#define XE2_XY_FAST_COLOR_BLT_MOCS_INDEX_MASK GENMASK(27, 24) +#define XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT 31 + +#define XY_FAST_COPY_BLT_CMD (2 << 29 | 0x42 << 22) +#define XY_FAST_COPY_BLT_DEPTH_32 (3<<24) +#define XY_FAST_COPY_BLT_D1_SRC_TILE4 REG_BIT(31) +#define XY_FAST_COPY_BLT_D1_DST_TILE4 REG_BIT(30) +#define XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK GENMASK(23, 20) + +#define PVC_MEM_SET_CMD (2 << 29 | 0x5b << 22) +#define PVC_MEM_SET_CMD_LEN_DW 7 +#define PVC_MEM_SET_MATRIX REG_BIT(17) +#define PVC_MEM_SET_DATA_FIELD GENMASK(31, 24) +/* Bspec lists field as [6:0], but index alone is from [6:1] */ +#define PVC_MEM_SET_MOCS_INDEX_MASK GENMASK(6, 1) +#define XE2_MEM_SET_MOCS_INDEX_MASK GENMASK(6, 3) + +#define GFX_OP_PIPE_CONTROL(len) ((0x3<<29)|(0x3<<27)|(0x2<<24)|((len)-2)) + +#define PIPE_CONTROL0_HDC_PIPELINE_FLUSH BIT(9) /* gen12 */ + +#define PIPE_CONTROL_COMMAND_CACHE_INVALIDATE (1<<29) +#define PIPE_CONTROL_TILE_CACHE_FLUSH (1<<28) +#define PIPE_CONTROL_AMFS_FLUSH (1<<25) +#define PIPE_CONTROL_GLOBAL_GTT_IVB (1<<24) +#define PIPE_CONTROL_LRI_POST_SYNC BIT(23) +#define PIPE_CONTROL_STORE_DATA_INDEX (1<<21) +#define PIPE_CONTROL_CS_STALL (1<<20) +#define PIPE_CONTROL_GLOBAL_SNAPSHOT_RESET (1<<19) +#define PIPE_CONTROL_TLB_INVALIDATE BIT(18) +#define PIPE_CONTROL_PSD_SYNC (1<<17) +#define PIPE_CONTROL_QW_WRITE (1<<14) +#define PIPE_CONTROL_DEPTH_STALL (1<<13) +#define PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH (1<<12) +#define PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE (1<<11) +#define PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE (1<<10) +#define PIPE_CONTROL_INDIRECT_STATE_DISABLE (1<<9) +#define PIPE_CONTROL_FLUSH_ENABLE (1<<7) +#define PIPE_CONTROL_DC_FLUSH_ENABLE (1<<5) +#define PIPE_CONTROL_VF_CACHE_INVALIDATE (1<<4) +#define PIPE_CONTROL_CONST_CACHE_INVALIDATE (1<<3) +#define PIPE_CONTROL_STATE_CACHE_INVALIDATE (1<<2) +#define PIPE_CONTROL_STALL_AT_SCOREBOARD (1<<1) 
+#define PIPE_CONTROL_DEPTH_CACHE_FLUSH (1<<0) + +#endif diff --git a/drivers/gpu/drm/xe/regs/xe_gsc_regs.h b/drivers/gpu/drm/xe/regs/xe_gsc_regs.h new file mode 100644 index 000000000000..9886ec9cb08e --- /dev/null +++ b/drivers/gpu/drm/xe/regs/xe_gsc_regs.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GSC_REGS_H_ +#define _XE_GSC_REGS_H_ + +#include <linux/compiler.h> +#include <linux/types.h> + +#include "regs/xe_reg_defs.h" + +/* Definitions of GSC H/W registers, bits, etc */ + +#define MTL_GSC_HECI1_BASE 0x00116000 +#define MTL_GSC_HECI2_BASE 0x00117000 + +#define HECI_H_CSR(base) XE_REG((base) + 0x4) +#define HECI_H_CSR_IE REG_BIT(0) +#define HECI_H_CSR_IS REG_BIT(1) +#define HECI_H_CSR_IG REG_BIT(2) +#define HECI_H_CSR_RDY REG_BIT(3) +#define HECI_H_CSR_RST REG_BIT(4) + +/* + * The FWSTS register values are FW defined and can be different between + * HECI1 and HECI2 + */ +#define HECI_FWSTS1(base) XE_REG((base) + 0xc40) +#define HECI1_FWSTS1_CURRENT_STATE REG_GENMASK(3, 0) +#define HECI1_FWSTS1_CURRENT_STATE_RESET 0 +#define HECI1_FWSTS1_PROXY_STATE_NORMAL 5 +#define HECI1_FWSTS1_INIT_COMPLETE REG_BIT(9) +#define HECI_FWSTS5(base) XE_REG((base) + 0xc68) +#define HECI1_FWSTS5_HUC_AUTH_DONE REG_BIT(19) + +#define HECI_H_GS1(base) XE_REG((base) + 0xc4c) +#define HECI_H_GS1_ER_PREP REG_BIT(0) + +#endif diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h new file mode 100644 index 000000000000..1dd361046b5d --- /dev/null +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h @@ -0,0 +1,478 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GT_REGS_H_ +#define _XE_GT_REGS_H_ + +#include "regs/xe_reg_defs.h" + +/* + * The GSI register range [0x0 - 0x40000) is replicated at a higher offset + * for the media GT. xe_mmio and xe_gt_mcr functions will automatically + * translate offsets by MEDIA_GT_GSI_OFFSET when operating on the media GT. 
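+ *
+ * E.g. a register defined at 0xd00 below is accessed at 0xd00 on the
+ * primary GT but at 0x380d00 on the media GT. Conceptually the
+ * accessors do something like (simplified sketch, not the exact
+ * implementation):
+ *
+ *	if (xe_gt_is_media_type(gt) && reg.addr < MEDIA_GT_GSI_LENGTH)
+ *		reg.addr += MEDIA_GT_GSI_OFFSET;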
+ */ +#define MEDIA_GT_GSI_OFFSET 0x380000 +#define MEDIA_GT_GSI_LENGTH 0x40000 + +/* MTL workpoint reg to get core C state and actual freq of 3D, SAMedia */ +#define MTL_MIRROR_TARGET_WP1 XE_REG(0xc60) +#define MTL_CAGF_MASK REG_GENMASK(8, 0) +#define MTL_CC_MASK REG_GENMASK(12, 9) + +/* RPM unit config (Gen8+) */ +#define RPM_CONFIG0 XE_REG(0xd00) +#define RPM_CONFIG0_CRYSTAL_CLOCK_FREQ_MASK REG_GENMASK(5, 3) +#define RPM_CONFIG0_CRYSTAL_CLOCK_FREQ_24_MHZ 0 +#define RPM_CONFIG0_CRYSTAL_CLOCK_FREQ_19_2_MHZ 1 +#define RPM_CONFIG0_CRYSTAL_CLOCK_FREQ_38_4_MHZ 2 +#define RPM_CONFIG0_CRYSTAL_CLOCK_FREQ_25_MHZ 3 +#define RPM_CONFIG0_CTC_SHIFT_PARAMETER_MASK REG_GENMASK(2, 1) + +#define FORCEWAKE_ACK_MEDIA_VDBOX(n) XE_REG(0xd50 + (n) * 4) +#define FORCEWAKE_ACK_MEDIA_VEBOX(n) XE_REG(0xd70 + (n) * 4) +#define FORCEWAKE_ACK_RENDER XE_REG(0xd84) + +#define GMD_ID XE_REG(0xd8c) +#define GMD_ID_ARCH_MASK REG_GENMASK(31, 22) +#define GMD_ID_RELEASE_MASK REG_GENMASK(21, 14) +#define GMD_ID_REVID REG_GENMASK(5, 0) + +#define FORCEWAKE_ACK_GSC XE_REG(0xdf8) +#define FORCEWAKE_ACK_GT_MTL XE_REG(0xdfc) + +#define MCFG_MCR_SELECTOR XE_REG(0xfd0) +#define MTL_MCR_SELECTOR XE_REG(0xfd4) +#define SF_MCR_SELECTOR XE_REG(0xfd8) +#define MCR_SELECTOR XE_REG(0xfdc) +#define GAM_MCR_SELECTOR XE_REG(0xfe0) +#define MCR_MULTICAST REG_BIT(31) +#define MCR_SLICE_MASK REG_GENMASK(30, 27) +#define MCR_SLICE(slice) REG_FIELD_PREP(MCR_SLICE_MASK, slice) +#define MCR_SUBSLICE_MASK REG_GENMASK(26, 24) +#define MCR_SUBSLICE(subslice) REG_FIELD_PREP(MCR_SUBSLICE_MASK, subslice) +#define MTL_MCR_GROUPID REG_GENMASK(11, 8) +#define MTL_MCR_INSTANCEID REG_GENMASK(3, 0) + +#define PS_INVOCATION_COUNT XE_REG(0x2348) + +#define XELP_GLOBAL_MOCS(i) XE_REG(0x4000 + (i) * 4) +#define XEHP_GLOBAL_MOCS(i) XE_REG_MCR(0x4000 + (i) * 4) +#define CCS_AUX_INV XE_REG(0x4208) + +#define VD0_AUX_INV XE_REG(0x4218) +#define VE0_AUX_INV XE_REG(0x4238) + +#define VE1_AUX_INV XE_REG(0x42b8) +#define AUX_INV REG_BIT(0) + +#define XEHP_TILE_ADDR_RANGE(_idx) XE_REG_MCR(0x4900 + (_idx) * 4) +#define XEHP_FLAT_CCS_BASE_ADDR XE_REG_MCR(0x4910) + +#define WM_CHICKEN3 XE_REG_MCR(0x5588, XE_REG_OPTION_MASKED) +#define HIZ_PLANE_COMPRESSION_DIS REG_BIT(10) + +#define CHICKEN_RASTER_2 XE_REG_MCR(0x6208, XE_REG_OPTION_MASKED) +#define TBIMR_FAST_CLIP REG_BIT(5) + +#define FF_MODE XE_REG_MCR(0x6210) +#define DIS_TE_AUTOSTRIP REG_BIT(31) +#define DIS_MESH_PARTIAL_AUTOSTRIP REG_BIT(16) +#define DIS_MESH_AUTOSTRIP REG_BIT(15) + +#define VFLSKPD XE_REG_MCR(0x62a8, XE_REG_OPTION_MASKED) +#define DIS_PARTIAL_AUTOSTRIP REG_BIT(9) +#define DIS_AUTOSTRIP REG_BIT(6) +#define DIS_OVER_FETCH_CACHE REG_BIT(1) +#define DIS_MULT_MISS_RD_SQUASH REG_BIT(0) + +#define FF_MODE2 XE_REG(0x6604) +#define XEHP_FF_MODE2 XE_REG_MCR(0x6604) +#define FF_MODE2_GS_TIMER_MASK REG_GENMASK(31, 24) +#define FF_MODE2_GS_TIMER_224 REG_FIELD_PREP(FF_MODE2_GS_TIMER_MASK, 224) +#define FF_MODE2_TDS_TIMER_MASK REG_GENMASK(23, 16) +#define FF_MODE2_TDS_TIMER_128 REG_FIELD_PREP(FF_MODE2_TDS_TIMER_MASK, 4) + +#define CACHE_MODE_1 XE_REG(0x7004, XE_REG_OPTION_MASKED) +#define MSAA_OPTIMIZATION_REDUC_DISABLE REG_BIT(11) + +#define COMMON_SLICE_CHICKEN1 XE_REG(0x7010) + +#define HIZ_CHICKEN XE_REG(0x7018, XE_REG_OPTION_MASKED) +#define DG1_HZ_READ_SUPPRESSION_OPTIMIZATION_DISABLE REG_BIT(14) +#define HZ_DEPTH_TEST_LE_GE_OPT_DISABLE REG_BIT(13) + +#define XEHP_PSS_MODE2 XE_REG_MCR(0x703c, XE_REG_OPTION_MASKED) +#define SCOREBOARD_STALL_FLUSH_CONTROL REG_BIT(5) + +#define XEHP_PSS_CHICKEN XE_REG_MCR(0x7044, 
XE_REG_OPTION_MASKED) +#define FLSH_IGNORES_PSD REG_BIT(10) +#define FD_END_COLLECT REG_BIT(5) + +#define COMMON_SLICE_CHICKEN4 XE_REG(0x7300, XE_REG_OPTION_MASKED) +#define DISABLE_TDC_LOAD_BALANCING_CALC REG_BIT(6) + +#define COMMON_SLICE_CHICKEN3 XE_REG(0x7304, XE_REG_OPTION_MASKED) +#define XEHP_COMMON_SLICE_CHICKEN3 XE_REG_MCR(0x7304, XE_REG_OPTION_MASKED) +#define DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN REG_BIT(12) +#define XEHP_DUAL_SIMD8_SEQ_MERGE_DISABLE REG_BIT(12) +#define BLEND_EMB_FIX_DISABLE_IN_RCC REG_BIT(11) +#define DISABLE_CPS_AWARE_COLOR_PIPE REG_BIT(9) + +#define XEHP_SLICE_COMMON_ECO_CHICKEN1 XE_REG_MCR(0x731c, XE_REG_OPTION_MASKED) +#define MSC_MSAA_REODER_BUF_BYPASS_DISABLE REG_BIT(14) + +#define VF_PREEMPTION XE_REG(0x83a4, XE_REG_OPTION_MASKED) +#define PREEMPTION_VERTEX_COUNT REG_GENMASK(15, 0) + +#define VF_SCRATCHPAD XE_REG(0x83a8, XE_REG_OPTION_MASKED) +#define XE2_VFG_TED_CREDIT_INTERFACE_DISABLE REG_BIT(13) + +#define VFG_PREEMPTION_CHICKEN XE_REG(0x83b4, XE_REG_OPTION_MASKED) +#define POLYGON_TRIFAN_LINELOOP_DISABLE REG_BIT(4) + +#define SQCNT1 XE_REG_MCR(0x8718) +#define XELPMP_SQCNT1 XE_REG(0x8718) +#define ENFORCE_RAR REG_BIT(23) + +#define XEHP_SQCM XE_REG_MCR(0x8724) +#define EN_32B_ACCESS REG_BIT(30) + +#define XE2_FLAT_CCS_BASE_RANGE_LOWER XE_REG_MCR(0x8800) +#define XE2_FLAT_CCS_ENABLE REG_BIT(0) + +#define GSCPSMI_BASE XE_REG(0x880c) + +/* Fuse readout registers for GT */ +#define XEHP_FUSE4 XE_REG(0x9114) +#define CCS_EN_MASK REG_GENMASK(19, 16) +#define GT_L3_EXC_MASK REG_GENMASK(6, 4) + +#define MIRROR_FUSE3 XE_REG(0x9118) +#define XE2_NODE_ENABLE_MASK REG_GENMASK(31, 16) +#define L3BANK_PAIR_COUNT 4 +#define L3BANK_MASK REG_GENMASK(3, 0) +/* on Xe_HP the same fuses indicates mslices instead of L3 banks */ +#define MAX_MSLICES 4 +#define MEML3_EN_MASK REG_GENMASK(3, 0) + +#define XELP_EU_ENABLE XE_REG(0x9134) /* "_DISABLE" on Xe_LP */ +#define XELP_EU_MASK REG_GENMASK(7, 0) +#define XELP_GT_GEOMETRY_DSS_ENABLE XE_REG(0x913c) + +#define GT_VEBOX_VDBOX_DISABLE XE_REG(0x9140) +#define GT_VEBOX_DISABLE_MASK REG_GENMASK(19, 16) +#define GT_VDBOX_DISABLE_MASK REG_GENMASK(7, 0) + +#define XEHP_GT_COMPUTE_DSS_ENABLE XE_REG(0x9144) +#define XEHPC_GT_COMPUTE_DSS_ENABLE_EXT XE_REG(0x9148) +#define XE2_GT_COMPUTE_DSS_2 XE_REG(0x914c) +#define XE2_GT_GEOMETRY_DSS_1 XE_REG(0x9150) +#define XE2_GT_GEOMETRY_DSS_2 XE_REG(0x9154) + +#define GDRST XE_REG(0x941c) +#define GRDOM_GUC REG_BIT(3) +#define GRDOM_FULL REG_BIT(0) + +#define MISCCPCTL XE_REG(0x9424) +#define DOP_CLOCK_GATE_RENDER_ENABLE REG_BIT(1) + +#define UNSLCGCTL9430 XE_REG(0x9430) +#define MSQDUNIT_CLKGATE_DIS REG_BIT(3) + +#define UNSLICE_UNIT_LEVEL_CLKGATE XE_REG(0x9434) +#define VFUNIT_CLKGATE_DIS REG_BIT(20) +#define TSGUNIT_CLKGATE_DIS REG_BIT(17) /* XEHPSDV */ +#define CG3DDISCFEG_CLKGATE_DIS REG_BIT(17) /* DG2 */ +#define GAMEDIA_CLKGATE_DIS REG_BIT(11) +#define HSUNIT_CLKGATE_DIS REG_BIT(8) +#define VSUNIT_CLKGATE_DIS REG_BIT(3) + +#define UNSLCGCTL9440 XE_REG(0x9440) +#define GAMTLBOACS_CLKGATE_DIS REG_BIT(28) +#define GAMTLBVDBOX5_CLKGATE_DIS REG_BIT(27) +#define GAMTLBVDBOX6_CLKGATE_DIS REG_BIT(26) +#define GAMTLBVDBOX3_CLKGATE_DIS REG_BIT(24) +#define GAMTLBVDBOX4_CLKGATE_DIS REG_BIT(23) +#define GAMTLBVDBOX7_CLKGATE_DIS REG_BIT(22) +#define GAMTLBVDBOX2_CLKGATE_DIS REG_BIT(21) +#define GAMTLBVDBOX0_CLKGATE_DIS REG_BIT(17) +#define GAMTLBKCR_CLKGATE_DIS REG_BIT(16) +#define GAMTLBGUC_CLKGATE_DIS REG_BIT(15) +#define GAMTLBBLT_CLKGATE_DIS REG_BIT(14) +#define GAMTLBVDBOX1_CLKGATE_DIS 
REG_BIT(6) + +#define UNSLCGCTL9444 XE_REG(0x9444) +#define GAMTLBGFXA0_CLKGATE_DIS REG_BIT(30) +#define GAMTLBGFXA1_CLKGATE_DIS REG_BIT(29) +#define GAMTLBCOMPA0_CLKGATE_DIS REG_BIT(28) +#define GAMTLBCOMPA1_CLKGATE_DIS REG_BIT(27) +#define GAMTLBCOMPB0_CLKGATE_DIS REG_BIT(26) +#define GAMTLBCOMPB1_CLKGATE_DIS REG_BIT(25) +#define GAMTLBCOMPC0_CLKGATE_DIS REG_BIT(24) +#define GAMTLBCOMPC1_CLKGATE_DIS REG_BIT(23) +#define GAMTLBCOMPD0_CLKGATE_DIS REG_BIT(22) +#define GAMTLBCOMPD1_CLKGATE_DIS REG_BIT(21) +#define GAMTLBMERT_CLKGATE_DIS REG_BIT(20) +#define GAMTLBVEBOX3_CLKGATE_DIS REG_BIT(19) +#define GAMTLBVEBOX2_CLKGATE_DIS REG_BIT(18) +#define GAMTLBVEBOX1_CLKGATE_DIS REG_BIT(17) +#define GAMTLBVEBOX0_CLKGATE_DIS REG_BIT(16) +#define LTCDD_CLKGATE_DIS REG_BIT(10) + +#define XEHP_SLICE_UNIT_LEVEL_CLKGATE XE_REG_MCR(0x94d4) +#define L3_CR2X_CLKGATE_DIS REG_BIT(17) +#define L3_CLKGATE_DIS REG_BIT(16) +#define NODEDSS_CLKGATE_DIS REG_BIT(12) +#define MSCUNIT_CLKGATE_DIS REG_BIT(10) +#define RCCUNIT_CLKGATE_DIS REG_BIT(7) +#define SARBUNIT_CLKGATE_DIS REG_BIT(5) +#define SBEUNIT_CLKGATE_DIS REG_BIT(4) + +#define UNSLICE_UNIT_LEVEL_CLKGATE2 XE_REG(0x94e4) +#define VSUNIT_CLKGATE2_DIS REG_BIT(19) + +#define SUBSLICE_UNIT_LEVEL_CLKGATE XE_REG_MCR(0x9524) +#define DSS_ROUTER_CLKGATE_DIS REG_BIT(28) +#define GWUNIT_CLKGATE_DIS REG_BIT(16) + +#define SUBSLICE_UNIT_LEVEL_CLKGATE2 XE_REG_MCR(0x9528) +#define CPSSUNIT_CLKGATE_DIS REG_BIT(9) + +#define SSMCGCTL9530 XE_REG_MCR(0x9530) +#define RTFUNIT_CLKGATE_DIS REG_BIT(18) + +#define DFR_RATIO_EN_AND_CHICKEN XE_REG_MCR(0x9550) +#define DFR_DISABLE REG_BIT(9) + +#define RPNSWREQ XE_REG(0xa008) +#define REQ_RATIO_MASK REG_GENMASK(31, 23) + +#define RP_CONTROL XE_REG(0xa024) +#define RPSWCTL_MASK REG_GENMASK(10, 9) +#define RPSWCTL_ENABLE REG_FIELD_PREP(RPSWCTL_MASK, 2) +#define RPSWCTL_DISABLE REG_FIELD_PREP(RPSWCTL_MASK, 0) +#define RC_CONTROL XE_REG(0xa090) +#define RC_CTL_HW_ENABLE REG_BIT(31) +#define RC_CTL_TO_MODE REG_BIT(28) +#define RC_CTL_RC6_ENABLE REG_BIT(18) +#define RC_STATE XE_REG(0xa094) +#define RC_IDLE_HYSTERSIS XE_REG(0xa0ac) + +#define PMINTRMSK XE_REG(0xa168) +#define PMINTR_DISABLE_REDIRECT_TO_GUC REG_BIT(31) +#define ARAT_EXPIRED_INTRMSK REG_BIT(9) + +#define FORCEWAKE_GT XE_REG(0xa188) + +#define PG_ENABLE XE_REG(0xa210) + +#define CTC_MODE XE_REG(0xa26c) +#define CTC_SHIFT_PARAMETER_MASK REG_GENMASK(2, 1) +#define CTC_SOURCE_DIVIDE_LOGIC REG_BIT(0) + +#define FORCEWAKE_RENDER XE_REG(0xa278) +#define FORCEWAKE_MEDIA_VDBOX(n) XE_REG(0xa540 + (n) * 4) +#define FORCEWAKE_MEDIA_VEBOX(n) XE_REG(0xa560 + (n) * 4) +#define FORCEWAKE_GSC XE_REG(0xa618) + +#define XEHPC_LNCFMISCCFGREG0 XE_REG_MCR(0xb01c, XE_REG_OPTION_MASKED) +#define XEHPC_OVRLSCCC REG_BIT(0) + +/* L3 Cache Control */ +#define XELP_LNCFCMOCS(i) XE_REG(0xb020 + (i) * 4) +#define XEHP_LNCFCMOCS(i) XE_REG_MCR(0xb020 + (i) * 4) +#define LNCFCMOCS_REG_COUNT 32 + +#define XEHP_L3NODEARBCFG XE_REG_MCR(0xb0b4) +#define XEHP_LNESPARE REG_BIT(19) + +#define XEHP_L3SQCREG5 XE_REG_MCR(0xb158) +#define L3_PWM_TIMER_INIT_VAL_MASK REG_GENMASK(9, 0) + +#define XEHP_L3SCQREG7 XE_REG_MCR(0xb188) +#define BLEND_FILL_CACHING_OPT_DIS REG_BIT(3) + +#define XEHPC_L3CLOS_MASK(i) XE_REG_MCR(0xb194 + (i) * 8) + +#define XE2LPM_L3SQCREG5 XE_REG_MCR(0xb658) + +#define XEHP_MERT_MOD_CTRL XE_REG_MCR(0xcf28) +#define RENDER_MOD_CTRL XE_REG_MCR(0xcf2c) +#define COMP_MOD_CTRL XE_REG_MCR(0xcf30) +#define XEHP_VDBX_MOD_CTRL XE_REG_MCR(0xcf34) +#define XELPMP_VDBX_MOD_CTRL XE_REG(0xcf34) +#define 
XEHP_VEBX_MOD_CTRL XE_REG_MCR(0xcf38) +#define XELPMP_VEBX_MOD_CTRL XE_REG(0xcf38) +#define FORCE_MISS_FTLB REG_BIT(3) + +#define XEHP_GAMSTLB_CTRL XE_REG_MCR(0xcf4c) +#define CONTROL_BLOCK_CLKGATE_DIS REG_BIT(12) +#define EGRESS_BLOCK_CLKGATE_DIS REG_BIT(11) +#define TAG_BLOCK_CLKGATE_DIS REG_BIT(7) + +#define XEHP_GAMCNTRL_CTRL XE_REG_MCR(0xcf54) +#define INVALIDATION_BROADCAST_MODE_DIS REG_BIT(12) +#define GLOBAL_INVALIDATION_MODE REG_BIT(2) + +#define HALF_SLICE_CHICKEN5 XE_REG_MCR(0xe188, XE_REG_OPTION_MASKED) +#define DISABLE_SAMPLE_G_PERFORMANCE REG_BIT(0) + +#define SAMPLER_MODE XE_REG_MCR(0xe18c, XE_REG_OPTION_MASKED) +#define ENABLE_SMALLPL REG_BIT(15) +#define SC_DISABLE_POWER_OPTIMIZATION_EBB REG_BIT(9) +#define SAMPLER_ENABLE_HEADLESS_MSG REG_BIT(5) +#define INDIRECT_STATE_BASE_ADDR_OVERRIDE REG_BIT(0) + +#define HALF_SLICE_CHICKEN7 XE_REG_MCR(0xe194, XE_REG_OPTION_MASKED) +#define DG2_DISABLE_ROUND_ENABLE_ALLOW_FOR_SSLA REG_BIT(15) + +#define CACHE_MODE_SS XE_REG_MCR(0xe420, XE_REG_OPTION_MASKED) +#define DISABLE_ECC REG_BIT(5) +#define ENABLE_PREFETCH_INTO_IC REG_BIT(3) + +#define ROW_CHICKEN4 XE_REG_MCR(0xe48c, XE_REG_OPTION_MASKED) +#define DISABLE_GRF_CLEAR REG_BIT(13) +#define XEHP_DIS_BBL_SYSPIPE REG_BIT(11) +#define DISABLE_TDL_PUSH REG_BIT(9) +#define DIS_PICK_2ND_EU REG_BIT(7) +#define DISABLE_HDR_PAST_PAYLOAD_HOLD_FIX REG_BIT(4) +#define THREAD_EX_ARB_MODE REG_GENMASK(3, 2) +#define THREAD_EX_ARB_MODE_RR_AFTER_DEP REG_FIELD_PREP(THREAD_EX_ARB_MODE, 0x2) + +#define ROW_CHICKEN3 XE_REG_MCR(0xe49c, XE_REG_OPTION_MASKED) +#define DIS_FIX_EOT1_FLUSH REG_BIT(9) + +#define ROW_CHICKEN XE_REG_MCR(0xe4f0, XE_REG_OPTION_MASKED) +#define UGM_BACKUP_MODE REG_BIT(13) +#define MDQ_ARBITRATION_MODE REG_BIT(12) +#define EARLY_EOT_DIS REG_BIT(1) + +#define ROW_CHICKEN2 XE_REG_MCR(0xe4f4, XE_REG_OPTION_MASKED) +#define DISABLE_READ_SUPPRESSION REG_BIT(15) +#define DISABLE_EARLY_READ REG_BIT(14) +#define ENABLE_LARGE_GRF_MODE REG_BIT(12) +#define PUSH_CONST_DEREF_HOLD_DIS REG_BIT(8) +#define DISABLE_DOP_GATING REG_BIT(0) + +#define RT_CTRL XE_REG_MCR(0xe530) +#define DIS_NULL_QUERY REG_BIT(10) + +#define XEHP_HDC_CHICKEN0 XE_REG_MCR(0xe5f0, XE_REG_OPTION_MASKED) +#define LSC_L1_FLUSH_CTL_3D_DATAPORT_FLUSH_EVENTS_MASK REG_GENMASK(13, 11) +#define DIS_ATOMIC_CHAINING_TYPED_WRITES REG_BIT(3) + +#define LSC_CHICKEN_BIT_0 XE_REG_MCR(0xe7c8) +#define DISABLE_D8_D16_COASLESCE REG_BIT(30) +#define TGM_WRITE_EOM_FORCE REG_BIT(17) +#define FORCE_1_SUB_MESSAGE_PER_FRAGMENT REG_BIT(15) +#define SEQUENTIAL_ACCESS_UPGRADE_DISABLE REG_BIT(13) + +#define LSC_CHICKEN_BIT_0_UDW XE_REG_MCR(0xe7c8 + 4) +#define UGM_FRAGMENT_THRESHOLD_TO_3 REG_BIT(58 - 32) +#define DIS_CHAIN_2XSIMD8 REG_BIT(55 - 32) +#define XE2_ALLOC_DPA_STARVE_FIX_DIS REG_BIT(47 - 32) +#define ENABLE_SMP_LD_RENDER_SURFACE_CONTROL REG_BIT(44 - 32) +#define FORCE_SLM_FENCE_SCOPE_TO_TILE REG_BIT(42 - 32) +#define FORCE_UGM_FENCE_SCOPE_TO_TILE REG_BIT(41 - 32) +#define MAXREQS_PER_BANK REG_GENMASK(39 - 32, 37 - 32) +#define DISABLE_128B_EVICTION_COMMAND_UDW REG_BIT(36 - 32) + +#define SARB_CHICKEN1 XE_REG_MCR(0xe90c) +#define COMP_CKN_IN REG_GENMASK(30, 29) + +#define RCU_MODE XE_REG(0x14800, XE_REG_OPTION_MASKED) +#define RCU_MODE_FIXED_SLICE_CCS_MODE REG_BIT(1) +#define RCU_MODE_CCS_ENABLE REG_BIT(0) + +/* + * Total of 4 cslices, where each cslice is in the form: + * [0-3] CCS ID + * [4-6] RSVD + * [7] Disabled + */ +#define CCS_MODE XE_REG(0x14804) +#define CCS_MODE_CSLICE_0_3_MASK REG_GENMASK(11, 0) /* 3 bits per cslice */ +#define 
CCS_MODE_CSLICE_MASK 0x7 /* CCS0-3 + rsvd */ +#define CCS_MODE_CSLICE_WIDTH ilog2(CCS_MODE_CSLICE_MASK + 1) +#define CCS_MODE_CSLICE(cslice, ccs) \ + ((ccs) << ((cslice) * CCS_MODE_CSLICE_WIDTH)) + +#define FORCEWAKE_ACK_GT XE_REG(0x130044) +#define FORCEWAKE_KERNEL BIT(0) +#define FORCEWAKE_USER BIT(1) +#define FORCEWAKE_KERNEL_FALLBACK BIT(15) + +#define MTL_MEDIA_PERF_LIMIT_REASONS XE_REG(0x138030) +#define MTL_MEDIA_MC6 XE_REG(0x138048) + +#define GT_CORE_STATUS XE_REG(0x138060) +#define RCN_MASK REG_GENMASK(2, 0) +#define GT_C0 0 +#define GT_C6 3 + +#define GT_GFX_RC6_LOCKED XE_REG(0x138104) +#define GT_GFX_RC6 XE_REG(0x138108) + +#define GT0_PERF_LIMIT_REASONS XE_REG(0x1381a8) +#define GT0_PERF_LIMIT_REASONS_MASK 0xde3 +#define PROCHOT_MASK REG_BIT(0) +#define THERMAL_LIMIT_MASK REG_BIT(1) +#define RATL_MASK REG_BIT(5) +#define VR_THERMALERT_MASK REG_BIT(6) +#define VR_TDC_MASK REG_BIT(7) +#define POWER_LIMIT_4_MASK REG_BIT(8) +#define POWER_LIMIT_1_MASK REG_BIT(10) +#define POWER_LIMIT_2_MASK REG_BIT(11) + +#define GT_PERF_STATUS XE_REG(0x1381b4) +#define VOLTAGE_MASK REG_GENMASK(10, 0) + +#define GT_INTR_DW(x) XE_REG(0x190018 + ((x) * 4)) + +#define RENDER_COPY_INTR_ENABLE XE_REG(0x190030) +#define VCS_VECS_INTR_ENABLE XE_REG(0x190034) +#define GUC_SG_INTR_ENABLE XE_REG(0x190038) +#define ENGINE1_MASK REG_GENMASK(31, 16) +#define ENGINE0_MASK REG_GENMASK(15, 0) +#define GPM_WGBOXPERF_INTR_ENABLE XE_REG(0x19003c) +#define GUNIT_GSC_INTR_ENABLE XE_REG(0x190044) +#define CCS_RSVD_INTR_ENABLE XE_REG(0x190048) + +#define INTR_IDENTITY_REG(x) XE_REG(0x190060 + ((x) * 4)) +#define INTR_DATA_VALID REG_BIT(31) +#define INTR_ENGINE_INSTANCE(x) REG_FIELD_GET(GENMASK(25, 20), x) +#define INTR_ENGINE_CLASS(x) REG_FIELD_GET(GENMASK(18, 16), x) +#define INTR_ENGINE_INTR(x) REG_FIELD_GET(GENMASK(15, 0), x) +#define OTHER_GUC_INSTANCE 0 +#define OTHER_GSC_INSTANCE 6 + +#define IIR_REG_SELECTOR(x) XE_REG(0x190070 + ((x) * 4)) +#define RCS0_RSVD_INTR_MASK XE_REG(0x190090) +#define BCS_RSVD_INTR_MASK XE_REG(0x1900a0) +#define VCS0_VCS1_INTR_MASK XE_REG(0x1900a8) +#define VCS2_VCS3_INTR_MASK XE_REG(0x1900ac) +#define VECS0_VECS1_INTR_MASK XE_REG(0x1900d0) +#define GUC_SG_INTR_MASK XE_REG(0x1900e8) +#define GPM_WGBOXPERF_INTR_MASK XE_REG(0x1900ec) +#define GUNIT_GSC_INTR_MASK XE_REG(0x1900f4) +#define CCS0_CCS1_INTR_MASK XE_REG(0x190100) +#define CCS2_CCS3_INTR_MASK XE_REG(0x190104) +#define XEHPC_BCS1_BCS2_INTR_MASK XE_REG(0x190110) +#define XEHPC_BCS3_BCS4_INTR_MASK XE_REG(0x190114) +#define XEHPC_BCS5_BCS6_INTR_MASK XE_REG(0x190118) +#define XEHPC_BCS7_BCS8_INTR_MASK XE_REG(0x19011c) +#define GT_WAIT_SEMAPHORE_INTERRUPT REG_BIT(11) +#define GT_CONTEXT_SWITCH_INTERRUPT REG_BIT(8) +#define GT_RENDER_PIPECTL_NOTIFY_INTERRUPT REG_BIT(4) +#define GT_CS_MASTER_ERROR_INTERRUPT REG_BIT(3) +#define GT_RENDER_USER_INTERRUPT REG_BIT(0) + +#define PVC_GT0_PACKAGE_ENERGY_STATUS XE_REG(0x281004) +#define PVC_GT0_PACKAGE_RAPL_LIMIT XE_REG(0x281008) +#define PVC_GT0_PACKAGE_POWER_SKU_UNIT XE_REG(0x281068) +#define PVC_GT0_PLATFORM_ENERGY_STATUS XE_REG(0x28106c) +#define PVC_GT0_PACKAGE_POWER_SKU XE_REG(0x281080) + +#endif diff --git a/drivers/gpu/drm/xe/regs/xe_guc_regs.h b/drivers/gpu/drm/xe/regs/xe_guc_regs.h new file mode 100644 index 000000000000..92320bbc9d3d --- /dev/null +++ b/drivers/gpu/drm/xe/regs/xe_guc_regs.h @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GUC_REGS_H_ +#define _XE_GUC_REGS_H_ + +#include <linux/compiler.h> +#include 
<linux/types.h> + +#include "regs/xe_reg_defs.h" + +/* Definitions of GuC H/W registers, bits, etc */ + +#define DIST_DBS_POPULATED XE_REG(0xd08) +#define DOORBELLS_PER_SQIDI_MASK REG_GENMASK(23, 16) +#define SQIDIS_DOORBELL_EXIST_MASK REG_GENMASK(15, 0) + +#define DRBREGL(x) XE_REG(0x1000 + (x) * 8) +#define DRB_VALID REG_BIT(0) +#define DRBREGU(x) XE_REG(0x1000 + (x) * 8 + 4) + +#define GTCR XE_REG(0x4274) +#define GTCR_INVALIDATE REG_BIT(0) + +#define GUC_ARAT_C6DIS XE_REG(0xa178) + +#define GUC_STATUS XE_REG(0xc000) +#define GS_AUTH_STATUS_MASK REG_GENMASK(31, 30) +#define GS_AUTH_STATUS_BAD REG_FIELD_PREP(GS_AUTH_STATUS_MASK, 0x1) +#define GS_AUTH_STATUS_GOOD REG_FIELD_PREP(GS_AUTH_STATUS_MASK, 0x2) +#define GS_MIA_MASK REG_GENMASK(18, 16) +#define GS_MIA_CORE_STATE REG_FIELD_PREP(GS_MIA_MASK, 0x1) +#define GS_MIA_HALT_REQUESTED REG_FIELD_PREP(GS_MIA_MASK, 0x2) +#define GS_MIA_ISR_ENTRY REG_FIELD_PREP(GS_MIA_MASK, 0x4) +#define GS_UKERNEL_MASK REG_GENMASK(15, 8) +#define GS_BOOTROM_MASK REG_GENMASK(7, 1) +#define GS_BOOTROM_RSA_FAILED REG_FIELD_PREP(GS_BOOTROM_MASK, 0x50) +#define GS_BOOTROM_JUMP_PASSED REG_FIELD_PREP(GS_BOOTROM_MASK, 0x76) +#define GS_MIA_IN_RESET REG_BIT(0) + +#define GUC_WOPCM_SIZE XE_REG(0xc050) +#define GUC_WOPCM_SIZE_MASK REG_GENMASK(31, 12) +#define GUC_WOPCM_SIZE_LOCKED REG_BIT(0) + +#define GUC_SHIM_CONTROL XE_REG(0xc064) +#define GUC_MOCS_INDEX_MASK REG_GENMASK(27, 24) +#define GUC_SHIM_WC_ENABLE REG_BIT(21) +#define GUC_ENABLE_MIA_CLOCK_GATING REG_BIT(15) +#define GUC_ENABLE_READ_CACHE_FOR_WOPCM_DATA REG_BIT(10) +#define GUC_ENABLE_READ_CACHE_FOR_SRAM_DATA REG_BIT(9) +#define GUC_MSGCH_ENABLE REG_BIT(4) +#define GUC_ENABLE_MIA_CACHING REG_BIT(2) +#define GUC_ENABLE_READ_CACHE_LOGIC REG_BIT(1) +#define GUC_DISABLE_SRAM_INIT_TO_ZEROES REG_BIT(0) + +#define SOFT_SCRATCH(n) XE_REG(0xc180 + (n) * 4) +#define SOFT_SCRATCH_COUNT 16 + +#define HUC_KERNEL_LOAD_INFO XE_REG(0xc1dc) +#define HUC_LOAD_SUCCESSFUL REG_BIT(0) + +#define UOS_RSA_SCRATCH(i) XE_REG(0xc200 + (i) * 4) +#define UOS_RSA_SCRATCH_COUNT 64 + +#define DMA_ADDR_0_LOW XE_REG(0xc300) +#define DMA_ADDR_0_HIGH XE_REG(0xc304) +#define DMA_ADDR_1_LOW XE_REG(0xc308) +#define DMA_ADDR_1_HIGH XE_REG(0xc30c) +#define DMA_ADDR_SPACE_MASK REG_GENMASK(20, 16) +#define DMA_ADDRESS_SPACE_WOPCM REG_FIELD_PREP(DMA_ADDR_SPACE_MASK, 7) +#define DMA_ADDRESS_SPACE_GGTT REG_FIELD_PREP(DMA_ADDR_SPACE_MASK, 8) +#define DMA_COPY_SIZE XE_REG(0xc310) +#define DMA_CTRL XE_REG(0xc314) +#define HUC_UKERNEL REG_BIT(9) +#define UOS_MOVE REG_BIT(4) +#define START_DMA REG_BIT(0) +#define DMA_GUC_WOPCM_OFFSET XE_REG(0xc340) +#define GUC_WOPCM_OFFSET_SHIFT 14 +#define GUC_WOPCM_OFFSET_MASK REG_GENMASK(31, GUC_WOPCM_OFFSET_SHIFT) +#define HUC_LOADING_AGENT_GUC REG_BIT(1) +#define GUC_WOPCM_OFFSET_VALID REG_BIT(0) +#define GUC_MAX_IDLE_COUNT XE_REG(0xc3e4) + +#define GUC_SEND_INTERRUPT XE_REG(0xc4c8) +#define GUC_SEND_TRIGGER REG_BIT(0) + +#define GUC_BCS_RCS_IER XE_REG(0xc550) +#define GUC_VCS2_VCS1_IER XE_REG(0xc554) +#define GUC_WD_VECS_IER XE_REG(0xc558) +#define GUC_PM_P24C_IER XE_REG(0xc55c) + +#define GUC_TLB_INV_CR XE_REG(0xcee8) +#define GUC_TLB_INV_CR_INVALIDATE REG_BIT(0) + +#define HUC_STATUS2 XE_REG(0xd3b0) +#define HUC_FW_VERIFIED REG_BIT(7) + +#define GT_PM_CONFIG XE_REG(0x13816c) +#define GT_DOORBELL_ENABLE REG_BIT(0) + +#define GUC_HOST_INTERRUPT XE_REG(0x1901f0) + +#define VF_SW_FLAG(n) XE_REG(0x190240 + (n) * 4) +#define VF_SW_FLAG_COUNT 4 + +#define MED_GUC_HOST_INTERRUPT XE_REG(0x190304) + +#define MED_VF_SW_FLAG(n) 
XE_REG(0x190310 + (n) * 4) +#define MED_VF_SW_FLAG_COUNT 4 + +/* GuC Interrupt Vector */ +#define GUC_INTR_GUC2HOST REG_BIT(15) +#define GUC_INTR_EXEC_ERROR REG_BIT(14) +#define GUC_INTR_DISPLAY_EVENT REG_BIT(13) +#define GUC_INTR_SEM_SIG REG_BIT(12) +#define GUC_INTR_IOMMU2GUC REG_BIT(11) +#define GUC_INTR_DOORBELL_RANG REG_BIT(10) +#define GUC_INTR_DMA_DONE REG_BIT(9) +#define GUC_INTR_FATAL_ERROR REG_BIT(8) +#define GUC_INTR_NOTIF_ERROR REG_BIT(7) +#define GUC_INTR_SW_INT_6 REG_BIT(6) +#define GUC_INTR_SW_INT_5 REG_BIT(5) +#define GUC_INTR_SW_INT_4 REG_BIT(4) +#define GUC_INTR_SW_INT_3 REG_BIT(3) +#define GUC_INTR_SW_INT_2 REG_BIT(2) +#define GUC_INTR_SW_INT_1 REG_BIT(1) +#define GUC_INTR_SW_INT_0 REG_BIT(0) + +#define GUC_NUM_DOORBELLS 256 + +/* format of the HW-monitored doorbell cacheline */ +struct guc_doorbell_info { + u32 db_status; +#define GUC_DOORBELL_DISABLED 0 +#define GUC_DOORBELL_ENABLED 1 + + u32 cookie; + u32 reserved[14]; +} __packed; + +#endif diff --git a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h new file mode 100644 index 000000000000..4be81abc86ad --- /dev/null +++ b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_LRC_LAYOUT_H_ +#define _XE_LRC_LAYOUT_H_ + +#define CTX_CONTEXT_CONTROL (0x02 + 1) +#define CTX_RING_HEAD (0x04 + 1) +#define CTX_RING_TAIL (0x06 + 1) +#define CTX_RING_START (0x08 + 1) +#define CTX_RING_CTL (0x0a + 1) +#define CTX_PDP0_UDW (0x30 + 1) +#define CTX_PDP0_LDW (0x32 + 1) + +#endif diff --git a/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h b/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h new file mode 100644 index 000000000000..519dd1067a19 --- /dev/null +++ b/drivers/gpu/drm/xe/regs/xe_mchbar_regs.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_MCHBAR_REGS_H_ +#define _XE_MCHBAR_REGS_H_ + +#include "regs/xe_reg_defs.h" + +/* + * MCHBAR mirror. + * + * This mirrors the MCHBAR MMIO space whose location is determined by + * device 0 function 0's pci config register 0x44 or 0x48 and matches it in + * every way. 
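
The CTX_* offsets in xe_lrc_layout.h above index dwords within the logical ring context image. Each value is an even slot plus one, which suggests the image stores register-offset/value pairs so the interesting value sits one dword after its register slot; that reading is inferred from the layout rather than stated in the header. A hypothetical sketch of using one of the offsets against a CPU-visible copy of the register state:

#include <linux/types.h>

#include "regs/xe_lrc_layout.h"

/*
 * Sketch only: @regs is assumed to point at the dword array of the LRC
 * register state; the driver itself goes through its iosys_map helpers
 * rather than a raw pointer.
 */
static void lrc_set_ring_tail_example(u32 *regs, u32 tail)
{
	regs[CTX_RING_TAIL] = tail;	/* dword 0x06 names the register, 0x07 holds the value */
}
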
+ */ + +#define MCHBAR_MIRROR_BASE_SNB 0x140000 + +#define PCU_CR_PACKAGE_POWER_SKU XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x5930) +#define PKG_TDP GENMASK_ULL(14, 0) +#define PKG_MIN_PWR GENMASK_ULL(30, 16) +#define PKG_MAX_PWR GENMASK_ULL(46, 32) +#define PKG_MAX_WIN GENMASK_ULL(54, 48) +#define PKG_MAX_WIN_X GENMASK_ULL(54, 53) +#define PKG_MAX_WIN_Y GENMASK_ULL(52, 48) + + +#define PCU_CR_PACKAGE_POWER_SKU_UNIT XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x5938) +#define PKG_PWR_UNIT REG_GENMASK(3, 0) +#define PKG_ENERGY_UNIT REG_GENMASK(12, 8) +#define PKG_TIME_UNIT REG_GENMASK(19, 16) + +#define PCU_CR_PACKAGE_ENERGY_STATUS XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x593c) + +#define PCU_CR_PACKAGE_RAPL_LIMIT XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x59a0) +#define PKG_PWR_LIM_1 REG_GENMASK(14, 0) +#define PKG_PWR_LIM_1_EN REG_BIT(15) +#define PKG_PWR_LIM_1_TIME REG_GENMASK(23, 17) +#define PKG_PWR_LIM_1_TIME_X REG_GENMASK(23, 22) +#define PKG_PWR_LIM_1_TIME_Y REG_GENMASK(21, 17) + +#endif /* _XE_MCHBAR_REGS_H_ */ diff --git a/drivers/gpu/drm/xe/regs/xe_reg_defs.h b/drivers/gpu/drm/xe/regs/xe_reg_defs.h new file mode 100644 index 000000000000..c50e7650c09a --- /dev/null +++ b/drivers/gpu/drm/xe/regs/xe_reg_defs.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_REG_DEFS_H_ +#define _XE_REG_DEFS_H_ + +#include "compat-i915-headers/i915_reg_defs.h" + +/** + * struct xe_reg - Register definition + * + * Register defintion to be used by the individual register. Although the same + * definition is used for xe_reg and xe_reg_mcr, they use different internal + * APIs for accesses. + */ +struct xe_reg { + union { + struct { + /** @addr: address */ + u32 addr:28; + /** + * @masked: register is "masked", with upper 16bits used + * to identify the bits that are updated on the lower + * bits + */ + u32 masked:1; + /** + * @mcr: register is multicast/replicated in the + * hardware and needs special handling. Any register + * with this set should also use a type of xe_reg_mcr_t. + * It's only here so the few places that deal with MCR + * registers specially (xe_sr.c) and tests using the raw + * value can inspect it. + */ + u32 mcr:1; + /** + * @ext: access MMIO extension space for current register. + */ + u32 ext:1; + }; + /** @raw: Raw value with both address and options */ + u32 raw; + }; +}; + +/** + * struct xe_reg_mcr - MCR register definition + * + * MCR register is the same as a regular register, but uses another type since + * the internal API used for accessing them is different: it's never correct to + * use regular MMIO access. + */ +struct xe_reg_mcr { + /** @__reg: The register */ + struct xe_reg __reg; +}; + + +/** + * XE_REG_OPTION_MASKED - Register is "masked", with upper 16 bits marking the + * written bits on the lower 16 bits. + * + * It only applies to registers explicitly marked in bspec with + * "Access: Masked". Registers with this option can have write operations to + * specific lower bits by setting the corresponding upper bits. Other bits will + * not be affected. This allows register writes without needing a RMW cycle and + * without caching in software the register value. + * + * Example: a write with value 0x00010001 will set bit 0 and all other bits + * retain their previous values. + * + * To be used with XE_REG(). XE_REG_MCR() and XE_REG_INITIALIZER() + */ +#define XE_REG_OPTION_MASKED .masked = 1 + +/** + * XE_REG_INITIALIZER - Initializer for xe_reg_t. + * @r_: Register offset + * @...: Additional options like access mode. 
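
The masked-register description above means a single 32-bit write carries both the new bit values (low half) and a per-bit write enable (high half). Here is a small illustrative helper for composing such a value; it is a sketch of the convention only, not an existing xe API, and @mask is assumed to stay within the lower 16 bits:

#include <linux/bits.h>
#include <linux/types.h>

/*
 * Sketch: build a masked-register write value that updates only the bits
 * in @mask. masked_write_value(BIT(0), BIT(0)) == 0x00010001, matching
 * the example in the comment above (set bit 0, leave the rest untouched).
 */
static inline u32 masked_write_value(u32 mask, u32 value)
{
	return (mask << 16) | (value & mask);
}
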
See struct xe_reg for available + * options. + * + * Register field is mandatory, and additional options may be passed as + * arguments. Usually ``XE_REG()`` should be preferred since it creates an + * object of the right type. However when initializing static const storage, + * where a compound statement is not allowed, this can be used instead. + */ +#define XE_REG_INITIALIZER(r_, ...) { .addr = r_, __VA_ARGS__ } + + +/** + * XE_REG - Create a struct xe_reg from offset and additional flags + * @r_: Register offset + * @...: Additional options like access mode. See struct xe_reg for available + * options. + */ +#define XE_REG(r_, ...) ((const struct xe_reg)XE_REG_INITIALIZER(r_, ##__VA_ARGS__)) + +/** + * XE_REG_EXT - Create a struct xe_reg from extension offset and additional + * flags + * @r_: Register extension offset + * @...: Additional options like access mode. See struct xe_reg for available + * options. + */ +#define XE_REG_EXT(r_, ...) \ + ((const struct xe_reg)XE_REG_INITIALIZER(r_, ##__VA_ARGS__, .ext = 1)) + +/** + * XE_REG_MCR - Create a struct xe_reg_mcr from offset and additional flags + * @r_: Register offset + * @...: Additional options like access mode. See struct xe_reg for available + * options. + */ +#define XE_REG_MCR(r_, ...) ((const struct xe_reg_mcr){ \ + .__reg = XE_REG_INITIALIZER(r_, ##__VA_ARGS__, .mcr = 1) \ + }) + +#endif diff --git a/drivers/gpu/drm/xe/regs/xe_regs.h b/drivers/gpu/drm/xe/regs/xe_regs.h new file mode 100644 index 000000000000..2c214bb9b671 --- /dev/null +++ b/drivers/gpu/drm/xe/regs/xe_regs.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ +#ifndef _XE_REGS_H_ +#define _XE_REGS_H_ + +#include "regs/xe_reg_defs.h" + +#define TIMESTAMP_OVERRIDE XE_REG(0x44074) +#define TIMESTAMP_OVERRIDE_US_COUNTER_DENOMINATOR_MASK REG_GENMASK(15, 12) +#define TIMESTAMP_OVERRIDE_US_COUNTER_DIVIDER_MASK REG_GENMASK(9, 0) + +#define PCU_IRQ_OFFSET 0x444e0 +#define GU_MISC_IRQ_OFFSET 0x444f0 +#define GU_MISC_GSE REG_BIT(27) + +#define SOFTWARE_FLAGS_SPR33 XE_REG(0x4f084) + +#define GU_CNTL_PROTECTED XE_REG(0x10100C) +#define DRIVERINT_FLR_DIS REG_BIT(31) + +#define GU_CNTL XE_REG(0x101010) +#define LMEM_INIT REG_BIT(7) +#define DRIVERFLR REG_BIT(31) + +#define GU_DEBUG XE_REG(0x101018) +#define DRIVERFLR_STATUS REG_BIT(31) + +#define XEHP_CLOCK_GATE_DIS XE_REG(0x101014) +#define SGSI_SIDECLK_DIS REG_BIT(17) + +#define GGC XE_REG(0x108040) +#define GMS_MASK REG_GENMASK(15, 8) +#define GGMS_MASK REG_GENMASK(7, 6) + +#define DSMBASE XE_REG(0x1080C0) +#define BDSM_MASK REG_GENMASK64(63, 20) + +#define GSMBASE XE_REG(0x108100) + +#define STOLEN_RESERVED XE_REG(0x1082c0) +#define WOPCM_SIZE_MASK REG_GENMASK64(9, 7) + +#define MTL_RP_STATE_CAP XE_REG(0x138000) + +#define MTL_GT_RPE_FREQUENCY XE_REG(0x13800c) + +#define MTL_MEDIAP_STATE_CAP XE_REG(0x138020) +#define MTL_RPN_CAP_MASK REG_GENMASK(24, 16) +#define MTL_RP0_CAP_MASK REG_GENMASK(8, 0) + +#define MTL_MPE_FREQUENCY XE_REG(0x13802c) +#define MTL_RPE_MASK REG_GENMASK(8, 0) + +#define DG1_MSTR_TILE_INTR XE_REG(0x190008) +#define DG1_MSTR_IRQ REG_BIT(31) +#define DG1_MSTR_TILE(t) REG_BIT(t) + +#define GFX_MSTR_IRQ XE_REG(0x190010) +#define MASTER_IRQ REG_BIT(31) +#define GU_MISC_IRQ REG_BIT(29) +#define DISPLAY_IRQ REG_BIT(16) +#define GT_DW_IRQ(x) REG_BIT(x) + +#define PVC_RP_STATE_CAP XE_REG(0x281014) + +#endif diff --git a/drivers/gpu/drm/xe/regs/xe_sriov_regs.h b/drivers/gpu/drm/xe/regs/xe_sriov_regs.h new file mode 100644 index 000000000000..58a4e0fad1e1 --- 
/dev/null +++ b/drivers/gpu/drm/xe/regs/xe_sriov_regs.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _REGS_XE_SRIOV_REGS_H_ +#define _REGS_XE_SRIOV_REGS_H_ + +#include "regs/xe_reg_defs.h" + +#define XE2_LMEM_CFG XE_REG(0x48b0) + +#define LMEM_CFG XE_REG(0xcf58) +#define LMEM_EN REG_BIT(31) +#define LMTT_DIR_PTR REG_GENMASK(30, 0) /* in multiples of 64KB */ + +#endif diff --git a/drivers/gpu/drm/xe/tests/Makefile b/drivers/gpu/drm/xe/tests/Makefile new file mode 100644 index 000000000000..39d8a0892274 --- /dev/null +++ b/drivers/gpu/drm/xe/tests/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_DRM_XE_KUNIT_TEST) += \ + xe_bo_test.o \ + xe_dma_buf_test.o \ + xe_migrate_test.o \ + xe_mocs_test.o \ + xe_pci_test.o \ + xe_rtp_test.o \ + xe_wa_test.o diff --git a/drivers/gpu/drm/xe/tests/xe_bo.c b/drivers/gpu/drm/xe/tests/xe_bo.c new file mode 100644 index 000000000000..412b2e7ce40c --- /dev/null +++ b/drivers/gpu/drm/xe/tests/xe_bo.c @@ -0,0 +1,353 @@ +// SPDX-License-Identifier: GPL-2.0 AND MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include <kunit/test.h> +#include <kunit/visibility.h> + +#include "tests/xe_bo_test.h" +#include "tests/xe_pci_test.h" +#include "tests/xe_test.h" + +#include "xe_bo_evict.h" +#include "xe_pci.h" +#include "xe_pm.h" + +static int ccs_test_migrate(struct xe_tile *tile, struct xe_bo *bo, + bool clear, u64 get_val, u64 assign_val, + struct kunit *test) +{ + struct dma_fence *fence; + struct ttm_tt *ttm; + struct page *page; + pgoff_t ccs_page; + long timeout; + u64 *cpu_map; + int ret; + u32 offset; + + /* Move bo to VRAM if not already there. */ + ret = xe_bo_validate(bo, NULL, false); + if (ret) { + KUNIT_FAIL(test, "Failed to validate bo.\n"); + return ret; + } + + /* Optionally clear bo *and* CCS data in VRAM. */ + if (clear) { + fence = xe_migrate_clear(tile->migrate, bo, bo->ttm.resource); + if (IS_ERR(fence)) { + KUNIT_FAIL(test, "Failed to submit bo clear.\n"); + return PTR_ERR(fence); + } + dma_fence_put(fence); + } + + /* Evict to system. CCS data should be copied. */ + ret = xe_bo_evict(bo, true); + if (ret) { + KUNIT_FAIL(test, "Failed to evict bo.\n"); + return ret; + } + + /* Sync all migration blits */ + timeout = dma_resv_wait_timeout(bo->ttm.base.resv, + DMA_RESV_USAGE_KERNEL, + true, + 5 * HZ); + if (timeout <= 0) { + KUNIT_FAIL(test, "Failed to sync bo eviction.\n"); + return -ETIME; + } + + /* + * Bo with CCS data is now in system memory. Verify backing store + * and data integrity. Then assign for the next testing round while + * we still have a CPU map. + */ + ttm = bo->ttm.ttm; + if (!ttm || !ttm_tt_is_populated(ttm)) { + KUNIT_FAIL(test, "Bo was not in expected placement.\n"); + return -EINVAL; + } + + ccs_page = xe_bo_ccs_pages_start(bo) >> PAGE_SHIFT; + if (ccs_page >= ttm->num_pages) { + KUNIT_FAIL(test, "No TTM CCS pages present.\n"); + return -EINVAL; + } + + page = ttm->pages[ccs_page]; + cpu_map = kmap_local_page(page); + + /* Check first CCS value */ + if (cpu_map[0] != get_val) { + KUNIT_FAIL(test, + "Expected CCS readout 0x%016llx, got 0x%016llx.\n", + (unsigned long long)get_val, + (unsigned long long)cpu_map[0]); + ret = -EINVAL; + } + + /* Check last CCS value, or at least last value in page. 
*/ + offset = xe_device_ccs_bytes(tile_to_xe(tile), bo->size); + offset = min_t(u32, offset, PAGE_SIZE) / sizeof(u64) - 1; + if (cpu_map[offset] != get_val) { + KUNIT_FAIL(test, + "Expected CCS readout 0x%016llx, got 0x%016llx.\n", + (unsigned long long)get_val, + (unsigned long long)cpu_map[offset]); + ret = -EINVAL; + } + + cpu_map[0] = assign_val; + cpu_map[offset] = assign_val; + kunmap_local(cpu_map); + + return ret; +} + +static void ccs_test_run_tile(struct xe_device *xe, struct xe_tile *tile, + struct kunit *test) +{ + struct xe_bo *bo; + + int ret; + + /* TODO: Sanity check */ + unsigned int bo_flags = XE_BO_CREATE_VRAM_IF_DGFX(tile); + + if (IS_DGFX(xe)) + kunit_info(test, "Testing vram id %u\n", tile->id); + else + kunit_info(test, "Testing system memory\n"); + + bo = xe_bo_create_user(xe, NULL, NULL, SZ_1M, DRM_XE_GEM_CPU_CACHING_WC, + ttm_bo_type_device, bo_flags); + + xe_bo_lock(bo, false); + + if (IS_ERR(bo)) { + KUNIT_FAIL(test, "Failed to create bo.\n"); + return; + } + + kunit_info(test, "Verifying that CCS data is cleared on creation.\n"); + ret = ccs_test_migrate(tile, bo, false, 0ULL, 0xdeadbeefdeadbeefULL, + test); + if (ret) + goto out_unlock; + + kunit_info(test, "Verifying that CCS data survives migration.\n"); + ret = ccs_test_migrate(tile, bo, false, 0xdeadbeefdeadbeefULL, + 0xdeadbeefdeadbeefULL, test); + if (ret) + goto out_unlock; + + kunit_info(test, "Verifying that CCS data can be properly cleared.\n"); + ret = ccs_test_migrate(tile, bo, true, 0ULL, 0ULL, test); + +out_unlock: + xe_bo_unlock(bo); + xe_bo_put(bo); +} + +static int ccs_test_run_device(struct xe_device *xe) +{ + struct kunit *test = xe_cur_kunit(); + struct xe_tile *tile; + int id; + + if (!xe_device_has_flat_ccs(xe)) { + kunit_info(test, "Skipping non-flat-ccs device.\n"); + return 0; + } + + xe_device_mem_access_get(xe); + + for_each_tile(tile, xe, id) { + /* For igfx run only for primary tile */ + if (!IS_DGFX(xe) && id > 0) + continue; + ccs_test_run_tile(xe, tile, test); + } + + xe_device_mem_access_put(xe); + + return 0; +} + +void xe_ccs_migrate_kunit(struct kunit *test) +{ + xe_call_for_each_device(ccs_test_run_device); +} +EXPORT_SYMBOL_IF_KUNIT(xe_ccs_migrate_kunit); + +static int evict_test_run_tile(struct xe_device *xe, struct xe_tile *tile, struct kunit *test) +{ + struct xe_bo *bo, *external; + unsigned int bo_flags = XE_BO_CREATE_VRAM_IF_DGFX(tile); + struct xe_vm *vm = xe_migrate_get_vm(xe_device_get_root_tile(xe)->migrate); + struct xe_gt *__gt; + int err, i, id; + + kunit_info(test, "Testing device %s vram id %u\n", + dev_name(xe->drm.dev), tile->id); + + for (i = 0; i < 2; ++i) { + xe_vm_lock(vm, false); + bo = xe_bo_create_user(xe, NULL, vm, 0x10000, + DRM_XE_GEM_CPU_CACHING_WC, + ttm_bo_type_device, + bo_flags); + xe_vm_unlock(vm); + if (IS_ERR(bo)) { + KUNIT_FAIL(test, "bo create err=%pe\n", bo); + break; + } + + external = xe_bo_create_user(xe, NULL, NULL, 0x10000, + DRM_XE_GEM_CPU_CACHING_WC, + ttm_bo_type_device, bo_flags); + if (IS_ERR(external)) { + KUNIT_FAIL(test, "external bo create err=%pe\n", external); + goto cleanup_bo; + } + + xe_bo_lock(external, false); + err = xe_bo_pin_external(external); + xe_bo_unlock(external); + if (err) { + KUNIT_FAIL(test, "external bo pin err=%pe\n", + ERR_PTR(err)); + goto cleanup_external; + } + + err = xe_bo_evict_all(xe); + if (err) { + KUNIT_FAIL(test, "evict err=%pe\n", ERR_PTR(err)); + goto cleanup_all; + } + + for_each_gt(__gt, xe, id) + xe_gt_sanitize(__gt); + err = xe_bo_restore_kernel(xe); + /* + * Snapshotting the CTB 
and copying back a potentially old + * version seems risky, depending on what might have been + * inflight. Also it seems snapshotting the ADS object and + * copying back results in serious breakage. Normally when + * calling xe_bo_restore_kernel() we always fully restart the + * GT, which re-intializes such things. We could potentially + * skip saving and restoring such objects in xe_bo_evict_all() + * however seems quite fragile not to also restart the GT. Try + * to do that here by triggering a GT reset. + */ + for_each_gt(__gt, xe, id) { + xe_gt_reset_async(__gt); + flush_work(&__gt->reset.worker); + } + if (err) { + KUNIT_FAIL(test, "restore kernel err=%pe\n", + ERR_PTR(err)); + goto cleanup_all; + } + + err = xe_bo_restore_user(xe); + if (err) { + KUNIT_FAIL(test, "restore user err=%pe\n", ERR_PTR(err)); + goto cleanup_all; + } + + if (!xe_bo_is_vram(external)) { + KUNIT_FAIL(test, "external bo is not vram\n"); + err = -EPROTO; + goto cleanup_all; + } + + if (xe_bo_is_vram(bo)) { + KUNIT_FAIL(test, "bo is vram\n"); + err = -EPROTO; + goto cleanup_all; + } + + if (i) { + down_read(&vm->lock); + xe_vm_lock(vm, false); + err = xe_bo_validate(bo, bo->vm, false); + xe_vm_unlock(vm); + up_read(&vm->lock); + if (err) { + KUNIT_FAIL(test, "bo valid err=%pe\n", + ERR_PTR(err)); + goto cleanup_all; + } + xe_bo_lock(external, false); + err = xe_bo_validate(external, NULL, false); + xe_bo_unlock(external); + if (err) { + KUNIT_FAIL(test, "external bo valid err=%pe\n", + ERR_PTR(err)); + goto cleanup_all; + } + } + + xe_bo_lock(external, false); + xe_bo_unpin_external(external); + xe_bo_unlock(external); + + xe_bo_put(external); + + xe_bo_lock(bo, false); + __xe_bo_unset_bulk_move(bo); + xe_bo_unlock(bo); + xe_bo_put(bo); + continue; + +cleanup_all: + xe_bo_lock(external, false); + xe_bo_unpin_external(external); + xe_bo_unlock(external); +cleanup_external: + xe_bo_put(external); +cleanup_bo: + xe_bo_lock(bo, false); + __xe_bo_unset_bulk_move(bo); + xe_bo_unlock(bo); + xe_bo_put(bo); + break; + } + + xe_vm_put(vm); + + return 0; +} + +static int evict_test_run_device(struct xe_device *xe) +{ + struct kunit *test = xe_cur_kunit(); + struct xe_tile *tile; + int id; + + if (!IS_DGFX(xe)) { + kunit_info(test, "Skipping non-discrete device %s.\n", + dev_name(xe->drm.dev)); + return 0; + } + + xe_device_mem_access_get(xe); + + for_each_tile(tile, xe, id) + evict_test_run_tile(xe, tile, test); + + xe_device_mem_access_put(xe); + + return 0; +} + +void xe_bo_evict_kunit(struct kunit *test) +{ + xe_call_for_each_device(evict_test_run_device); +} +EXPORT_SYMBOL_IF_KUNIT(xe_bo_evict_kunit); diff --git a/drivers/gpu/drm/xe/tests/xe_bo_test.c b/drivers/gpu/drm/xe/tests/xe_bo_test.c new file mode 100644 index 000000000000..f408f17f2164 --- /dev/null +++ b/drivers/gpu/drm/xe/tests/xe_bo_test.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_bo_test.h" + +#include <kunit/test.h> + +static struct kunit_case xe_bo_tests[] = { + KUNIT_CASE(xe_ccs_migrate_kunit), + KUNIT_CASE(xe_bo_evict_kunit), + {} +}; + +static struct kunit_suite xe_bo_test_suite = { + .name = "xe_bo", + .test_cases = xe_bo_tests, +}; + +kunit_test_suite(xe_bo_test_suite); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("xe_bo kunit test"); +MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING); diff --git a/drivers/gpu/drm/xe/tests/xe_bo_test.h b/drivers/gpu/drm/xe/tests/xe_bo_test.h new file mode 100644 index 000000000000..0113ab45066a --- /dev/null 
+++ b/drivers/gpu/drm/xe/tests/xe_bo_test.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 AND MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_BO_TEST_H_ +#define _XE_BO_TEST_H_ + +struct kunit; + +void xe_ccs_migrate_kunit(struct kunit *test); +void xe_bo_evict_kunit(struct kunit *test); + +#endif diff --git a/drivers/gpu/drm/xe/tests/xe_dma_buf.c b/drivers/gpu/drm/xe/tests/xe_dma_buf.c new file mode 100644 index 000000000000..9f6d571d7fa9 --- /dev/null +++ b/drivers/gpu/drm/xe/tests/xe_dma_buf.c @@ -0,0 +1,278 @@ +// SPDX-License-Identifier: GPL-2.0 AND MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include <drm/xe_drm.h> + +#include <kunit/test.h> +#include <kunit/visibility.h> + +#include "tests/xe_dma_buf_test.h" +#include "tests/xe_pci_test.h" + +#include "xe_pci.h" + +static bool p2p_enabled(struct dma_buf_test_params *params) +{ + return IS_ENABLED(CONFIG_PCI_P2PDMA) && params->attach_ops && + params->attach_ops->allow_peer2peer; +} + +static bool is_dynamic(struct dma_buf_test_params *params) +{ + return IS_ENABLED(CONFIG_DMABUF_MOVE_NOTIFY) && params->attach_ops && + params->attach_ops->move_notify; +} + +static void check_residency(struct kunit *test, struct xe_bo *exported, + struct xe_bo *imported, struct dma_buf *dmabuf) +{ + struct dma_buf_test_params *params = to_dma_buf_test_params(test->priv); + u32 mem_type; + int ret; + + xe_bo_assert_held(exported); + xe_bo_assert_held(imported); + + mem_type = XE_PL_VRAM0; + if (!(params->mem_mask & XE_BO_CREATE_VRAM0_BIT)) + /* No VRAM allowed */ + mem_type = XE_PL_TT; + else if (params->force_different_devices && !p2p_enabled(params)) + /* No P2P */ + mem_type = XE_PL_TT; + else if (params->force_different_devices && !is_dynamic(params) && + (params->mem_mask & XE_BO_CREATE_SYSTEM_BIT)) + /* Pin migrated to TT */ + mem_type = XE_PL_TT; + + if (!xe_bo_is_mem_type(exported, mem_type)) { + KUNIT_FAIL(test, "Exported bo was not in expected memory type.\n"); + return; + } + + if (xe_bo_is_pinned(exported)) + return; + + /* + * Evict exporter. Note that the gem object dma_buf member isn't + * set from xe_gem_prime_export(), and it's needed for the move_notify() + * functionality, so hack that up here. Evicting the exported bo will + * evict also the imported bo through the move_notify() functionality if + * importer is on a different device. If they're on the same device, + * the exporter and the importer should be the same bo. + */ + swap(exported->ttm.base.dma_buf, dmabuf); + ret = xe_bo_evict(exported, true); + swap(exported->ttm.base.dma_buf, dmabuf); + if (ret) { + if (ret != -EINTR && ret != -ERESTARTSYS) + KUNIT_FAIL(test, "Evicting exporter failed with err=%d.\n", + ret); + return; + } + + /* Verify that also importer has been evicted to SYSTEM */ + if (exported != imported && !xe_bo_is_mem_type(imported, XE_PL_SYSTEM)) { + KUNIT_FAIL(test, "Importer wasn't properly evicted.\n"); + return; + } + + /* Re-validate the importer. This should move also exporter in. */ + ret = xe_bo_validate(imported, NULL, false); + if (ret) { + if (ret != -EINTR && ret != -ERESTARTSYS) + KUNIT_FAIL(test, "Validating importer failed with err=%d.\n", + ret); + return; + } + + /* + * If on different devices, the exporter is kept in system if + * possible, saving a migration step as the transfer is just + * likely as fast from system memory. 
+ */ + if (params->mem_mask & XE_BO_CREATE_SYSTEM_BIT) + KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, XE_PL_TT)); + else + KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, mem_type)); + + if (params->force_different_devices) + KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(imported, XE_PL_TT)); + else + KUNIT_EXPECT_TRUE(test, exported == imported); +} + +static void xe_test_dmabuf_import_same_driver(struct xe_device *xe) +{ + struct kunit *test = xe_cur_kunit(); + struct dma_buf_test_params *params = to_dma_buf_test_params(test->priv); + struct drm_gem_object *import; + struct dma_buf *dmabuf; + struct xe_bo *bo; + size_t size; + + /* No VRAM on this device? */ + if (!ttm_manager_type(&xe->ttm, XE_PL_VRAM0) && + (params->mem_mask & XE_BO_CREATE_VRAM0_BIT)) + return; + + size = PAGE_SIZE; + if ((params->mem_mask & XE_BO_CREATE_VRAM0_BIT) && + xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K) + size = SZ_64K; + + kunit_info(test, "running %s\n", __func__); + bo = xe_bo_create_user(xe, NULL, NULL, size, DRM_XE_GEM_CPU_CACHING_WC, + ttm_bo_type_device, XE_BO_CREATE_USER_BIT | params->mem_mask); + if (IS_ERR(bo)) { + KUNIT_FAIL(test, "xe_bo_create() failed with err=%ld\n", + PTR_ERR(bo)); + return; + } + + dmabuf = xe_gem_prime_export(&bo->ttm.base, 0); + if (IS_ERR(dmabuf)) { + KUNIT_FAIL(test, "xe_gem_prime_export() failed with err=%ld\n", + PTR_ERR(dmabuf)); + goto out; + } + + import = xe_gem_prime_import(&xe->drm, dmabuf); + if (!IS_ERR(import)) { + struct xe_bo *import_bo = gem_to_xe_bo(import); + + /* + * Did import succeed when it shouldn't due to lack of p2p support? + */ + if (params->force_different_devices && + !p2p_enabled(params) && + !(params->mem_mask & XE_BO_CREATE_SYSTEM_BIT)) { + KUNIT_FAIL(test, + "xe_gem_prime_import() succeeded when it shouldn't have\n"); + } else { + int err; + + /* Is everything where we expect it to be? */ + xe_bo_lock(import_bo, false); + err = xe_bo_validate(import_bo, NULL, false); + + /* Pinning in VRAM is not allowed. */ + if (!is_dynamic(params) && + params->force_different_devices && + !(params->mem_mask & XE_BO_CREATE_SYSTEM_BIT)) + KUNIT_EXPECT_EQ(test, err, -EINVAL); + /* Otherwise only expect interrupts or success. */ + else if (err && err != -EINTR && err != -ERESTARTSYS) + KUNIT_EXPECT_TRUE(test, !err || err == -EINTR || + err == -ERESTARTSYS); + + if (!err) + check_residency(test, bo, import_bo, dmabuf); + xe_bo_unlock(import_bo); + } + drm_gem_object_put(import); + } else if (PTR_ERR(import) != -EOPNOTSUPP) { + /* Unexpected error code. */ + KUNIT_FAIL(test, + "xe_gem_prime_import failed with the wrong err=%ld\n", + PTR_ERR(import)); + } else if (!params->force_different_devices || + p2p_enabled(params) || + (params->mem_mask & XE_BO_CREATE_SYSTEM_BIT)) { + /* Shouldn't fail if we can reuse same bo, use p2p or use system */ + KUNIT_FAIL(test, "dynamic p2p attachment failed with err=%ld\n", + PTR_ERR(import)); + } + dma_buf_put(dmabuf); +out: + drm_gem_object_put(&bo->ttm.base); +} + +static const struct dma_buf_attach_ops nop2p_attach_ops = { + .allow_peer2peer = false, + .move_notify = xe_dma_buf_move_notify +}; + +/* + * We test the implementation with bos of different residency and with + * importers with different capabilities; some lacking p2p support and some + * lacking dynamic capabilities (attach_ops == NULL). We also fake + * different devices avoiding the import shortcut that just reuses the same + * gem object. 
+ */ +static const struct dma_buf_test_params test_params[] = { + {.mem_mask = XE_BO_CREATE_VRAM0_BIT, + .attach_ops = &xe_dma_buf_attach_ops}, + {.mem_mask = XE_BO_CREATE_VRAM0_BIT, + .attach_ops = &xe_dma_buf_attach_ops, + .force_different_devices = true}, + + {.mem_mask = XE_BO_CREATE_VRAM0_BIT, + .attach_ops = &nop2p_attach_ops}, + {.mem_mask = XE_BO_CREATE_VRAM0_BIT, + .attach_ops = &nop2p_attach_ops, + .force_different_devices = true}, + + {.mem_mask = XE_BO_CREATE_VRAM0_BIT}, + {.mem_mask = XE_BO_CREATE_VRAM0_BIT, + .force_different_devices = true}, + + {.mem_mask = XE_BO_CREATE_SYSTEM_BIT, + .attach_ops = &xe_dma_buf_attach_ops}, + {.mem_mask = XE_BO_CREATE_SYSTEM_BIT, + .attach_ops = &xe_dma_buf_attach_ops, + .force_different_devices = true}, + + {.mem_mask = XE_BO_CREATE_SYSTEM_BIT, + .attach_ops = &nop2p_attach_ops}, + {.mem_mask = XE_BO_CREATE_SYSTEM_BIT, + .attach_ops = &nop2p_attach_ops, + .force_different_devices = true}, + + {.mem_mask = XE_BO_CREATE_SYSTEM_BIT}, + {.mem_mask = XE_BO_CREATE_SYSTEM_BIT, + .force_different_devices = true}, + + {.mem_mask = XE_BO_CREATE_SYSTEM_BIT | XE_BO_CREATE_VRAM0_BIT, + .attach_ops = &xe_dma_buf_attach_ops}, + {.mem_mask = XE_BO_CREATE_SYSTEM_BIT | XE_BO_CREATE_VRAM0_BIT, + .attach_ops = &xe_dma_buf_attach_ops, + .force_different_devices = true}, + + {.mem_mask = XE_BO_CREATE_SYSTEM_BIT | XE_BO_CREATE_VRAM0_BIT, + .attach_ops = &nop2p_attach_ops}, + {.mem_mask = XE_BO_CREATE_SYSTEM_BIT | XE_BO_CREATE_VRAM0_BIT, + .attach_ops = &nop2p_attach_ops, + .force_different_devices = true}, + + {.mem_mask = XE_BO_CREATE_SYSTEM_BIT | XE_BO_CREATE_VRAM0_BIT}, + {.mem_mask = XE_BO_CREATE_SYSTEM_BIT | XE_BO_CREATE_VRAM0_BIT, + .force_different_devices = true}, + + {} +}; + +static int dma_buf_run_device(struct xe_device *xe) +{ + const struct dma_buf_test_params *params; + struct kunit *test = xe_cur_kunit(); + + for (params = test_params; params->mem_mask; ++params) { + struct dma_buf_test_params p = *params; + + p.base.id = XE_TEST_LIVE_DMA_BUF; + test->priv = &p; + xe_test_dmabuf_import_same_driver(xe); + } + + /* A non-zero return would halt iteration over driver devices */ + return 0; +} + +void xe_dma_buf_kunit(struct kunit *test) +{ + xe_call_for_each_device(dma_buf_run_device); +} +EXPORT_SYMBOL_IF_KUNIT(xe_dma_buf_kunit); diff --git a/drivers/gpu/drm/xe/tests/xe_dma_buf_test.c b/drivers/gpu/drm/xe/tests/xe_dma_buf_test.c new file mode 100644 index 000000000000..9f5a9cda8c0f --- /dev/null +++ b/drivers/gpu/drm/xe/tests/xe_dma_buf_test.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_dma_buf_test.h" + +#include <kunit/test.h> + +static struct kunit_case xe_dma_buf_tests[] = { + KUNIT_CASE(xe_dma_buf_kunit), + {} +}; + +static struct kunit_suite xe_dma_buf_test_suite = { + .name = "xe_dma_buf", + .test_cases = xe_dma_buf_tests, +}; + +kunit_test_suite(xe_dma_buf_test_suite); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("xe_dma_buf kunit test"); +MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING); diff --git a/drivers/gpu/drm/xe/tests/xe_dma_buf_test.h b/drivers/gpu/drm/xe/tests/xe_dma_buf_test.h new file mode 100644 index 000000000000..e6b464ddd526 --- /dev/null +++ b/drivers/gpu/drm/xe/tests/xe_dma_buf_test.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 AND MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_DMA_BUF_TEST_H_ +#define _XE_DMA_BUF_TEST_H_ + +struct kunit; + +void xe_dma_buf_kunit(struct kunit *test); + 
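
The xe_dma_buf_test.h declaration above, together with xe_dma_buf_test.c further up, follows the same wiring as xe_bo_test: the test body lives in the xe module and is exported with EXPORT_SYMBOL_IF_KUNIT(), while a thin KUnit module lists the cases. A hedged sketch of how an additional live test would plug into that pattern (the xe_foo names are placeholders, not existing files; a matching xe_foo_test.o entry would also need adding to tests/Makefile):

/* tests/xe_foo_test.h (hypothetical) */
struct kunit;
void xe_foo_kunit(struct kunit *test);

/* tests/xe_foo_test.c (hypothetical, mirroring xe_bo_test.c) */
#include <kunit/test.h>

static struct kunit_case xe_foo_tests[] = {
	KUNIT_CASE(xe_foo_kunit),
	{}
};

static struct kunit_suite xe_foo_test_suite = {
	.name = "xe_foo",
	.test_cases = xe_foo_tests,
};

kunit_test_suite(xe_foo_test_suite);

MODULE_LICENSE("GPL");
MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING);
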
+#endif diff --git a/drivers/gpu/drm/xe/tests/xe_lmtt_test.c b/drivers/gpu/drm/xe/tests/xe_lmtt_test.c new file mode 100644 index 000000000000..1f1557c45ae1 --- /dev/null +++ b/drivers/gpu/drm/xe/tests/xe_lmtt_test.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 AND MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include <kunit/test.h> + +static const struct lmtt_ops_param { + const char *desc; + const struct xe_lmtt_ops *ops; +} lmtt_ops_params[] = { + { "2-level", &lmtt_2l_ops, }, + { "multi-level", &lmtt_ml_ops, }, +}; + +static void lmtt_ops_param_get_desc(const struct lmtt_ops_param *p, char *desc) +{ + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "%s", p->desc); +} + +KUNIT_ARRAY_PARAM(lmtt_ops, lmtt_ops_params, lmtt_ops_param_get_desc); + +static void test_ops(struct kunit *test) +{ + const struct lmtt_ops_param *p = test->param_value; + const struct xe_lmtt_ops *ops = p->ops; + unsigned int n; + + KUNIT_ASSERT_NOT_NULL(test, ops->lmtt_root_pd_level); + KUNIT_ASSERT_NOT_NULL(test, ops->lmtt_pte_num); + KUNIT_ASSERT_NOT_NULL(test, ops->lmtt_pte_size); + KUNIT_ASSERT_NOT_NULL(test, ops->lmtt_pte_shift); + KUNIT_ASSERT_NOT_NULL(test, ops->lmtt_pte_index); + KUNIT_ASSERT_NOT_NULL(test, ops->lmtt_pte_encode); + + KUNIT_EXPECT_NE(test, ops->lmtt_root_pd_level(), 0); + + for (n = 0; n <= ops->lmtt_root_pd_level(); n++) { + KUNIT_EXPECT_NE_MSG(test, ops->lmtt_pte_num(n), 0, + "level=%u", n); + KUNIT_EXPECT_NE_MSG(test, ops->lmtt_pte_size(n), 0, + "level=%u", n); + KUNIT_EXPECT_NE_MSG(test, ops->lmtt_pte_encode(0, n), LMTT_PTE_INVALID, + "level=%u", n); + } + + for (n = 0; n < ops->lmtt_root_pd_level(); n++) { + u64 addr = BIT_ULL(ops->lmtt_pte_shift(n)); + + KUNIT_EXPECT_NE_MSG(test, ops->lmtt_pte_shift(n), 0, + "level=%u", n); + KUNIT_EXPECT_EQ_MSG(test, ops->lmtt_pte_index(addr - 1, n), 0, + "addr=%#llx level=%u", addr, n); + KUNIT_EXPECT_EQ_MSG(test, ops->lmtt_pte_index(addr + 1, n), 1, + "addr=%#llx level=%u", addr, n); + KUNIT_EXPECT_EQ_MSG(test, ops->lmtt_pte_index(addr * 2 - 1, n), 1, + "addr=%#llx level=%u", addr, n); + KUNIT_EXPECT_EQ_MSG(test, ops->lmtt_pte_index(addr * 2, n), 2, + "addr=%#llx level=%u", addr, n); + } +} + +static struct kunit_case lmtt_test_cases[] = { + KUNIT_CASE_PARAM(test_ops, lmtt_ops_gen_params), + {} +}; + +static struct kunit_suite lmtt_suite = { + .name = "lmtt", + .test_cases = lmtt_test_cases, +}; + +kunit_test_suites(&lmtt_suite); diff --git a/drivers/gpu/drm/xe/tests/xe_migrate.c b/drivers/gpu/drm/xe/tests/xe_migrate.c new file mode 100644 index 000000000000..7a32faa2f688 --- /dev/null +++ b/drivers/gpu/drm/xe/tests/xe_migrate.c @@ -0,0 +1,444 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020-2022 Intel Corporation + */ + +#include <kunit/test.h> +#include <kunit/visibility.h> + +#include "tests/xe_migrate_test.h" +#include "tests/xe_pci_test.h" + +#include "xe_pci.h" + +static bool sanity_fence_failed(struct xe_device *xe, struct dma_fence *fence, + const char *str, struct kunit *test) +{ + long ret; + + if (IS_ERR(fence)) { + KUNIT_FAIL(test, "Failed to create fence for %s: %li\n", str, + PTR_ERR(fence)); + return true; + } + if (!fence) + return true; + + ret = dma_fence_wait_timeout(fence, false, 5 * HZ); + if (ret <= 0) { + KUNIT_FAIL(test, "Fence timed out for %s: %li\n", str, ret); + return true; + } + + return false; +} + +static int run_sanity_job(struct xe_migrate *m, struct xe_device *xe, + struct xe_bb *bb, u32 second_idx, const char *str, + struct kunit *test) +{ + u64 batch_base = xe_migrate_batch_base(m, 
xe->info.has_usm); + struct xe_sched_job *job = xe_bb_create_migration_job(m->q, bb, + batch_base, + second_idx); + struct dma_fence *fence; + + if (IS_ERR(job)) { + KUNIT_FAIL(test, "Failed to allocate fake pt: %li\n", + PTR_ERR(job)); + return PTR_ERR(job); + } + + xe_sched_job_arm(job); + fence = dma_fence_get(&job->drm.s_fence->finished); + xe_sched_job_push(job); + + if (sanity_fence_failed(xe, fence, str, test)) + return -ETIMEDOUT; + + dma_fence_put(fence); + kunit_info(test, "%s: Job completed\n", str); + return 0; +} + +static void +sanity_populate_cb(struct xe_migrate_pt_update *pt_update, + struct xe_tile *tile, struct iosys_map *map, void *dst, + u32 qword_ofs, u32 num_qwords, + const struct xe_vm_pgtable_update *update) +{ + struct migrate_test_params *p = + to_migrate_test_params(xe_cur_kunit_priv(XE_TEST_LIVE_MIGRATE)); + int i; + u64 *ptr = dst; + u64 value; + + for (i = 0; i < num_qwords; i++) { + value = (qword_ofs + i - update->ofs) * 0x1111111111111111ULL; + if (map) + xe_map_wr(tile_to_xe(tile), map, (qword_ofs + i) * + sizeof(u64), u64, value); + else + ptr[i] = value; + } + + kunit_info(xe_cur_kunit(), "Used %s.\n", map ? "CPU" : "GPU"); + if (p->force_gpu && map) + KUNIT_FAIL(xe_cur_kunit(), "GPU pagetable update used CPU.\n"); +} + +static const struct xe_migrate_pt_update_ops sanity_ops = { + .populate = sanity_populate_cb, +}; + +#define check(_retval, _expected, str, _test) \ + do { if ((_retval) != (_expected)) { \ + KUNIT_FAIL(_test, "Sanity check failed: " str \ + " expected %llx, got %llx\n", \ + (u64)(_expected), (u64)(_retval)); \ + } } while (0) + +static void test_copy(struct xe_migrate *m, struct xe_bo *bo, + struct kunit *test, u32 region) +{ + struct xe_device *xe = tile_to_xe(m->tile); + u64 retval, expected = 0; + bool big = bo->size >= SZ_2M; + struct dma_fence *fence; + const char *str = big ? "Copying big bo" : "Copying small bo"; + int err; + + struct xe_bo *remote = xe_bo_create_locked(xe, m->tile, NULL, + bo->size, + ttm_bo_type_kernel, + region | + XE_BO_NEEDS_CPU_ACCESS); + if (IS_ERR(remote)) { + KUNIT_FAIL(test, "Failed to allocate remote bo for %s: %li\n", + str, PTR_ERR(remote)); + return; + } + + err = xe_bo_validate(remote, NULL, false); + if (err) { + KUNIT_FAIL(test, "Failed to validate system bo for %s: %li\n", + str, err); + goto out_unlock; + } + + err = xe_bo_vmap(remote); + if (err) { + KUNIT_FAIL(test, "Failed to vmap system bo for %s: %li\n", + str, err); + goto out_unlock; + } + + xe_map_memset(xe, &remote->vmap, 0, 0xd0, remote->size); + fence = xe_migrate_clear(m, remote, remote->ttm.resource); + if (!sanity_fence_failed(xe, fence, big ? "Clearing remote big bo" : + "Clearing remote small bo", test)) { + retval = xe_map_rd(xe, &remote->vmap, 0, u64); + check(retval, expected, "remote first offset should be cleared", + test); + retval = xe_map_rd(xe, &remote->vmap, remote->size - 8, u64); + check(retval, expected, "remote last offset should be cleared", + test); + } + dma_fence_put(fence); + + /* Try to copy 0xc0 from remote to vram with 2MB or 64KiB/4KiB pages */ + xe_map_memset(xe, &remote->vmap, 0, 0xc0, remote->size); + xe_map_memset(xe, &bo->vmap, 0, 0xd0, bo->size); + + expected = 0xc0c0c0c0c0c0c0c0; + fence = xe_migrate_copy(m, remote, bo, remote->ttm.resource, + bo->ttm.resource, false); + if (!sanity_fence_failed(xe, fence, big ? 
"Copying big bo remote -> vram" : + "Copying small bo remote -> vram", test)) { + retval = xe_map_rd(xe, &bo->vmap, 0, u64); + check(retval, expected, + "remote -> vram bo first offset should be copied", test); + retval = xe_map_rd(xe, &bo->vmap, bo->size - 8, u64); + check(retval, expected, + "remote -> vram bo offset should be copied", test); + } + dma_fence_put(fence); + + /* And other way around.. slightly hacky.. */ + xe_map_memset(xe, &remote->vmap, 0, 0xd0, remote->size); + xe_map_memset(xe, &bo->vmap, 0, 0xc0, bo->size); + + fence = xe_migrate_copy(m, bo, remote, bo->ttm.resource, + remote->ttm.resource, false); + if (!sanity_fence_failed(xe, fence, big ? "Copying big bo vram -> remote" : + "Copying small bo vram -> remote", test)) { + retval = xe_map_rd(xe, &remote->vmap, 0, u64); + check(retval, expected, + "vram -> remote bo first offset should be copied", test); + retval = xe_map_rd(xe, &remote->vmap, bo->size - 8, u64); + check(retval, expected, + "vram -> remote bo last offset should be copied", test); + } + dma_fence_put(fence); + + xe_bo_vunmap(remote); +out_unlock: + xe_bo_unlock(remote); + xe_bo_put(remote); +} + +static void test_copy_sysmem(struct xe_migrate *m, struct xe_bo *bo, + struct kunit *test) +{ + test_copy(m, bo, test, XE_BO_CREATE_SYSTEM_BIT); +} + +static void test_copy_vram(struct xe_migrate *m, struct xe_bo *bo, + struct kunit *test) +{ + u32 region; + + if (bo->ttm.resource->mem_type == XE_PL_SYSTEM) + return; + + if (bo->ttm.resource->mem_type == XE_PL_VRAM0) + region = XE_BO_CREATE_VRAM1_BIT; + else + region = XE_BO_CREATE_VRAM0_BIT; + test_copy(m, bo, test, region); +} + +static void test_pt_update(struct xe_migrate *m, struct xe_bo *pt, + struct kunit *test, bool force_gpu) +{ + struct xe_device *xe = tile_to_xe(m->tile); + struct dma_fence *fence; + u64 retval, expected; + ktime_t then, now; + int i; + + struct xe_vm_pgtable_update update = { + .ofs = 1, + .qwords = 0x10, + .pt_bo = pt, + }; + struct xe_migrate_pt_update pt_update = { + .ops = &sanity_ops, + }; + struct migrate_test_params p = { + .base.id = XE_TEST_LIVE_MIGRATE, + .force_gpu = force_gpu, + }; + + test->priv = &p; + /* Test xe_migrate_update_pgtables() updates the pagetable as expected */ + expected = 0xf0f0f0f0f0f0f0f0ULL; + xe_map_memset(xe, &pt->vmap, 0, (u8)expected, pt->size); + + then = ktime_get(); + fence = xe_migrate_update_pgtables(m, m->q->vm, NULL, m->q, &update, 1, + NULL, 0, &pt_update); + now = ktime_get(); + if (sanity_fence_failed(xe, fence, "Migration pagetable update", test)) + return; + + kunit_info(test, "Updating without syncing took %llu us,\n", + (unsigned long long)ktime_to_us(ktime_sub(now, then))); + + dma_fence_put(fence); + retval = xe_map_rd(xe, &pt->vmap, 0, u64); + check(retval, expected, "PTE[0] must stay untouched", test); + + for (i = 0; i < update.qwords; i++) { + retval = xe_map_rd(xe, &pt->vmap, (update.ofs + i) * 8, u64); + check(retval, i * 0x1111111111111111ULL, "PTE update", test); + } + + retval = xe_map_rd(xe, &pt->vmap, 8 * (update.ofs + update.qwords), + u64); + check(retval, expected, "PTE[0x11] must stay untouched", test); +} + +static void xe_migrate_sanity_test(struct xe_migrate *m, struct kunit *test) +{ + struct xe_tile *tile = m->tile; + struct xe_device *xe = tile_to_xe(tile); + struct xe_bo *pt, *bo = m->pt_bo, *big, *tiny; + struct xe_res_cursor src_it; + struct dma_fence *fence; + u64 retval, expected; + struct xe_bb *bb; + int err; + u8 id = tile->id; + + err = xe_bo_vmap(bo); + if (err) { + KUNIT_FAIL(test, "Failed to vmap 
our pagetables: %li\n", + PTR_ERR(bo)); + return; + } + + big = xe_bo_create_pin_map(xe, tile, m->q->vm, SZ_4M, + ttm_bo_type_kernel, + XE_BO_CREATE_VRAM_IF_DGFX(tile) | + XE_BO_CREATE_PINNED_BIT); + if (IS_ERR(big)) { + KUNIT_FAIL(test, "Failed to allocate bo: %li\n", PTR_ERR(big)); + goto vunmap; + } + + pt = xe_bo_create_pin_map(xe, tile, m->q->vm, XE_PAGE_SIZE, + ttm_bo_type_kernel, + XE_BO_CREATE_VRAM_IF_DGFX(tile) | + XE_BO_CREATE_PINNED_BIT); + if (IS_ERR(pt)) { + KUNIT_FAIL(test, "Failed to allocate fake pt: %li\n", + PTR_ERR(pt)); + goto free_big; + } + + tiny = xe_bo_create_pin_map(xe, tile, m->q->vm, + 2 * SZ_4K, + ttm_bo_type_kernel, + XE_BO_CREATE_VRAM_IF_DGFX(tile) | + XE_BO_CREATE_PINNED_BIT); + if (IS_ERR(tiny)) { + KUNIT_FAIL(test, "Failed to allocate fake pt: %li\n", + PTR_ERR(pt)); + goto free_pt; + } + + bb = xe_bb_new(tile->primary_gt, 32, xe->info.has_usm); + if (IS_ERR(bb)) { + KUNIT_FAIL(test, "Failed to create batchbuffer: %li\n", + PTR_ERR(bb)); + goto free_tiny; + } + + kunit_info(test, "Starting tests, top level PT addr: %lx, special pagetable base addr: %lx\n", + (unsigned long)xe_bo_main_addr(m->q->vm->pt_root[id]->bo, XE_PAGE_SIZE), + (unsigned long)xe_bo_main_addr(m->pt_bo, XE_PAGE_SIZE)); + + /* First part of the test, are we updating our pagetable bo with a new entry? */ + xe_map_wr(xe, &bo->vmap, XE_PAGE_SIZE * (NUM_KERNEL_PDE - 1), u64, + 0xdeaddeadbeefbeef); + expected = m->q->vm->pt_ops->pte_encode_bo(pt, 0, xe->pat.idx[XE_CACHE_WB], 0); + if (m->q->vm->flags & XE_VM_FLAG_64K) + expected |= XE_PTE_PS64; + if (xe_bo_is_vram(pt)) + xe_res_first(pt->ttm.resource, 0, pt->size, &src_it); + else + xe_res_first_sg(xe_bo_sg(pt), 0, pt->size, &src_it); + + emit_pte(m, bb, NUM_KERNEL_PDE - 1, xe_bo_is_vram(pt), false, + &src_it, XE_PAGE_SIZE, pt); + + run_sanity_job(m, xe, bb, bb->len, "Writing PTE for our fake PT", test); + + retval = xe_map_rd(xe, &bo->vmap, XE_PAGE_SIZE * (NUM_KERNEL_PDE - 1), + u64); + check(retval, expected, "PTE entry write", test); + + /* Now try to write data to our newly mapped 'pagetable', see if it succeeds */ + bb->len = 0; + bb->cs[bb->len++] = MI_BATCH_BUFFER_END; + xe_map_wr(xe, &pt->vmap, 0, u32, 0xdeaddead); + expected = 0; + + emit_clear(tile->primary_gt, bb, xe_migrate_vm_addr(NUM_KERNEL_PDE - 1, 0), 4, 4, + IS_DGFX(xe)); + run_sanity_job(m, xe, bb, 1, "Writing to our newly mapped pagetable", + test); + + retval = xe_map_rd(xe, &pt->vmap, 0, u32); + check(retval, expected, "Write to PT after adding PTE", test); + + /* Sanity checks passed, try the full ones! 
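 *
 * These clear a small (8K) and a big (4M) buffer object, check their first
 * and last dwords, copy each of them to system memory and, on multi-tile
 * devices, to the other tile's VRAM, and finally run the page-table update
 * once preferring the CPU path (GPU idle) and once with the GPU path forced.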
*/ + + /* Clear a small bo */ + kunit_info(test, "Clearing small buffer object\n"); + xe_map_memset(xe, &tiny->vmap, 0, 0x22, tiny->size); + expected = 0; + fence = xe_migrate_clear(m, tiny, tiny->ttm.resource); + if (sanity_fence_failed(xe, fence, "Clearing small bo", test)) + goto out; + + dma_fence_put(fence); + retval = xe_map_rd(xe, &tiny->vmap, 0, u32); + check(retval, expected, "Command clear small first value", test); + retval = xe_map_rd(xe, &tiny->vmap, tiny->size - 4, u32); + check(retval, expected, "Command clear small last value", test); + + kunit_info(test, "Copying small buffer object to system\n"); + test_copy_sysmem(m, tiny, test); + if (xe->info.tile_count > 1) { + kunit_info(test, "Copying small buffer object to other vram\n"); + test_copy_vram(m, tiny, test); + } + + /* Clear a big bo */ + kunit_info(test, "Clearing big buffer object\n"); + xe_map_memset(xe, &big->vmap, 0, 0x11, big->size); + expected = 0; + fence = xe_migrate_clear(m, big, big->ttm.resource); + if (sanity_fence_failed(xe, fence, "Clearing big bo", test)) + goto out; + + dma_fence_put(fence); + retval = xe_map_rd(xe, &big->vmap, 0, u32); + check(retval, expected, "Command clear big first value", test); + retval = xe_map_rd(xe, &big->vmap, big->size - 4, u32); + check(retval, expected, "Command clear big last value", test); + + kunit_info(test, "Copying big buffer object to system\n"); + test_copy_sysmem(m, big, test); + if (xe->info.tile_count > 1) { + kunit_info(test, "Copying big buffer object to other vram\n"); + test_copy_vram(m, big, test); + } + + kunit_info(test, "Testing page table update using CPU if GPU idle.\n"); + test_pt_update(m, pt, test, false); + kunit_info(test, "Testing page table update using GPU\n"); + test_pt_update(m, pt, test, true); + +out: + xe_bb_free(bb, NULL); +free_tiny: + xe_bo_unpin(tiny); + xe_bo_put(tiny); +free_pt: + xe_bo_unpin(pt); + xe_bo_put(pt); +free_big: + xe_bo_unpin(big); + xe_bo_put(big); +vunmap: + xe_bo_vunmap(m->pt_bo); +} + +static int migrate_test_run_device(struct xe_device *xe) +{ + struct kunit *test = xe_cur_kunit(); + struct xe_tile *tile; + int id; + + for_each_tile(tile, xe, id) { + struct xe_migrate *m = tile->migrate; + + kunit_info(test, "Testing tile id %d.\n", id); + xe_vm_lock(m->q->vm, true); + xe_device_mem_access_get(xe); + xe_migrate_sanity_test(m, test); + xe_device_mem_access_put(xe); + xe_vm_unlock(m->q->vm); + } + + return 0; +} + +void xe_migrate_sanity_kunit(struct kunit *test) +{ + xe_call_for_each_device(migrate_test_run_device); +} +EXPORT_SYMBOL_IF_KUNIT(xe_migrate_sanity_kunit); diff --git a/drivers/gpu/drm/xe/tests/xe_migrate_test.c b/drivers/gpu/drm/xe/tests/xe_migrate_test.c new file mode 100644 index 000000000000..cf0c173b945f --- /dev/null +++ b/drivers/gpu/drm/xe/tests/xe_migrate_test.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_migrate_test.h" + +#include <kunit/test.h> + +static struct kunit_case xe_migrate_tests[] = { + KUNIT_CASE(xe_migrate_sanity_kunit), + {} +}; + +static struct kunit_suite xe_migrate_test_suite = { + .name = "xe_migrate", + .test_cases = xe_migrate_tests, +}; + +kunit_test_suite(xe_migrate_test_suite); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("xe_migrate kunit test"); +MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING); diff --git a/drivers/gpu/drm/xe/tests/xe_migrate_test.h b/drivers/gpu/drm/xe/tests/xe_migrate_test.h new file mode 100644 index 000000000000..7c645c66824f --- /dev/null 
+++ b/drivers/gpu/drm/xe/tests/xe_migrate_test.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 AND MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_MIGRATE_TEST_H_ +#define _XE_MIGRATE_TEST_H_ + +struct kunit; + +void xe_migrate_sanity_kunit(struct kunit *test); + +#endif diff --git a/drivers/gpu/drm/xe/tests/xe_mocs.c b/drivers/gpu/drm/xe/tests/xe_mocs.c new file mode 100644 index 000000000000..7dd34f94e809 --- /dev/null +++ b/drivers/gpu/drm/xe/tests/xe_mocs.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 AND MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include <kunit/test.h> +#include <kunit/visibility.h> + +#include "tests/xe_mocs_test.h" +#include "tests/xe_pci_test.h" +#include "tests/xe_test.h" + +#include "xe_pci.h" +#include "xe_gt.h" +#include "xe_mocs.h" +#include "xe_device.h" + +struct live_mocs { + struct xe_mocs_info table; +}; + +static int live_mocs_init(struct live_mocs *arg, struct xe_gt *gt) +{ + unsigned int flags; + struct kunit *test = xe_cur_kunit(); + + memset(arg, 0, sizeof(*arg)); + + flags = get_mocs_settings(gt_to_xe(gt), &arg->table); + + kunit_info(test, "table size %d", arg->table.size); + kunit_info(test, "table uc_index %d", arg->table.uc_index); + kunit_info(test, "table n_entries %d", arg->table.n_entries); + + return flags; +} + +static void read_l3cc_table(struct xe_gt *gt, + const struct xe_mocs_info *info) +{ + unsigned int i; + u32 l3cc; + u32 reg_val; + u32 ret; + + struct kunit *test = xe_cur_kunit(); + + xe_device_mem_access_get(gt_to_xe(gt)); + ret = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + KUNIT_ASSERT_EQ_MSG(test, ret, 0, "Forcewake Failed.\n"); + mocs_dbg(>_to_xe(gt)->drm, "L3CC entries:%d\n", info->n_entries); + for (i = 0; + i < (info->n_entries + 1) / 2 ? + (l3cc = l3cc_combine(get_entry_l3cc(info, 2 * i), + get_entry_l3cc(info, 2 * i + 1))), 1 : 0; + i++) { + if (GRAPHICS_VERx100(gt_to_xe(gt)) >= 1250) + reg_val = xe_gt_mcr_unicast_read_any(gt, XEHP_LNCFCMOCS(i)); + else + reg_val = xe_mmio_read32(gt, XELP_LNCFCMOCS(i)); + mocs_dbg(>_to_xe(gt)->drm, "%d 0x%x 0x%x 0x%x\n", i, + XELP_LNCFCMOCS(i).addr, reg_val, l3cc); + if (reg_val != l3cc) + KUNIT_FAIL(test, "l3cc reg 0x%x has incorrect val.\n", + XELP_LNCFCMOCS(i).addr); + } + xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); + xe_device_mem_access_put(gt_to_xe(gt)); +} + +static void read_mocs_table(struct xe_gt *gt, + const struct xe_mocs_info *info) +{ + struct xe_device *xe = gt_to_xe(gt); + + unsigned int i; + u32 mocs; + u32 reg_val; + u32 ret; + + struct kunit *test = xe_cur_kunit(); + + xe_device_mem_access_get(gt_to_xe(gt)); + ret = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + KUNIT_ASSERT_EQ_MSG(test, ret, 0, "Forcewake Failed.\n"); + mocs_dbg(>_to_xe(gt)->drm, "Global MOCS entries:%d\n", info->n_entries); + drm_WARN_ONCE(&xe->drm, !info->unused_entries_index, + "Unused entries index should have been defined\n"); + for (i = 0; + i < info->n_entries ? 
(mocs = get_entry_control(info, i)), 1 : 0; + i++) { + if (GRAPHICS_VERx100(gt_to_xe(gt)) >= 1250) + reg_val = xe_gt_mcr_unicast_read_any(gt, XEHP_GLOBAL_MOCS(i)); + else + reg_val = xe_mmio_read32(gt, XELP_GLOBAL_MOCS(i)); + mocs_dbg(>_to_xe(gt)->drm, "%d 0x%x 0x%x 0x%x\n", i, + XELP_GLOBAL_MOCS(i).addr, reg_val, mocs); + if (reg_val != mocs) + KUNIT_FAIL(test, "mocs reg 0x%x has incorrect val.\n", + XELP_GLOBAL_MOCS(i).addr); + } + xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); + xe_device_mem_access_put(gt_to_xe(gt)); +} + +static int mocs_kernel_test_run_device(struct xe_device *xe) +{ + /* Basic check the system is configured with the expected mocs table */ + + struct live_mocs mocs; + struct xe_gt *gt; + + unsigned int flags; + int id; + + for_each_gt(gt, xe, id) { + flags = live_mocs_init(&mocs, gt); + if (flags & HAS_GLOBAL_MOCS) + read_mocs_table(gt, &mocs.table); + if (flags & HAS_LNCF_MOCS) + read_l3cc_table(gt, &mocs.table); + } + return 0; +} + +void xe_live_mocs_kernel_kunit(struct kunit *test) +{ + xe_call_for_each_device(mocs_kernel_test_run_device); +} +EXPORT_SYMBOL_IF_KUNIT(xe_live_mocs_kernel_kunit); diff --git a/drivers/gpu/drm/xe/tests/xe_mocs_test.c b/drivers/gpu/drm/xe/tests/xe_mocs_test.c new file mode 100644 index 000000000000..ef56bd517b28 --- /dev/null +++ b/drivers/gpu/drm/xe/tests/xe_mocs_test.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_mocs_test.h" + +#include <kunit/test.h> + +static struct kunit_case xe_mocs_tests[] = { + KUNIT_CASE(xe_live_mocs_kernel_kunit), + {} +}; + +static struct kunit_suite xe_mocs_test_suite = { + .name = "xe_mocs", + .test_cases = xe_mocs_tests, +}; + +kunit_test_suite(xe_mocs_test_suite); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING); diff --git a/drivers/gpu/drm/xe/tests/xe_mocs_test.h b/drivers/gpu/drm/xe/tests/xe_mocs_test.h new file mode 100644 index 000000000000..7faa3575e6c3 --- /dev/null +++ b/drivers/gpu/drm/xe/tests/xe_mocs_test.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 AND MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_MOCS_TEST_H_ +#define _XE_MOCS_TEST_H_ + +struct kunit; + +void xe_live_mocs_kernel_kunit(struct kunit *test); + +#endif diff --git a/drivers/gpu/drm/xe/tests/xe_pci.c b/drivers/gpu/drm/xe/tests/xe_pci.c new file mode 100644 index 000000000000..602793644f61 --- /dev/null +++ b/drivers/gpu/drm/xe/tests/xe_pci.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 AND MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include "tests/xe_pci_test.h" + +#include "tests/xe_test.h" + +#include <kunit/test-bug.h> +#include <kunit/test.h> +#include <kunit/test-bug.h> +#include <kunit/visibility.h> + +struct kunit_test_data { + int ndevs; + xe_device_fn xe_fn; +}; + +static int dev_to_xe_device_fn(struct device *dev, void *__data) + +{ + struct drm_device *drm = dev_get_drvdata(dev); + struct kunit_test_data *data = __data; + int ret = 0; + int idx; + + data->ndevs++; + + if (drm_dev_enter(drm, &idx)) + ret = data->xe_fn(to_xe_device(dev_get_drvdata(dev))); + drm_dev_exit(idx); + + return ret; +} + +/** + * xe_call_for_each_device - Iterate over all devices this driver binds to + * @xe_fn: Function to call for each device. + * + * This function iterated over all devices this driver binds to, and calls + * @xe_fn: for each one of them. 
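 * For example, xe_migrate_sanity_kunit() passes migrate_test_run_device()
 * here and xe_live_mocs_kernel_kunit() passes mocs_kernel_test_run_device(),
 * so the live tests run against every bound device.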
If the called function returns anything else + * than 0, iteration is stopped and the return value is returned by this + * function. Across each function call, drm_dev_enter() / drm_dev_exit() is + * called for the corresponding drm device. + * + * Return: Number of devices iterated or + * the error code of a call to @xe_fn returning an error code. + */ +int xe_call_for_each_device(xe_device_fn xe_fn) +{ + int ret; + struct kunit_test_data data = { + .xe_fn = xe_fn, + .ndevs = 0, + }; + + ret = driver_for_each_device(&xe_pci_driver.driver, NULL, + &data, dev_to_xe_device_fn); + + if (!data.ndevs) + kunit_skip(current->kunit_test, "test runs only on hardware\n"); + + return ret ?: data.ndevs; +} + +/** + * xe_call_for_each_graphics_ip - Iterate over all recognized graphics IPs + * @xe_fn: Function to call for each device. + * + * This function iterates over the descriptors for all graphics IPs recognized + * by the driver and calls @xe_fn: for each one of them. + */ +void xe_call_for_each_graphics_ip(xe_graphics_fn xe_fn) +{ + const struct xe_graphics_desc *ip, *last = NULL; + + for (int i = 0; i < ARRAY_SIZE(graphics_ip_map); i++) { + ip = graphics_ip_map[i].ip; + if (ip == last) + continue; + + xe_fn(ip); + last = ip; + } +} +EXPORT_SYMBOL_IF_KUNIT(xe_call_for_each_graphics_ip); + +/** + * xe_call_for_each_media_ip - Iterate over all recognized media IPs + * @xe_fn: Function to call for each device. + * + * This function iterates over the descriptors for all media IPs recognized + * by the driver and calls @xe_fn: for each one of them. + */ +void xe_call_for_each_media_ip(xe_media_fn xe_fn) +{ + const struct xe_media_desc *ip, *last = NULL; + + for (int i = 0; i < ARRAY_SIZE(media_ip_map); i++) { + ip = media_ip_map[i].ip; + if (ip == last) + continue; + + xe_fn(ip); + last = ip; + } +} +EXPORT_SYMBOL_IF_KUNIT(xe_call_for_each_media_ip); + +static void fake_read_gmdid(struct xe_device *xe, enum xe_gmdid_type type, + u32 *ver, u32 *revid) +{ + struct kunit *test = kunit_get_current_test(); + struct xe_pci_fake_data *data = test->priv; + + if (type == GMDID_MEDIA) { + *ver = data->media_verx100; + *revid = xe_step_to_gmdid(data->media_step); + } else { + *ver = data->graphics_verx100; + *revid = xe_step_to_gmdid(data->graphics_step); + } +} + +int xe_pci_fake_device_init(struct xe_device *xe) +{ + struct kunit *test = kunit_get_current_test(); + struct xe_pci_fake_data *data = test->priv; + const struct pci_device_id *ent = pciidlist; + const struct xe_device_desc *desc; + const struct xe_subplatform_desc *subplatform_desc; + + if (!data) { + desc = (const void *)ent->driver_data; + subplatform_desc = NULL; + goto done; + } + + for (ent = pciidlist; ent->device; ent++) { + desc = (const void *)ent->driver_data; + if (desc->platform == data->platform) + break; + } + + if (!ent->device) + return -ENODEV; + + for (subplatform_desc = desc->subplatforms; + subplatform_desc && subplatform_desc->subplatform; + subplatform_desc++) + if (subplatform_desc->subplatform == data->subplatform) + break; + + if (data->subplatform != XE_SUBPLATFORM_NONE && !subplatform_desc) + return -ENODEV; + +done: + kunit_activate_static_stub(test, read_gmdid, fake_read_gmdid); + + xe_info_init_early(xe, desc, subplatform_desc); + xe_info_init(xe, desc->graphics, desc->media); + + return 0; +} +EXPORT_SYMBOL_IF_KUNIT(xe_pci_fake_device_init); diff --git a/drivers/gpu/drm/xe/tests/xe_pci_test.c b/drivers/gpu/drm/xe/tests/xe_pci_test.c new file mode 100644 index 000000000000..171e4180f1aa --- /dev/null +++ 
b/drivers/gpu/drm/xe/tests/xe_pci_test.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright © 2023 Intel Corporation + */ + +#include <drm/drm_drv.h> +#include <drm/drm_kunit_helpers.h> + +#include <kunit/test.h> + +#include "tests/xe_test.h" + +#include "xe_device.h" +#include "xe_pci_test.h" +#include "xe_pci_types.h" + +static void check_graphics_ip(const struct xe_graphics_desc *graphics) +{ + struct kunit *test = xe_cur_kunit(); + u64 mask = graphics->hw_engine_mask; + + /* RCS, CCS, and BCS engines are allowed on the graphics IP */ + mask &= ~(XE_HW_ENGINE_RCS_MASK | + XE_HW_ENGINE_CCS_MASK | + XE_HW_ENGINE_BCS_MASK); + + /* Any remaining engines are an error */ + KUNIT_ASSERT_EQ(test, mask, 0); +} + +static void check_media_ip(const struct xe_media_desc *media) +{ + struct kunit *test = xe_cur_kunit(); + u64 mask = media->hw_engine_mask; + + /* VCS, VECS and GSCCS engines are allowed on the media IP */ + mask &= ~(XE_HW_ENGINE_VCS_MASK | + XE_HW_ENGINE_VECS_MASK | + XE_HW_ENGINE_GSCCS_MASK); + + /* Any remaining engines are an error */ + KUNIT_ASSERT_EQ(test, mask, 0); +} + +static void xe_gmdid_graphics_ip(struct kunit *test) +{ + xe_call_for_each_graphics_ip(check_graphics_ip); +} + +static void xe_gmdid_media_ip(struct kunit *test) +{ + xe_call_for_each_media_ip(check_media_ip); +} + +static struct kunit_case xe_pci_tests[] = { + KUNIT_CASE(xe_gmdid_graphics_ip), + KUNIT_CASE(xe_gmdid_media_ip), + {} +}; + +static struct kunit_suite xe_pci_test_suite = { + .name = "xe_pci", + .test_cases = xe_pci_tests, +}; + +kunit_test_suite(xe_pci_test_suite); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("xe_pci kunit test"); +MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING); diff --git a/drivers/gpu/drm/xe/tests/xe_pci_test.h b/drivers/gpu/drm/xe/tests/xe_pci_test.h new file mode 100644 index 000000000000..811ffe5bd9fd --- /dev/null +++ b/drivers/gpu/drm/xe/tests/xe_pci_test.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 AND MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_PCI_TEST_H_ +#define _XE_PCI_TEST_H_ + +#include <linux/types.h> + +#include "xe_platform_types.h" + +struct xe_device; +struct xe_graphics_desc; +struct xe_media_desc; + +typedef int (*xe_device_fn)(struct xe_device *); +typedef void (*xe_graphics_fn)(const struct xe_graphics_desc *); +typedef void (*xe_media_fn)(const struct xe_media_desc *); + +int xe_call_for_each_device(xe_device_fn xe_fn); +void xe_call_for_each_graphics_ip(xe_graphics_fn xe_fn); +void xe_call_for_each_media_ip(xe_media_fn xe_fn); + +struct xe_pci_fake_data { + enum xe_platform platform; + enum xe_subplatform subplatform; + u32 graphics_verx100; + u32 media_verx100; + u32 graphics_step; + u32 media_step; +}; + +int xe_pci_fake_device_init(struct xe_device *xe); + +#endif diff --git a/drivers/gpu/drm/xe/tests/xe_rtp_test.c b/drivers/gpu/drm/xe/tests/xe_rtp_test.c new file mode 100644 index 000000000000..4a6972897675 --- /dev/null +++ b/drivers/gpu/drm/xe/tests/xe_rtp_test.c @@ -0,0 +1,319 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright © 2023 Intel Corporation + */ + +#include <linux/string.h> +#include <linux/xarray.h> + +#include <drm/drm_drv.h> +#include <drm/drm_kunit_helpers.h> + +#include <kunit/test.h> + +#include "regs/xe_gt_regs.h" +#include "regs/xe_reg_defs.h" +#include "xe_device.h" +#include "xe_device_types.h" +#include "xe_pci_test.h" +#include "xe_reg_sr.h" +#include "xe_rtp.h" + +#define REGULAR_REG1 XE_REG(1) +#define REGULAR_REG2 XE_REG(2) 
+#define REGULAR_REG3 XE_REG(3) +#define MCR_REG1 XE_REG_MCR(1) +#define MCR_REG2 XE_REG_MCR(2) +#define MCR_REG3 XE_REG_MCR(3) +#define MASKED_REG1 XE_REG(1, XE_REG_OPTION_MASKED) + +#undef XE_REG_MCR +#define XE_REG_MCR(...) XE_REG(__VA_ARGS__, .mcr = 1) + +struct rtp_test_case { + const char *name; + struct xe_reg expected_reg; + u32 expected_set_bits; + u32 expected_clr_bits; + unsigned long expected_count; + unsigned int expected_sr_errors; + const struct xe_rtp_entry_sr *entries; +}; + +static bool match_yes(const struct xe_gt *gt, const struct xe_hw_engine *hwe) +{ + return true; +} + +static bool match_no(const struct xe_gt *gt, const struct xe_hw_engine *hwe) +{ + return false; +} + +static const struct rtp_test_case cases[] = { + { + .name = "coalesce-same-reg", + .expected_reg = REGULAR_REG1, + .expected_set_bits = REG_BIT(0) | REG_BIT(1), + .expected_clr_bits = REG_BIT(0) | REG_BIT(1), + .expected_count = 1, + /* Different bits on the same register: create a single entry */ + .entries = (const struct xe_rtp_entry_sr[]) { + { XE_RTP_NAME("basic-1"), + XE_RTP_RULES(FUNC(match_yes)), + XE_RTP_ACTIONS(SET(REGULAR_REG1, REG_BIT(0))) + }, + { XE_RTP_NAME("basic-2"), + XE_RTP_RULES(FUNC(match_yes)), + XE_RTP_ACTIONS(SET(REGULAR_REG1, REG_BIT(1))) + }, + {} + }, + }, + { + .name = "no-match-no-add", + .expected_reg = REGULAR_REG1, + .expected_set_bits = REG_BIT(0), + .expected_clr_bits = REG_BIT(0), + .expected_count = 1, + /* Don't coalesce second entry since rules don't match */ + .entries = (const struct xe_rtp_entry_sr[]) { + { XE_RTP_NAME("basic-1"), + XE_RTP_RULES(FUNC(match_yes)), + XE_RTP_ACTIONS(SET(REGULAR_REG1, REG_BIT(0))) + }, + { XE_RTP_NAME("basic-2"), + XE_RTP_RULES(FUNC(match_no)), + XE_RTP_ACTIONS(SET(REGULAR_REG1, REG_BIT(1))) + }, + {} + }, + }, + { + .name = "no-match-no-add-multiple-rules", + .expected_reg = REGULAR_REG1, + .expected_set_bits = REG_BIT(0), + .expected_clr_bits = REG_BIT(0), + .expected_count = 1, + /* Don't coalesce second entry due to one of the rules */ + .entries = (const struct xe_rtp_entry_sr[]) { + { XE_RTP_NAME("basic-1"), + XE_RTP_RULES(FUNC(match_yes)), + XE_RTP_ACTIONS(SET(REGULAR_REG1, REG_BIT(0))) + }, + { XE_RTP_NAME("basic-2"), + XE_RTP_RULES(FUNC(match_yes), FUNC(match_no)), + XE_RTP_ACTIONS(SET(REGULAR_REG1, REG_BIT(1))) + }, + {} + }, + }, + { + .name = "two-regs-two-entries", + .expected_reg = REGULAR_REG1, + .expected_set_bits = REG_BIT(0), + .expected_clr_bits = REG_BIT(0), + .expected_count = 2, + /* Same bits on different registers are not coalesced */ + .entries = (const struct xe_rtp_entry_sr[]) { + { XE_RTP_NAME("basic-1"), + XE_RTP_RULES(FUNC(match_yes)), + XE_RTP_ACTIONS(SET(REGULAR_REG1, REG_BIT(0))) + }, + { XE_RTP_NAME("basic-2"), + XE_RTP_RULES(FUNC(match_yes)), + XE_RTP_ACTIONS(SET(REGULAR_REG2, REG_BIT(0))) + }, + {} + }, + }, + { + .name = "clr-one-set-other", + .expected_reg = REGULAR_REG1, + .expected_set_bits = REG_BIT(0), + .expected_clr_bits = REG_BIT(1) | REG_BIT(0), + .expected_count = 1, + /* Check clr vs set actions on different bits */ + .entries = (const struct xe_rtp_entry_sr[]) { + { XE_RTP_NAME("basic-1"), + XE_RTP_RULES(FUNC(match_yes)), + XE_RTP_ACTIONS(SET(REGULAR_REG1, REG_BIT(0))) + }, + { XE_RTP_NAME("basic-2"), + XE_RTP_RULES(FUNC(match_yes)), + XE_RTP_ACTIONS(CLR(REGULAR_REG1, REG_BIT(1))) + }, + {} + }, + }, + { +#define TEMP_MASK REG_GENMASK(10, 8) +#define TEMP_FIELD REG_FIELD_PREP(TEMP_MASK, 2) + .name = "set-field", + .expected_reg = REGULAR_REG1, + .expected_set_bits = TEMP_FIELD, + 
.expected_clr_bits = TEMP_MASK, + .expected_count = 1, + /* Check FIELD_SET works */ + .entries = (const struct xe_rtp_entry_sr[]) { + { XE_RTP_NAME("basic-1"), + XE_RTP_RULES(FUNC(match_yes)), + XE_RTP_ACTIONS(FIELD_SET(REGULAR_REG1, + TEMP_MASK, TEMP_FIELD)) + }, + {} + }, +#undef TEMP_MASK +#undef TEMP_FIELD + }, + { + .name = "conflict-duplicate", + .expected_reg = REGULAR_REG1, + .expected_set_bits = REG_BIT(0), + .expected_clr_bits = REG_BIT(0), + .expected_count = 1, + .expected_sr_errors = 1, + .entries = (const struct xe_rtp_entry_sr[]) { + { XE_RTP_NAME("basic-1"), + XE_RTP_RULES(FUNC(match_yes)), + XE_RTP_ACTIONS(SET(REGULAR_REG1, REG_BIT(0))) + }, + /* drop: setting same values twice */ + { XE_RTP_NAME("basic-2"), + XE_RTP_RULES(FUNC(match_yes)), + XE_RTP_ACTIONS(SET(REGULAR_REG1, REG_BIT(0))) + }, + {} + }, + }, + { + .name = "conflict-not-disjoint", + .expected_reg = REGULAR_REG1, + .expected_set_bits = REG_BIT(0), + .expected_clr_bits = REG_BIT(0), + .expected_count = 1, + .expected_sr_errors = 1, + .entries = (const struct xe_rtp_entry_sr[]) { + { XE_RTP_NAME("basic-1"), + XE_RTP_RULES(FUNC(match_yes)), + XE_RTP_ACTIONS(SET(REGULAR_REG1, REG_BIT(0))) + }, + /* drop: bits are not disjoint with previous entries */ + { XE_RTP_NAME("basic-2"), + XE_RTP_RULES(FUNC(match_yes)), + XE_RTP_ACTIONS(CLR(REGULAR_REG1, REG_GENMASK(1, 0))) + }, + {} + }, + }, + { + .name = "conflict-reg-type", + .expected_reg = REGULAR_REG1, + .expected_set_bits = REG_BIT(0), + .expected_clr_bits = REG_BIT(0), + .expected_count = 1, + .expected_sr_errors = 2, + .entries = (const struct xe_rtp_entry_sr[]) { + { XE_RTP_NAME("basic-1"), + XE_RTP_RULES(FUNC(match_yes)), + XE_RTP_ACTIONS(SET(REGULAR_REG1, REG_BIT(0))) + }, + /* drop: regular vs MCR */ + { XE_RTP_NAME("basic-2"), + XE_RTP_RULES(FUNC(match_yes)), + XE_RTP_ACTIONS(SET(MCR_REG1, REG_BIT(1))) + }, + /* drop: regular vs masked */ + { XE_RTP_NAME("basic-3"), + XE_RTP_RULES(FUNC(match_yes)), + XE_RTP_ACTIONS(SET(MASKED_REG1, REG_BIT(0))) + }, + {} + }, + }, +}; + +static void xe_rtp_process_tests(struct kunit *test) +{ + const struct rtp_test_case *param = test->param_value; + struct xe_device *xe = test->priv; + struct xe_gt *gt = xe_device_get_root_tile(xe)->primary_gt; + struct xe_reg_sr *reg_sr = >->reg_sr; + const struct xe_reg_sr_entry *sre, *sr_entry = NULL; + struct xe_rtp_process_ctx ctx = XE_RTP_PROCESS_CTX_INITIALIZER(gt); + unsigned long idx, count = 0; + + xe_reg_sr_init(reg_sr, "xe_rtp_tests", xe); + xe_rtp_process_to_sr(&ctx, param->entries, reg_sr); + + xa_for_each(®_sr->xa, idx, sre) { + if (idx == param->expected_reg.addr) + sr_entry = sre; + + count++; + } + + KUNIT_EXPECT_EQ(test, count, param->expected_count); + KUNIT_EXPECT_EQ(test, sr_entry->clr_bits, param->expected_clr_bits); + KUNIT_EXPECT_EQ(test, sr_entry->set_bits, param->expected_set_bits); + KUNIT_EXPECT_EQ(test, sr_entry->reg.raw, param->expected_reg.raw); + KUNIT_EXPECT_EQ(test, reg_sr->errors, param->expected_sr_errors); +} + +static void rtp_desc(const struct rtp_test_case *t, char *desc) +{ + strscpy(desc, t->name, KUNIT_PARAM_DESC_SIZE); +} + +KUNIT_ARRAY_PARAM(rtp, cases, rtp_desc); + +static int xe_rtp_test_init(struct kunit *test) +{ + struct xe_device *xe; + struct device *dev; + int ret; + + dev = drm_kunit_helper_alloc_device(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); + + xe = drm_kunit_helper_alloc_drm_device(test, dev, + struct xe_device, + drm, DRIVER_GEM); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, xe); + + /* Initialize an empty device */ + test->priv 
= NULL; + ret = xe_pci_fake_device_init(xe); + KUNIT_ASSERT_EQ(test, ret, 0); + + xe->drm.dev = dev; + test->priv = xe; + + return 0; +} + +static void xe_rtp_test_exit(struct kunit *test) +{ + struct xe_device *xe = test->priv; + + drm_kunit_helper_free_device(test, xe->drm.dev); +} + +static struct kunit_case xe_rtp_tests[] = { + KUNIT_CASE_PARAM(xe_rtp_process_tests, rtp_gen_params), + {} +}; + +static struct kunit_suite xe_rtp_test_suite = { + .name = "xe_rtp", + .init = xe_rtp_test_init, + .exit = xe_rtp_test_exit, + .test_cases = xe_rtp_tests, +}; + +kunit_test_suite(xe_rtp_test_suite); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("xe_rtp kunit test"); +MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING); diff --git a/drivers/gpu/drm/xe/tests/xe_test.h b/drivers/gpu/drm/xe/tests/xe_test.h new file mode 100644 index 000000000000..7a1ae213e750 --- /dev/null +++ b/drivers/gpu/drm/xe/tests/xe_test.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 AND MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_TEST_H_ +#define _XE_TEST_H_ + +#include <linux/types.h> + +#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) +#include <linux/sched.h> +#include <kunit/test.h> + +/* + * Each test that provides a kunit private test structure, place a test id + * here and point the kunit->priv to an embedded struct xe_test_priv. + */ +enum xe_test_priv_id { + XE_TEST_LIVE_DMA_BUF, + XE_TEST_LIVE_MIGRATE, +}; + +/** + * struct xe_test_priv - Base class for test private info + * @id: enum xe_test_priv_id to identify the subclass. + */ +struct xe_test_priv { + enum xe_test_priv_id id; +}; + +#define XE_TEST_DECLARE(x) x +#define XE_TEST_ONLY(x) unlikely(x) +#define XE_TEST_EXPORT +#define xe_cur_kunit() current->kunit_test + +/** + * xe_cur_kunit_priv - Obtain the struct xe_test_priv pointed to by + * current->kunit->priv if it exists and is embedded in the expected subclass. + * @id: Id of the expected subclass. + * + * Return: NULL if the process is not a kunit test, and NULL if the + * current kunit->priv pointer is not pointing to an object of the expected + * subclass. A pointer to the embedded struct xe_test_priv otherwise. + */ +static inline struct xe_test_priv * +xe_cur_kunit_priv(enum xe_test_priv_id id) +{ + struct xe_test_priv *priv; + + if (!xe_cur_kunit()) + return NULL; + + priv = xe_cur_kunit()->priv; + return priv->id == id ? 
priv : NULL; +} + +#else /* if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) */ + +#define XE_TEST_DECLARE(x) +#define XE_TEST_ONLY(x) 0 +#define XE_TEST_EXPORT static +#define xe_cur_kunit() NULL +#define xe_cur_kunit_priv(_id) NULL + +#endif +#endif diff --git a/drivers/gpu/drm/xe/tests/xe_wa_test.c b/drivers/gpu/drm/xe/tests/xe_wa_test.c new file mode 100644 index 000000000000..a53c22a19582 --- /dev/null +++ b/drivers/gpu/drm/xe/tests/xe_wa_test.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright © 2023 Intel Corporation + */ + +#include <drm/drm_drv.h> +#include <drm/drm_kunit_helpers.h> + +#include <kunit/test.h> + +#include "xe_device.h" +#include "xe_pci_test.h" +#include "xe_reg_sr.h" +#include "xe_tuning.h" +#include "xe_wa.h" + +struct platform_test_case { + const char *name; + enum xe_platform platform; + enum xe_subplatform subplatform; + u32 graphics_verx100; + u32 media_verx100; + struct xe_step_info step; +}; + +#define PLATFORM_CASE(platform__, graphics_step__) \ + { \ + .name = #platform__ " (" #graphics_step__ ")", \ + .platform = XE_ ## platform__, \ + .subplatform = XE_SUBPLATFORM_NONE, \ + .step = { .graphics = STEP_ ## graphics_step__ } \ + } + + +#define SUBPLATFORM_CASE(platform__, subplatform__, graphics_step__) \ + { \ + .name = #platform__ "_" #subplatform__ " (" #graphics_step__ ")", \ + .platform = XE_ ## platform__, \ + .subplatform = XE_SUBPLATFORM_ ## platform__ ## _ ## subplatform__, \ + .step = { .graphics = STEP_ ## graphics_step__ } \ + } + +#define GMDID_CASE(platform__, graphics_verx100__, graphics_step__, \ + media_verx100__, media_step__) \ + { \ + .name = #platform__ " (g:" #graphics_step__ ", m:" #media_step__ ")",\ + .platform = XE_ ## platform__, \ + .subplatform = XE_SUBPLATFORM_NONE, \ + .graphics_verx100 = graphics_verx100__, \ + .media_verx100 = media_verx100__, \ + .step = { .graphics = STEP_ ## graphics_step__, \ + .media = STEP_ ## media_step__ } \ + } + +static const struct platform_test_case cases[] = { + PLATFORM_CASE(TIGERLAKE, B0), + PLATFORM_CASE(DG1, A0), + PLATFORM_CASE(DG1, B0), + PLATFORM_CASE(ALDERLAKE_S, A0), + PLATFORM_CASE(ALDERLAKE_S, B0), + PLATFORM_CASE(ALDERLAKE_S, C0), + PLATFORM_CASE(ALDERLAKE_S, D0), + PLATFORM_CASE(ALDERLAKE_P, A0), + PLATFORM_CASE(ALDERLAKE_P, B0), + PLATFORM_CASE(ALDERLAKE_P, C0), + SUBPLATFORM_CASE(ALDERLAKE_S, RPLS, D0), + SUBPLATFORM_CASE(ALDERLAKE_P, RPLU, E0), + SUBPLATFORM_CASE(DG2, G10, A0), + SUBPLATFORM_CASE(DG2, G10, A1), + SUBPLATFORM_CASE(DG2, G10, B0), + SUBPLATFORM_CASE(DG2, G10, C0), + SUBPLATFORM_CASE(DG2, G11, A0), + SUBPLATFORM_CASE(DG2, G11, B0), + SUBPLATFORM_CASE(DG2, G11, B1), + SUBPLATFORM_CASE(DG2, G12, A0), + SUBPLATFORM_CASE(DG2, G12, A1), + PLATFORM_CASE(PVC, B0), + PLATFORM_CASE(PVC, B1), + PLATFORM_CASE(PVC, C0), + GMDID_CASE(METEORLAKE, 1270, A0, 1300, A0), + GMDID_CASE(METEORLAKE, 1271, A0, 1300, A0), + GMDID_CASE(LUNARLAKE, 2004, A0, 2000, A0), + GMDID_CASE(LUNARLAKE, 2004, B0, 2000, A0), +}; + +static void platform_desc(const struct platform_test_case *t, char *desc) +{ + strscpy(desc, t->name, KUNIT_PARAM_DESC_SIZE); +} + +KUNIT_ARRAY_PARAM(platform, cases, platform_desc); + +static int xe_wa_test_init(struct kunit *test) +{ + const struct platform_test_case *param = test->param_value; + struct xe_pci_fake_data data = { + .platform = param->platform, + .subplatform = param->subplatform, + .graphics_verx100 = param->graphics_verx100, + .media_verx100 = param->media_verx100, + .graphics_step = param->step.graphics, + .media_step = param->step.media, + }; 
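	/*
	 * xe_pci_fake_device_init() (tests/xe_pci.c) reads this description
	 * through kunit_get_current_test()->priv: it matches .platform and
	 * .subplatform against pciidlist and stubs read_gmdid() so that
	 * GMD_ID-based platforms report the graphics/media versions and
	 * steppings requested above. Once initialization is done, test->priv
	 * is repointed at the xe device below.
	 */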
+ struct xe_device *xe; + struct device *dev; + int ret; + + dev = drm_kunit_helper_alloc_device(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); + + xe = drm_kunit_helper_alloc_drm_device(test, dev, + struct xe_device, + drm, DRIVER_GEM); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, xe); + + test->priv = &data; + ret = xe_pci_fake_device_init(xe); + KUNIT_ASSERT_EQ(test, ret, 0); + + if (!param->graphics_verx100) + xe->info.step = param->step; + + /* TODO: init hw engines for engine/LRC WAs */ + xe->drm.dev = dev; + test->priv = xe; + + return 0; +} + +static void xe_wa_test_exit(struct kunit *test) +{ + struct xe_device *xe = test->priv; + + drm_kunit_helper_free_device(test, xe->drm.dev); +} + +static void xe_wa_gt(struct kunit *test) +{ + struct xe_device *xe = test->priv; + struct xe_gt *gt; + int id; + + for_each_gt(gt, xe, id) { + xe_reg_sr_init(>->reg_sr, "GT", xe); + + xe_wa_process_gt(gt); + xe_tuning_process_gt(gt); + + KUNIT_ASSERT_EQ(test, gt->reg_sr.errors, 0); + } +} + +static struct kunit_case xe_wa_tests[] = { + KUNIT_CASE_PARAM(xe_wa_gt, platform_gen_params), + {} +}; + +static struct kunit_suite xe_rtp_test_suite = { + .name = "xe_wa", + .init = xe_wa_test_init, + .exit = xe_wa_test_exit, + .test_cases = xe_wa_tests, +}; + +kunit_test_suite(xe_rtp_test_suite); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("xe_wa kunit test"); +MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING); diff --git a/drivers/gpu/drm/xe/xe_assert.h b/drivers/gpu/drm/xe/xe_assert.h new file mode 100644 index 000000000000..34c142e6cfb0 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_assert.h @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_ASSERT_H_ +#define _XE_ASSERT_H_ + +#include <linux/string_helpers.h> + +#include <drm/drm_print.h> + +#include "xe_device_types.h" +#include "xe_step.h" + +/** + * DOC: Xe ASSERTs + * + * While Xe driver aims to be simpler than legacy i915 driver it is still + * complex enough that some changes introduced while adding new functionality + * could break the existing code. + * + * Adding &drm_WARN or &drm_err to catch unwanted programming usage could lead + * to undesired increased driver footprint and may impact production driver + * performance as this additional code will be always present. + * + * To allow annotate functions with additional detailed debug checks to assert + * that all prerequisites are satisfied, without worrying about footprint or + * performance penalty on production builds where all potential misuses + * introduced during code integration were already fixed, we introduce family + * of Xe assert macros that try to follow classic assert() utility: + * + * * xe_assert() + * * xe_tile_assert() + * * xe_gt_assert() + * + * These macros are implemented on top of &drm_WARN, but unlikely to the origin, + * warning is triggered when provided condition is false. Additionally all above + * assert macros cannot be used in expressions or as a condition, since + * underlying code will be compiled out on non-debug builds. + * + * Note that these macros are not intended for use to cover known gaps in the + * implementation; for such cases use regular &drm_WARN or &drm_err and provide + * valid safe fallback. + * + * Also in cases where performance or footprint is not an issue, developers + * should continue to use the regular &drm_WARN or &drm_err to ensure that bug + * reports from production builds will contain meaningful diagnostics data. 
+ * + * Below code shows how asserts could help in debug to catch unplanned use:: + * + * static void one_igfx(struct xe_device *xe) + * { + * xe_assert(xe, xe->info.is_dgfx == false); + * xe_assert(xe, xe->info.tile_count == 1); + * } + * + * static void two_dgfx(struct xe_device *xe) + * { + * xe_assert(xe, xe->info.is_dgfx); + * xe_assert(xe, xe->info.tile_count == 2); + * } + * + * void foo(struct xe_device *xe) + * { + * if (xe->info.dgfx) + * return two_dgfx(xe); + * return one_igfx(xe); + * } + * + * void bar(struct xe_device *xe) + * { + * if (drm_WARN_ON(xe->drm, xe->info.tile_count > 2)) + * return; + * + * if (xe->info.tile_count == 2) + * return two_dgfx(xe); + * return one_igfx(xe); + * } + */ + +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG) +#define __xe_assert_msg(xe, condition, msg, arg...) ({ \ + (void)drm_WARN(&(xe)->drm, !(condition), "[" DRM_NAME "] Assertion `%s` failed!\n" msg, \ + __stringify(condition), ## arg); \ +}) +#else +#define __xe_assert_msg(xe, condition, msg, arg...) ({ \ + typecheck(const struct xe_device *, xe); \ + BUILD_BUG_ON_INVALID(condition); \ +}) +#endif + +/** + * xe_assert - warn if condition is false when debugging. + * @xe: the &struct xe_device pointer to which &condition applies + * @condition: condition to check + * + * xe_assert() uses &drm_WARN to emit a warning and print additional information + * that could be read from the &xe pointer if provided &condition is false. + * + * Contrary to &drm_WARN, xe_assert() is effective only on debug builds + * (&CONFIG_DRM_XE_DEBUG must be enabled) and cannot be used in expressions + * or as a condition. + * + * See `Xe ASSERTs`_ for general usage guidelines. + */ +#define xe_assert(xe, condition) xe_assert_msg((xe), condition, "") +#define xe_assert_msg(xe, condition, msg, arg...) ({ \ + const struct xe_device *__xe = (xe); \ + __xe_assert_msg(__xe, condition, \ + "platform: %d subplatform: %d\n" \ + "graphics: %s %u.%02u step %s\n" \ + "media: %s %u.%02u step %s\n" \ + msg, \ + __xe->info.platform, __xe->info.subplatform, \ + __xe->info.graphics_name, \ + __xe->info.graphics_verx100 / 100, \ + __xe->info.graphics_verx100 % 100, \ + xe_step_name(__xe->info.step.graphics), \ + __xe->info.media_name, \ + __xe->info.media_verx100 / 100, \ + __xe->info.media_verx100 % 100, \ + xe_step_name(__xe->info.step.media), \ + ## arg); \ +}) + +/** + * xe_tile_assert - warn if condition is false when debugging. + * @tile: the &struct xe_tile pointer to which &condition applies + * @condition: condition to check + * + * xe_tile_assert() uses &drm_WARN to emit a warning and print additional + * information that could be read from the &tile pointer if provided &condition + * is false. + * + * Contrary to &drm_WARN, xe_tile_assert() is effective only on debug builds + * (&CONFIG_DRM_XE_DEBUG must be enabled) and cannot be used in expressions + * or as a condition. + * + * See `Xe ASSERTs`_ for general usage guidelines. + */ +#define xe_tile_assert(tile, condition) xe_tile_assert_msg((tile), condition, "") +#define xe_tile_assert_msg(tile, condition, msg, arg...) ({ \ + const struct xe_tile *__tile = (tile); \ + char __buf[10] __maybe_unused; \ + xe_assert_msg(tile_to_xe(__tile), condition, "tile: %u VRAM %s\n" msg, \ + __tile->id, ({ string_get_size(__tile->mem.vram.actual_physical_size, 1, \ + STRING_UNITS_2, __buf, sizeof(__buf)); __buf; }), ## arg); \ +}) + +/** + * xe_gt_assert - warn if condition is false when debugging. 
+ * @gt: the &struct xe_gt pointer to which &condition applies + * @condition: condition to check + * + * xe_gt_assert() uses &drm_WARN to emit a warning and print additional + * information that could be safetely read from the > pointer if provided + * &condition is false. + * + * Contrary to &drm_WARN, xe_gt_assert() is effective only on debug builds + * (&CONFIG_DRM_XE_DEBUG must be enabled) and cannot be used in expressions + * or as a condition. + * + * See `Xe ASSERTs`_ for general usage guidelines. + */ +#define xe_gt_assert(gt, condition) xe_gt_assert_msg((gt), condition, "") +#define xe_gt_assert_msg(gt, condition, msg, arg...) ({ \ + const struct xe_gt *__gt = (gt); \ + xe_tile_assert_msg(gt_to_tile(__gt), condition, "GT: %u type %d\n" msg, \ + __gt->info.id, __gt->info.type, ## arg); \ +}) + +#endif diff --git a/drivers/gpu/drm/xe/xe_bb.c b/drivers/gpu/drm/xe/xe_bb.c new file mode 100644 index 000000000000..7c124475c428 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_bb.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_bb.h" + +#include "instructions/xe_mi_commands.h" +#include "regs/xe_gpu_commands.h" +#include "xe_device.h" +#include "xe_exec_queue_types.h" +#include "xe_gt.h" +#include "xe_hw_fence.h" +#include "xe_sa.h" +#include "xe_sched_job.h" +#include "xe_vm_types.h" + +static int bb_prefetch(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + + if (GRAPHICS_VERx100(xe) >= 1250 && !xe_gt_is_media_type(gt)) + /* + * RCS and CCS require 1K, although other engines would be + * okay with 512. + */ + return SZ_1K; + else + return SZ_512; +} + +struct xe_bb *xe_bb_new(struct xe_gt *gt, u32 dwords, bool usm) +{ + struct xe_tile *tile = gt_to_tile(gt); + struct xe_bb *bb = kmalloc(sizeof(*bb), GFP_KERNEL); + int err; + + if (!bb) + return ERR_PTR(-ENOMEM); + + /* + * We need to allocate space for the requested number of dwords, + * one additional MI_BATCH_BUFFER_END dword, and additional buffer + * space to accomodate the platform-specific hardware prefetch + * requirements. + */ + bb->bo = xe_sa_bo_new(!usm ? 
tile->mem.kernel_bb_pool : gt->usm.bb_pool, + 4 * (dwords + 1) + bb_prefetch(gt)); + if (IS_ERR(bb->bo)) { + err = PTR_ERR(bb->bo); + goto err; + } + + bb->cs = xe_sa_bo_cpu_addr(bb->bo); + bb->len = 0; + + return bb; +err: + kfree(bb); + return ERR_PTR(err); +} + +static struct xe_sched_job * +__xe_bb_create_job(struct xe_exec_queue *q, struct xe_bb *bb, u64 *addr) +{ + u32 size = drm_suballoc_size(bb->bo); + + bb->cs[bb->len++] = MI_BATCH_BUFFER_END; + + xe_gt_assert(q->gt, bb->len * 4 + bb_prefetch(q->gt) <= size); + + xe_sa_bo_flush_write(bb->bo); + + return xe_sched_job_create(q, addr); +} + +struct xe_sched_job *xe_bb_create_migration_job(struct xe_exec_queue *q, + struct xe_bb *bb, + u64 batch_base_ofs, + u32 second_idx) +{ + u64 addr[2] = { + batch_base_ofs + drm_suballoc_soffset(bb->bo), + batch_base_ofs + drm_suballoc_soffset(bb->bo) + + 4 * second_idx, + }; + + xe_gt_assert(q->gt, second_idx <= bb->len); + xe_gt_assert(q->gt, q->vm->flags & XE_VM_FLAG_MIGRATION); + + return __xe_bb_create_job(q, bb, addr); +} + +struct xe_sched_job *xe_bb_create_job(struct xe_exec_queue *q, + struct xe_bb *bb) +{ + u64 addr = xe_sa_bo_gpu_addr(bb->bo); + + xe_gt_assert(q->gt, !(q->vm && q->vm->flags & XE_VM_FLAG_MIGRATION)); + return __xe_bb_create_job(q, bb, &addr); +} + +void xe_bb_free(struct xe_bb *bb, struct dma_fence *fence) +{ + if (!bb) + return; + + xe_sa_bo_free(bb->bo, fence); + kfree(bb); +} diff --git a/drivers/gpu/drm/xe/xe_bb.h b/drivers/gpu/drm/xe/xe_bb.h new file mode 100644 index 000000000000..fafacd73dcc3 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_bb.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_BB_H_ +#define _XE_BB_H_ + +#include "xe_bb_types.h" + +struct dma_fence; + +struct xe_gt; +struct xe_exec_queue; +struct xe_sched_job; + +struct xe_bb *xe_bb_new(struct xe_gt *gt, u32 size, bool usm); +struct xe_sched_job *xe_bb_create_job(struct xe_exec_queue *q, + struct xe_bb *bb); +struct xe_sched_job *xe_bb_create_migration_job(struct xe_exec_queue *q, + struct xe_bb *bb, u64 batch_ofs, + u32 second_idx); +void xe_bb_free(struct xe_bb *bb, struct dma_fence *fence); + +#endif diff --git a/drivers/gpu/drm/xe/xe_bb_types.h b/drivers/gpu/drm/xe/xe_bb_types.h new file mode 100644 index 000000000000..b7d30308cf90 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_bb_types.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_BB_TYPES_H_ +#define _XE_BB_TYPES_H_ + +#include <linux/types.h> + +struct drm_suballoc; + +struct xe_bb { + struct drm_suballoc *bo; + + u32 *cs; + u32 len; /* in dwords */ +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c new file mode 100644 index 000000000000..8e4a3b1f6b93 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -0,0 +1,2269 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include "xe_bo.h" + +#include <linux/dma-buf.h> + +#include <drm/drm_drv.h> +#include <drm/drm_gem_ttm_helper.h> +#include <drm/drm_managed.h> +#include <drm/ttm/ttm_device.h> +#include <drm/ttm/ttm_placement.h> +#include <drm/ttm/ttm_tt.h> +#include <drm/xe_drm.h> + +#include "xe_device.h" +#include "xe_dma_buf.h" +#include "xe_drm_client.h" +#include "xe_ggtt.h" +#include "xe_gt.h" +#include "xe_map.h" +#include "xe_migrate.h" +#include "xe_preempt_fence.h" +#include "xe_res_cursor.h" +#include "xe_trace.h" +#include "xe_ttm_stolen_mgr.h" +#include "xe_vm.h" + +static const struct 
ttm_place sys_placement_flags = { + .fpfn = 0, + .lpfn = 0, + .mem_type = XE_PL_SYSTEM, + .flags = 0, +}; + +static struct ttm_placement sys_placement = { + .num_placement = 1, + .placement = &sys_placement_flags, + .num_busy_placement = 1, + .busy_placement = &sys_placement_flags, +}; + +static const struct ttm_place tt_placement_flags = { + .fpfn = 0, + .lpfn = 0, + .mem_type = XE_PL_TT, + .flags = 0, +}; + +static struct ttm_placement tt_placement = { + .num_placement = 1, + .placement = &tt_placement_flags, + .num_busy_placement = 1, + .busy_placement = &sys_placement_flags, +}; + +bool mem_type_is_vram(u32 mem_type) +{ + return mem_type >= XE_PL_VRAM0 && mem_type != XE_PL_STOLEN; +} + +static bool resource_is_stolen_vram(struct xe_device *xe, struct ttm_resource *res) +{ + return res->mem_type == XE_PL_STOLEN && IS_DGFX(xe); +} + +static bool resource_is_vram(struct ttm_resource *res) +{ + return mem_type_is_vram(res->mem_type); +} + +bool xe_bo_is_vram(struct xe_bo *bo) +{ + return resource_is_vram(bo->ttm.resource) || + resource_is_stolen_vram(xe_bo_device(bo), bo->ttm.resource); +} + +bool xe_bo_is_stolen(struct xe_bo *bo) +{ + return bo->ttm.resource->mem_type == XE_PL_STOLEN; +} + +/** + * xe_bo_is_stolen_devmem - check if BO is of stolen type accessed via PCI BAR + * @bo: The BO + * + * The stolen memory is accessed through the PCI BAR for both DGFX and some + * integrated platforms that have a dedicated bit in the PTE for devmem (DM). + * + * Returns: true if it's stolen memory accessed via PCI BAR, false otherwise. + */ +bool xe_bo_is_stolen_devmem(struct xe_bo *bo) +{ + return xe_bo_is_stolen(bo) && + GRAPHICS_VERx100(xe_bo_device(bo)) >= 1270; +} + +static bool xe_bo_is_user(struct xe_bo *bo) +{ + return bo->flags & XE_BO_CREATE_USER_BIT; +} + +static struct xe_migrate * +mem_type_to_migrate(struct xe_device *xe, u32 mem_type) +{ + struct xe_tile *tile; + + xe_assert(xe, mem_type == XE_PL_STOLEN || mem_type_is_vram(mem_type)); + tile = &xe->tiles[mem_type == XE_PL_STOLEN ? 
0 : (mem_type - XE_PL_VRAM0)]; + return tile->migrate; +} + +static struct xe_mem_region *res_to_mem_region(struct ttm_resource *res) +{ + struct xe_device *xe = ttm_to_xe_device(res->bo->bdev); + struct ttm_resource_manager *mgr; + + xe_assert(xe, resource_is_vram(res)); + mgr = ttm_manager_type(&xe->ttm, res->mem_type); + return to_xe_ttm_vram_mgr(mgr)->vram; +} + +static void try_add_system(struct xe_device *xe, struct xe_bo *bo, + u32 bo_flags, u32 *c) +{ + xe_assert(xe, *c < ARRAY_SIZE(bo->placements)); + + if (bo_flags & XE_BO_CREATE_SYSTEM_BIT) { + bo->placements[*c] = (struct ttm_place) { + .mem_type = XE_PL_TT, + }; + *c += 1; + + if (bo->props.preferred_mem_type == XE_BO_PROPS_INVALID) + bo->props.preferred_mem_type = XE_PL_TT; + } +} + +static void add_vram(struct xe_device *xe, struct xe_bo *bo, + struct ttm_place *places, u32 bo_flags, u32 mem_type, u32 *c) +{ + struct ttm_place place = { .mem_type = mem_type }; + struct xe_mem_region *vram; + u64 io_size; + + vram = to_xe_ttm_vram_mgr(ttm_manager_type(&xe->ttm, mem_type))->vram; + xe_assert(xe, vram && vram->usable_size); + io_size = vram->io_size; + + /* + * For eviction / restore on suspend / resume objects + * pinned in VRAM must be contiguous + */ + if (bo_flags & (XE_BO_CREATE_PINNED_BIT | + XE_BO_CREATE_GGTT_BIT)) + place.flags |= TTM_PL_FLAG_CONTIGUOUS; + + if (io_size < vram->usable_size) { + if (bo_flags & XE_BO_NEEDS_CPU_ACCESS) { + place.fpfn = 0; + place.lpfn = io_size >> PAGE_SHIFT; + } else { + place.flags |= TTM_PL_FLAG_TOPDOWN; + } + } + places[*c] = place; + *c += 1; + + if (bo->props.preferred_mem_type == XE_BO_PROPS_INVALID) + bo->props.preferred_mem_type = mem_type; +} + +static void try_add_vram(struct xe_device *xe, struct xe_bo *bo, + u32 bo_flags, u32 *c) +{ + xe_assert(xe, *c < ARRAY_SIZE(bo->placements)); + + if (bo->props.preferred_gt == XE_GT1) { + if (bo_flags & XE_BO_CREATE_VRAM1_BIT) + add_vram(xe, bo, bo->placements, bo_flags, XE_PL_VRAM1, c); + if (bo_flags & XE_BO_CREATE_VRAM0_BIT) + add_vram(xe, bo, bo->placements, bo_flags, XE_PL_VRAM0, c); + } else { + if (bo_flags & XE_BO_CREATE_VRAM0_BIT) + add_vram(xe, bo, bo->placements, bo_flags, XE_PL_VRAM0, c); + if (bo_flags & XE_BO_CREATE_VRAM1_BIT) + add_vram(xe, bo, bo->placements, bo_flags, XE_PL_VRAM1, c); + } +} + +static void try_add_stolen(struct xe_device *xe, struct xe_bo *bo, + u32 bo_flags, u32 *c) +{ + xe_assert(xe, *c < ARRAY_SIZE(bo->placements)); + + if (bo_flags & XE_BO_CREATE_STOLEN_BIT) { + bo->placements[*c] = (struct ttm_place) { + .mem_type = XE_PL_STOLEN, + .flags = bo_flags & (XE_BO_CREATE_PINNED_BIT | + XE_BO_CREATE_GGTT_BIT) ? 
+ TTM_PL_FLAG_CONTIGUOUS : 0, + }; + *c += 1; + } +} + +static int __xe_bo_placement_for_flags(struct xe_device *xe, struct xe_bo *bo, + u32 bo_flags) +{ + u32 c = 0; + + bo->props.preferred_mem_type = XE_BO_PROPS_INVALID; + + /* The order of placements should indicate preferred location */ + + if (bo->props.preferred_mem_class == DRM_XE_MEM_REGION_CLASS_SYSMEM) { + try_add_system(xe, bo, bo_flags, &c); + try_add_vram(xe, bo, bo_flags, &c); + } else { + try_add_vram(xe, bo, bo_flags, &c); + try_add_system(xe, bo, bo_flags, &c); + } + try_add_stolen(xe, bo, bo_flags, &c); + + if (!c) + return -EINVAL; + + bo->placement = (struct ttm_placement) { + .num_placement = c, + .placement = bo->placements, + .num_busy_placement = c, + .busy_placement = bo->placements, + }; + + return 0; +} + +int xe_bo_placement_for_flags(struct xe_device *xe, struct xe_bo *bo, + u32 bo_flags) +{ + xe_bo_assert_held(bo); + return __xe_bo_placement_for_flags(xe, bo, bo_flags); +} + +static void xe_evict_flags(struct ttm_buffer_object *tbo, + struct ttm_placement *placement) +{ + if (!xe_bo_is_xe_bo(tbo)) { + /* Don't handle scatter gather BOs */ + if (tbo->type == ttm_bo_type_sg) { + placement->num_placement = 0; + placement->num_busy_placement = 0; + return; + } + + *placement = sys_placement; + return; + } + + /* + * For xe, sg bos that are evicted to system just triggers a + * rebind of the sg list upon subsequent validation to XE_PL_TT. + */ + switch (tbo->resource->mem_type) { + case XE_PL_VRAM0: + case XE_PL_VRAM1: + case XE_PL_STOLEN: + *placement = tt_placement; + break; + case XE_PL_TT: + default: + *placement = sys_placement; + break; + } +} + +struct xe_ttm_tt { + struct ttm_tt ttm; + struct device *dev; + struct sg_table sgt; + struct sg_table *sg; +}; + +static int xe_tt_map_sg(struct ttm_tt *tt) +{ + struct xe_ttm_tt *xe_tt = container_of(tt, struct xe_ttm_tt, ttm); + unsigned long num_pages = tt->num_pages; + int ret; + + XE_WARN_ON(tt->page_flags & TTM_TT_FLAG_EXTERNAL); + + if (xe_tt->sg) + return 0; + + ret = sg_alloc_table_from_pages_segment(&xe_tt->sgt, tt->pages, + num_pages, 0, + (u64)num_pages << PAGE_SHIFT, + xe_sg_segment_size(xe_tt->dev), + GFP_KERNEL); + if (ret) + return ret; + + xe_tt->sg = &xe_tt->sgt; + ret = dma_map_sgtable(xe_tt->dev, xe_tt->sg, DMA_BIDIRECTIONAL, + DMA_ATTR_SKIP_CPU_SYNC); + if (ret) { + sg_free_table(xe_tt->sg); + xe_tt->sg = NULL; + return ret; + } + + return 0; +} + +struct sg_table *xe_bo_sg(struct xe_bo *bo) +{ + struct ttm_tt *tt = bo->ttm.ttm; + struct xe_ttm_tt *xe_tt = container_of(tt, struct xe_ttm_tt, ttm); + + return xe_tt->sg; +} + +static struct ttm_tt *xe_ttm_tt_create(struct ttm_buffer_object *ttm_bo, + u32 page_flags) +{ + struct xe_bo *bo = ttm_to_xe_bo(ttm_bo); + struct xe_device *xe = xe_bo_device(bo); + struct xe_ttm_tt *tt; + unsigned long extra_pages; + enum ttm_caching caching; + int err; + + tt = kzalloc(sizeof(*tt), GFP_KERNEL); + if (!tt) + return NULL; + + tt->dev = xe->drm.dev; + + extra_pages = 0; + if (xe_bo_needs_ccs_pages(bo)) + extra_pages = DIV_ROUND_UP(xe_device_ccs_bytes(xe, bo->size), + PAGE_SIZE); + + switch (bo->cpu_caching) { + case DRM_XE_GEM_CPU_CACHING_WC: + caching = ttm_write_combined; + break; + default: + caching = ttm_cached; + break; + } + + WARN_ON((bo->flags & XE_BO_CREATE_USER_BIT) && !bo->cpu_caching); + + /* + * Display scanout is always non-coherent with the CPU cache. + * + * For Xe_LPG and beyond, PPGTT PTE lookups are also non-coherent and + * require a CPU:WC mapping. 
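	 *
	 * Hence the override below: scanout BOs without an explicit
	 * cpu_caching choice, and page-table BOs on graphics IP 12.70 or
	 * newer, are forced to ttm_write_combined.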
+ */ + if ((!bo->cpu_caching && bo->flags & XE_BO_SCANOUT_BIT) || + (xe->info.graphics_verx100 >= 1270 && bo->flags & XE_BO_PAGETABLE)) + caching = ttm_write_combined; + + err = ttm_tt_init(&tt->ttm, &bo->ttm, page_flags, caching, extra_pages); + if (err) { + kfree(tt); + return NULL; + } + + return &tt->ttm; +} + +static int xe_ttm_tt_populate(struct ttm_device *ttm_dev, struct ttm_tt *tt, + struct ttm_operation_ctx *ctx) +{ + int err; + + /* + * dma-bufs are not populated with pages, and the dma- + * addresses are set up when moved to XE_PL_TT. + */ + if (tt->page_flags & TTM_TT_FLAG_EXTERNAL) + return 0; + + err = ttm_pool_alloc(&ttm_dev->pool, tt, ctx); + if (err) + return err; + + /* A follow up may move this xe_bo_move when BO is moved to XE_PL_TT */ + err = xe_tt_map_sg(tt); + if (err) + ttm_pool_free(&ttm_dev->pool, tt); + + return err; +} + +static void xe_ttm_tt_unpopulate(struct ttm_device *ttm_dev, struct ttm_tt *tt) +{ + struct xe_ttm_tt *xe_tt = container_of(tt, struct xe_ttm_tt, ttm); + + if (tt->page_flags & TTM_TT_FLAG_EXTERNAL) + return; + + if (xe_tt->sg) { + dma_unmap_sgtable(xe_tt->dev, xe_tt->sg, + DMA_BIDIRECTIONAL, 0); + sg_free_table(xe_tt->sg); + xe_tt->sg = NULL; + } + + return ttm_pool_free(&ttm_dev->pool, tt); +} + +static void xe_ttm_tt_destroy(struct ttm_device *ttm_dev, struct ttm_tt *tt) +{ + ttm_tt_fini(tt); + kfree(tt); +} + +static int xe_ttm_io_mem_reserve(struct ttm_device *bdev, + struct ttm_resource *mem) +{ + struct xe_device *xe = ttm_to_xe_device(bdev); + + switch (mem->mem_type) { + case XE_PL_SYSTEM: + case XE_PL_TT: + return 0; + case XE_PL_VRAM0: + case XE_PL_VRAM1: { + struct xe_ttm_vram_mgr_resource *vres = + to_xe_ttm_vram_mgr_resource(mem); + struct xe_mem_region *vram = res_to_mem_region(mem); + + if (vres->used_visible_size < mem->size) + return -EINVAL; + + mem->bus.offset = mem->start << PAGE_SHIFT; + + if (vram->mapping && + mem->placement & TTM_PL_FLAG_CONTIGUOUS) + mem->bus.addr = (u8 *)vram->mapping + + mem->bus.offset; + + mem->bus.offset += vram->io_start; + mem->bus.is_iomem = true; + +#if !defined(CONFIG_X86) + mem->bus.caching = ttm_write_combined; +#endif + return 0; + } case XE_PL_STOLEN: + return xe_ttm_stolen_io_mem_reserve(xe, mem); + default: + return -EINVAL; + } +} + +static int xe_bo_trigger_rebind(struct xe_device *xe, struct xe_bo *bo, + const struct ttm_operation_ctx *ctx) +{ + struct dma_resv_iter cursor; + struct dma_fence *fence; + struct drm_gem_object *obj = &bo->ttm.base; + struct drm_gpuvm_bo *vm_bo; + bool idle = false; + int ret = 0; + + dma_resv_assert_held(bo->ttm.base.resv); + + if (!list_empty(&bo->ttm.base.gpuva.list)) { + dma_resv_iter_begin(&cursor, bo->ttm.base.resv, + DMA_RESV_USAGE_BOOKKEEP); + dma_resv_for_each_fence_unlocked(&cursor, fence) + dma_fence_enable_sw_signaling(fence); + dma_resv_iter_end(&cursor); + } + + drm_gem_for_each_gpuvm_bo(vm_bo, obj) { + struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm); + struct drm_gpuva *gpuva; + + if (!xe_vm_in_fault_mode(vm)) { + drm_gpuvm_bo_evict(vm_bo, true); + continue; + } + + if (!idle) { + long timeout; + + if (ctx->no_wait_gpu && + !dma_resv_test_signaled(bo->ttm.base.resv, + DMA_RESV_USAGE_BOOKKEEP)) + return -EBUSY; + + timeout = dma_resv_wait_timeout(bo->ttm.base.resv, + DMA_RESV_USAGE_BOOKKEEP, + ctx->interruptible, + MAX_SCHEDULE_TIMEOUT); + if (!timeout) + return -ETIME; + if (timeout < 0) + return timeout; + + idle = true; + } + + drm_gpuvm_bo_for_each_va(gpuva, vm_bo) { + struct xe_vma *vma = gpuva_to_vma(gpuva); + + trace_xe_vma_evict(vma); + 
ret = xe_vm_invalidate_vma(vma); + if (XE_WARN_ON(ret)) + return ret; + } + } + + return ret; +} + +/* + * The dma-buf map_attachment() / unmap_attachment() is hooked up here. + * Note that unmapping the attachment is deferred to the next + * map_attachment time, or to bo destroy (after idling) whichever comes first. + * This is to avoid syncing before unmap_attachment(), assuming that the + * caller relies on idling the reservation object before moving the + * backing store out. Should that assumption not hold, then we will be able + * to unconditionally call unmap_attachment() when moving out to system. + */ +static int xe_bo_move_dmabuf(struct ttm_buffer_object *ttm_bo, + struct ttm_resource *new_res) +{ + struct dma_buf_attachment *attach = ttm_bo->base.import_attach; + struct xe_ttm_tt *xe_tt = container_of(ttm_bo->ttm, struct xe_ttm_tt, + ttm); + struct xe_device *xe = ttm_to_xe_device(ttm_bo->bdev); + struct sg_table *sg; + + xe_assert(xe, attach); + xe_assert(xe, ttm_bo->ttm); + + if (new_res->mem_type == XE_PL_SYSTEM) + goto out; + + if (ttm_bo->sg) { + dma_buf_unmap_attachment(attach, ttm_bo->sg, DMA_BIDIRECTIONAL); + ttm_bo->sg = NULL; + } + + sg = dma_buf_map_attachment(attach, DMA_BIDIRECTIONAL); + if (IS_ERR(sg)) + return PTR_ERR(sg); + + ttm_bo->sg = sg; + xe_tt->sg = sg; + +out: + ttm_bo_move_null(ttm_bo, new_res); + + return 0; +} + +/** + * xe_bo_move_notify - Notify subsystems of a pending move + * @bo: The buffer object + * @ctx: The struct ttm_operation_ctx controlling locking and waits. + * + * This function notifies subsystems of an upcoming buffer move. + * Upon receiving such a notification, subsystems should schedule + * halting access to the underlying pages and optionally add a fence + * to the buffer object's dma_resv object, that signals when access is + * stopped. The caller will wait on all dma_resv fences before + * starting the move. + * + * A subsystem may commence access to the object after obtaining + * bindings to the new backing memory under the object lock. + * + * Return: 0 on success, -EINTR or -ERESTARTSYS if interrupted in fault mode, + * negative error code on error. + */ +static int xe_bo_move_notify(struct xe_bo *bo, + const struct ttm_operation_ctx *ctx) +{ + struct ttm_buffer_object *ttm_bo = &bo->ttm; + struct xe_device *xe = ttm_to_xe_device(ttm_bo->bdev); + int ret; + + /* + * If this starts to call into many components, consider + * using a notification chain here. + */ + + if (xe_bo_is_pinned(bo)) + return -EINVAL; + + xe_bo_vunmap(bo); + ret = xe_bo_trigger_rebind(xe, bo, ctx); + if (ret) + return ret; + + /* Don't call move_notify() for imported dma-bufs. */ + if (ttm_bo->base.dma_buf && !ttm_bo->base.import_attach) + dma_buf_move_notify(ttm_bo->base.dma_buf); + + return 0; +} + +static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict, + struct ttm_operation_ctx *ctx, + struct ttm_resource *new_mem, + struct ttm_place *hop) +{ + struct xe_device *xe = ttm_to_xe_device(ttm_bo->bdev); + struct xe_bo *bo = ttm_to_xe_bo(ttm_bo); + struct ttm_resource *old_mem = ttm_bo->resource; + u32 old_mem_type = old_mem ? old_mem->mem_type : XE_PL_SYSTEM; + struct ttm_tt *ttm = ttm_bo->ttm; + struct xe_migrate *migrate = NULL; + struct dma_fence *fence; + bool move_lacks_source; + bool tt_has_data; + bool needs_clear; + bool handle_system_ccs = (!IS_DGFX(xe) && xe_bo_needs_ccs_pages(bo) && + ttm && ttm_tt_is_populated(ttm)) ? true : false; + int ret = 0; + /* Bo creation path, moving to system or TT. 
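+	 * A bo that has no resource yet but already owns a ttm_tt is on its
+	 * creation path and only needs a null move here, unless flat-CCS
+	 * metadata still has to be handled (handle_system_ccs).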
*/ + if ((!old_mem && ttm) && !handle_system_ccs) { + ttm_bo_move_null(ttm_bo, new_mem); + return 0; + } + + if (ttm_bo->type == ttm_bo_type_sg) { + ret = xe_bo_move_notify(bo, ctx); + if (!ret) + ret = xe_bo_move_dmabuf(ttm_bo, new_mem); + goto out; + } + + tt_has_data = ttm && (ttm_tt_is_populated(ttm) || + (ttm->page_flags & TTM_TT_FLAG_SWAPPED)); + + move_lacks_source = handle_system_ccs ? (!bo->ccs_cleared) : + (!mem_type_is_vram(old_mem_type) && !tt_has_data); + + needs_clear = (ttm && ttm->page_flags & TTM_TT_FLAG_ZERO_ALLOC) || + (!ttm && ttm_bo->type == ttm_bo_type_device); + + if ((move_lacks_source && !needs_clear)) { + ttm_bo_move_null(ttm_bo, new_mem); + goto out; + } + + if (old_mem_type == XE_PL_SYSTEM && new_mem->mem_type == XE_PL_TT && !handle_system_ccs) { + ttm_bo_move_null(ttm_bo, new_mem); + goto out; + } + + /* + * Failed multi-hop where the old_mem is still marked as + * TTM_PL_FLAG_TEMPORARY, should just be a dummy move. + */ + if (old_mem_type == XE_PL_TT && + new_mem->mem_type == XE_PL_TT) { + ttm_bo_move_null(ttm_bo, new_mem); + goto out; + } + + if (!move_lacks_source && !xe_bo_is_pinned(bo)) { + ret = xe_bo_move_notify(bo, ctx); + if (ret) + goto out; + } + + if (old_mem_type == XE_PL_TT && + new_mem->mem_type == XE_PL_SYSTEM) { + long timeout = dma_resv_wait_timeout(ttm_bo->base.resv, + DMA_RESV_USAGE_BOOKKEEP, + true, + MAX_SCHEDULE_TIMEOUT); + if (timeout < 0) { + ret = timeout; + goto out; + } + + if (!handle_system_ccs) { + ttm_bo_move_null(ttm_bo, new_mem); + goto out; + } + } + + if (!move_lacks_source && + ((old_mem_type == XE_PL_SYSTEM && resource_is_vram(new_mem)) || + (mem_type_is_vram(old_mem_type) && + new_mem->mem_type == XE_PL_SYSTEM))) { + hop->fpfn = 0; + hop->lpfn = 0; + hop->mem_type = XE_PL_TT; + hop->flags = TTM_PL_FLAG_TEMPORARY; + ret = -EMULTIHOP; + goto out; + } + + if (bo->tile) + migrate = bo->tile->migrate; + else if (resource_is_vram(new_mem)) + migrate = mem_type_to_migrate(xe, new_mem->mem_type); + else if (mem_type_is_vram(old_mem_type)) + migrate = mem_type_to_migrate(xe, old_mem_type); + else + migrate = xe->tiles[0].migrate; + + xe_assert(xe, migrate); + + trace_xe_bo_move(bo); + xe_device_mem_access_get(xe); + + if (xe_bo_is_pinned(bo) && !xe_bo_is_user(bo)) { + /* + * Kernel memory that is pinned should only be moved on suspend + * / resume, some of the pinned memory is required for the + * device to resume / use the GPU to move other evicted memory + * (user memory) around. This likely could be optimized a bit + * futher where we find the minimum set of pinned memory + * required for resume but for simplity doing a memcpy for all + * pinned memory. 
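+	 * The CPU copy below vmaps the bo, moves it with ttm_bo_move_memcpy()
+	 * and, when the destination is VRAM, re-creates the kernel vmap at the
+	 * new physical offset.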
+ */ + ret = xe_bo_vmap(bo); + if (!ret) { + ret = ttm_bo_move_memcpy(ttm_bo, ctx, new_mem); + + /* Create a new VMAP once kernel BO back in VRAM */ + if (!ret && resource_is_vram(new_mem)) { + struct xe_mem_region *vram = res_to_mem_region(new_mem); + void *new_addr = vram->mapping + + (new_mem->start << PAGE_SHIFT); + + if (XE_WARN_ON(new_mem->start == XE_BO_INVALID_OFFSET)) { + ret = -EINVAL; + xe_device_mem_access_put(xe); + goto out; + } + + xe_assert(xe, new_mem->start == + bo->placements->fpfn); + + iosys_map_set_vaddr_iomem(&bo->vmap, new_addr); + } + } + } else { + if (move_lacks_source) + fence = xe_migrate_clear(migrate, bo, new_mem); + else + fence = xe_migrate_copy(migrate, bo, bo, old_mem, + new_mem, handle_system_ccs); + if (IS_ERR(fence)) { + ret = PTR_ERR(fence); + xe_device_mem_access_put(xe); + goto out; + } + if (!move_lacks_source) { + ret = ttm_bo_move_accel_cleanup(ttm_bo, fence, evict, + true, new_mem); + if (ret) { + dma_fence_wait(fence, false); + ttm_bo_move_null(ttm_bo, new_mem); + ret = 0; + } + } else { + /* + * ttm_bo_move_accel_cleanup() may blow up if + * bo->resource == NULL, so just attach the + * fence and set the new resource. + */ + dma_resv_add_fence(ttm_bo->base.resv, fence, + DMA_RESV_USAGE_KERNEL); + ttm_bo_move_null(ttm_bo, new_mem); + } + + dma_fence_put(fence); + } + + xe_device_mem_access_put(xe); + +out: + return ret; + +} + +/** + * xe_bo_evict_pinned() - Evict a pinned VRAM object to system memory + * @bo: The buffer object to move. + * + * On successful completion, the object memory will be moved to sytem memory. + * This function blocks until the object has been fully moved. + * + * This is needed to for special handling of pinned VRAM object during + * suspend-resume. + * + * Return: 0 on success. Negative error code on failure. + */ +int xe_bo_evict_pinned(struct xe_bo *bo) +{ + struct ttm_place place = { + .mem_type = XE_PL_TT, + }; + struct ttm_placement placement = { + .placement = &place, + .num_placement = 1, + }; + struct ttm_operation_ctx ctx = { + .interruptible = false, + }; + struct ttm_resource *new_mem; + int ret; + + xe_bo_assert_held(bo); + + if (WARN_ON(!bo->ttm.resource)) + return -EINVAL; + + if (WARN_ON(!xe_bo_is_pinned(bo))) + return -EINVAL; + + if (WARN_ON(!xe_bo_is_vram(bo))) + return -EINVAL; + + ret = ttm_bo_mem_space(&bo->ttm, &placement, &new_mem, &ctx); + if (ret) + return ret; + + if (!bo->ttm.ttm) { + bo->ttm.ttm = xe_ttm_tt_create(&bo->ttm, 0); + if (!bo->ttm.ttm) { + ret = -ENOMEM; + goto err_res_free; + } + } + + ret = ttm_tt_populate(bo->ttm.bdev, bo->ttm.ttm, &ctx); + if (ret) + goto err_res_free; + + ret = dma_resv_reserve_fences(bo->ttm.base.resv, 1); + if (ret) + goto err_res_free; + + ret = xe_bo_move(&bo->ttm, false, &ctx, new_mem, NULL); + if (ret) + goto err_res_free; + + dma_resv_wait_timeout(bo->ttm.base.resv, DMA_RESV_USAGE_KERNEL, + false, MAX_SCHEDULE_TIMEOUT); + + return 0; + +err_res_free: + ttm_resource_free(&bo->ttm, &new_mem); + return ret; +} + +/** + * xe_bo_restore_pinned() - Restore a pinned VRAM object + * @bo: The buffer object to move. + * + * On successful completion, the object memory will be moved back to VRAM. + * This function blocks until the object has been fully moved. + * + * This is needed to for special handling of pinned VRAM object during + * suspend-resume. + * + * Return: 0 on success. Negative error code on failure. 
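+ *
+ * A minimal resume-path sketch (illustrative only, error handling omitted;
+ * assumes @bo was previously evicted with xe_bo_evict_pinned()):
+ *
+ *	xe_bo_lock(bo, false);
+ *	ret = xe_bo_restore_pinned(bo);
+ *	xe_bo_unlock(bo);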
+ */ +int xe_bo_restore_pinned(struct xe_bo *bo) +{ + struct ttm_operation_ctx ctx = { + .interruptible = false, + }; + struct ttm_resource *new_mem; + int ret; + + xe_bo_assert_held(bo); + + if (WARN_ON(!bo->ttm.resource)) + return -EINVAL; + + if (WARN_ON(!xe_bo_is_pinned(bo))) + return -EINVAL; + + if (WARN_ON(xe_bo_is_vram(bo) || !bo->ttm.ttm)) + return -EINVAL; + + ret = ttm_bo_mem_space(&bo->ttm, &bo->placement, &new_mem, &ctx); + if (ret) + return ret; + + ret = ttm_tt_populate(bo->ttm.bdev, bo->ttm.ttm, &ctx); + if (ret) + goto err_res_free; + + ret = dma_resv_reserve_fences(bo->ttm.base.resv, 1); + if (ret) + goto err_res_free; + + ret = xe_bo_move(&bo->ttm, false, &ctx, new_mem, NULL); + if (ret) + goto err_res_free; + + dma_resv_wait_timeout(bo->ttm.base.resv, DMA_RESV_USAGE_KERNEL, + false, MAX_SCHEDULE_TIMEOUT); + + return 0; + +err_res_free: + ttm_resource_free(&bo->ttm, &new_mem); + return ret; +} + +static unsigned long xe_ttm_io_mem_pfn(struct ttm_buffer_object *ttm_bo, + unsigned long page_offset) +{ + struct xe_bo *bo = ttm_to_xe_bo(ttm_bo); + struct xe_res_cursor cursor; + struct xe_mem_region *vram; + + if (ttm_bo->resource->mem_type == XE_PL_STOLEN) + return xe_ttm_stolen_io_offset(bo, page_offset << PAGE_SHIFT) >> PAGE_SHIFT; + + vram = res_to_mem_region(ttm_bo->resource); + xe_res_first(ttm_bo->resource, (u64)page_offset << PAGE_SHIFT, 0, &cursor); + return (vram->io_start + cursor.start) >> PAGE_SHIFT; +} + +static void __xe_bo_vunmap(struct xe_bo *bo); + +/* + * TODO: Move this function to TTM so we don't rely on how TTM does its + * locking, thereby abusing TTM internals. + */ +static bool xe_ttm_bo_lock_in_destructor(struct ttm_buffer_object *ttm_bo) +{ + struct xe_device *xe = ttm_to_xe_device(ttm_bo->bdev); + bool locked; + + xe_assert(xe, !kref_read(&ttm_bo->kref)); + + /* + * We can typically only race with TTM trylocking under the + * lru_lock, which will immediately be unlocked again since + * the ttm_bo refcount is zero at this point. So trylocking *should* + * always succeed here, as long as we hold the lru lock. + */ + spin_lock(&ttm_bo->bdev->lru_lock); + locked = dma_resv_trylock(ttm_bo->base.resv); + spin_unlock(&ttm_bo->bdev->lru_lock); + xe_assert(xe, locked); + + return locked; +} + +static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) +{ + struct dma_resv_iter cursor; + struct dma_fence *fence; + struct dma_fence *replacement = NULL; + struct xe_bo *bo; + + if (!xe_bo_is_xe_bo(ttm_bo)) + return; + + bo = ttm_to_xe_bo(ttm_bo); + xe_assert(xe_bo_device(bo), !(bo->created && kref_read(&ttm_bo->base.refcount))); + + /* + * Corner case where TTM fails to allocate memory and this BOs resv + * still points the VMs resv + */ + if (ttm_bo->base.resv != &ttm_bo->base._resv) + return; + + if (!xe_ttm_bo_lock_in_destructor(ttm_bo)) + return; + + /* + * Scrub the preempt fences if any. The unbind fence is already + * attached to the resv. + * TODO: Don't do this for external bos once we scrub them after + * unbind. 
+ */ + dma_resv_for_each_fence(&cursor, ttm_bo->base.resv, + DMA_RESV_USAGE_BOOKKEEP, fence) { + if (xe_fence_is_xe_preempt(fence) && + !dma_fence_is_signaled(fence)) { + if (!replacement) + replacement = dma_fence_get_stub(); + + dma_resv_replace_fences(ttm_bo->base.resv, + fence->context, + replacement, + DMA_RESV_USAGE_BOOKKEEP); + } + } + dma_fence_put(replacement); + + dma_resv_unlock(ttm_bo->base.resv); +} + +static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object *ttm_bo) +{ + if (!xe_bo_is_xe_bo(ttm_bo)) + return; + + /* + * Object is idle and about to be destroyed. Release the + * dma-buf attachment. + */ + if (ttm_bo->type == ttm_bo_type_sg && ttm_bo->sg) { + struct xe_ttm_tt *xe_tt = container_of(ttm_bo->ttm, + struct xe_ttm_tt, ttm); + + dma_buf_unmap_attachment(ttm_bo->base.import_attach, ttm_bo->sg, + DMA_BIDIRECTIONAL); + ttm_bo->sg = NULL; + xe_tt->sg = NULL; + } +} + +struct ttm_device_funcs xe_ttm_funcs = { + .ttm_tt_create = xe_ttm_tt_create, + .ttm_tt_populate = xe_ttm_tt_populate, + .ttm_tt_unpopulate = xe_ttm_tt_unpopulate, + .ttm_tt_destroy = xe_ttm_tt_destroy, + .evict_flags = xe_evict_flags, + .move = xe_bo_move, + .io_mem_reserve = xe_ttm_io_mem_reserve, + .io_mem_pfn = xe_ttm_io_mem_pfn, + .release_notify = xe_ttm_bo_release_notify, + .eviction_valuable = ttm_bo_eviction_valuable, + .delete_mem_notify = xe_ttm_bo_delete_mem_notify, +}; + +static void xe_ttm_bo_destroy(struct ttm_buffer_object *ttm_bo) +{ + struct xe_bo *bo = ttm_to_xe_bo(ttm_bo); + struct xe_device *xe = ttm_to_xe_device(ttm_bo->bdev); + + if (bo->ttm.base.import_attach) + drm_prime_gem_destroy(&bo->ttm.base, NULL); + drm_gem_object_release(&bo->ttm.base); + + xe_assert(xe, list_empty(&ttm_bo->base.gpuva.list)); + + if (bo->ggtt_node.size) + xe_ggtt_remove_bo(bo->tile->mem.ggtt, bo); + +#ifdef CONFIG_PROC_FS + if (bo->client) + xe_drm_client_remove_bo(bo); +#endif + + if (bo->vm && xe_bo_is_user(bo)) + xe_vm_put(bo->vm); + + kfree(bo); +} + +static void xe_gem_object_free(struct drm_gem_object *obj) +{ + /* Our BO reference counting scheme works as follows: + * + * The gem object kref is typically used throughout the driver, + * and the gem object holds a ttm_buffer_object refcount, so + * that when the last gem object reference is put, which is when + * we end up in this function, we put also that ttm_buffer_object + * refcount. Anything using gem interfaces is then no longer + * allowed to access the object in a way that requires a gem + * refcount, including locking the object. + * + * driver ttm callbacks is allowed to use the ttm_buffer_object + * refcount directly if needed. 
+ */ + __xe_bo_vunmap(gem_to_xe_bo(obj)); + ttm_bo_put(container_of(obj, struct ttm_buffer_object, base)); +} + +static void xe_gem_object_close(struct drm_gem_object *obj, + struct drm_file *file_priv) +{ + struct xe_bo *bo = gem_to_xe_bo(obj); + + if (bo->vm && !xe_vm_in_fault_mode(bo->vm)) { + xe_assert(xe_bo_device(bo), xe_bo_is_user(bo)); + + xe_bo_lock(bo, false); + ttm_bo_set_bulk_move(&bo->ttm, NULL); + xe_bo_unlock(bo); + } +} + +static bool should_migrate_to_system(struct xe_bo *bo) +{ + struct xe_device *xe = xe_bo_device(bo); + + return xe_device_in_fault_mode(xe) && bo->props.cpu_atomic; +} + +static vm_fault_t xe_gem_fault(struct vm_fault *vmf) +{ + struct ttm_buffer_object *tbo = vmf->vma->vm_private_data; + struct drm_device *ddev = tbo->base.dev; + vm_fault_t ret; + int idx, r = 0; + + ret = ttm_bo_vm_reserve(tbo, vmf); + if (ret) + return ret; + + if (drm_dev_enter(ddev, &idx)) { + struct xe_bo *bo = ttm_to_xe_bo(tbo); + + trace_xe_bo_cpu_fault(bo); + + if (should_migrate_to_system(bo)) { + r = xe_bo_migrate(bo, XE_PL_TT); + if (r == -EBUSY || r == -ERESTARTSYS || r == -EINTR) + ret = VM_FAULT_NOPAGE; + else if (r) + ret = VM_FAULT_SIGBUS; + } + if (!ret) + ret = ttm_bo_vm_fault_reserved(vmf, + vmf->vma->vm_page_prot, + TTM_BO_VM_NUM_PREFAULT); + drm_dev_exit(idx); + } else { + ret = ttm_bo_vm_dummy_page(vmf, vmf->vma->vm_page_prot); + } + if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) + return ret; + + dma_resv_unlock(tbo->base.resv); + return ret; +} + +static const struct vm_operations_struct xe_gem_vm_ops = { + .fault = xe_gem_fault, + .open = ttm_bo_vm_open, + .close = ttm_bo_vm_close, + .access = ttm_bo_vm_access +}; + +static const struct drm_gem_object_funcs xe_gem_object_funcs = { + .free = xe_gem_object_free, + .close = xe_gem_object_close, + .mmap = drm_gem_ttm_mmap, + .export = xe_gem_prime_export, + .vm_ops = &xe_gem_vm_ops, +}; + +/** + * xe_bo_alloc - Allocate storage for a struct xe_bo + * + * This funcition is intended to allocate storage to be used for input + * to __xe_bo_create_locked(), in the case a pointer to the bo to be + * created is needed before the call to __xe_bo_create_locked(). + * If __xe_bo_create_locked ends up never to be called, then the + * storage allocated with this function needs to be freed using + * xe_bo_free(). + * + * Return: A pointer to an uninitialized struct xe_bo on success, + * ERR_PTR(-ENOMEM) on error. + */ +struct xe_bo *xe_bo_alloc(void) +{ + struct xe_bo *bo = kzalloc(sizeof(*bo), GFP_KERNEL); + + if (!bo) + return ERR_PTR(-ENOMEM); + + return bo; +} + +/** + * xe_bo_free - Free storage allocated using xe_bo_alloc() + * @bo: The buffer object storage. + * + * Refer to xe_bo_alloc() documentation for valid use-cases. 
+ */ +void xe_bo_free(struct xe_bo *bo) +{ + kfree(bo); +} + +struct xe_bo *___xe_bo_create_locked(struct xe_device *xe, struct xe_bo *bo, + struct xe_tile *tile, struct dma_resv *resv, + struct ttm_lru_bulk_move *bulk, size_t size, + u16 cpu_caching, enum ttm_bo_type type, + u32 flags) +{ + struct ttm_operation_ctx ctx = { + .interruptible = true, + .no_wait_gpu = false, + }; + struct ttm_placement *placement; + uint32_t alignment; + size_t aligned_size; + int err; + + /* Only kernel objects should set GT */ + xe_assert(xe, !tile || type == ttm_bo_type_kernel); + + if (XE_WARN_ON(!size)) { + xe_bo_free(bo); + return ERR_PTR(-EINVAL); + } + + if (flags & (XE_BO_CREATE_VRAM_MASK | XE_BO_CREATE_STOLEN_BIT) && + !(flags & XE_BO_CREATE_IGNORE_MIN_PAGE_SIZE_BIT) && + xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K) { + aligned_size = ALIGN(size, SZ_64K); + if (type != ttm_bo_type_device) + size = ALIGN(size, SZ_64K); + flags |= XE_BO_INTERNAL_64K; + alignment = SZ_64K >> PAGE_SHIFT; + + } else { + aligned_size = ALIGN(size, SZ_4K); + flags &= ~XE_BO_INTERNAL_64K; + alignment = SZ_4K >> PAGE_SHIFT; + } + + if (type == ttm_bo_type_device && aligned_size != size) + return ERR_PTR(-EINVAL); + + if (!bo) { + bo = xe_bo_alloc(); + if (IS_ERR(bo)) + return bo; + } + + bo->ccs_cleared = false; + bo->tile = tile; + bo->size = size; + bo->flags = flags; + bo->cpu_caching = cpu_caching; + bo->ttm.base.funcs = &xe_gem_object_funcs; + bo->props.preferred_mem_class = XE_BO_PROPS_INVALID; + bo->props.preferred_gt = XE_BO_PROPS_INVALID; + bo->props.preferred_mem_type = XE_BO_PROPS_INVALID; + bo->ttm.priority = XE_BO_PRIORITY_NORMAL; + INIT_LIST_HEAD(&bo->pinned_link); +#ifdef CONFIG_PROC_FS + INIT_LIST_HEAD(&bo->client_link); +#endif + + drm_gem_private_object_init(&xe->drm, &bo->ttm.base, size); + + if (resv) { + ctx.allow_res_evict = !(flags & XE_BO_CREATE_NO_RESV_EVICT); + ctx.resv = resv; + } + + if (!(flags & XE_BO_FIXED_PLACEMENT_BIT)) { + err = __xe_bo_placement_for_flags(xe, bo, bo->flags); + if (WARN_ON(err)) { + xe_ttm_bo_destroy(&bo->ttm); + return ERR_PTR(err); + } + } + + /* Defer populating type_sg bos */ + placement = (type == ttm_bo_type_sg || + bo->flags & XE_BO_DEFER_BACKING) ? &sys_placement : + &bo->placement; + err = ttm_bo_init_reserved(&xe->ttm, &bo->ttm, type, + placement, alignment, + &ctx, NULL, resv, xe_ttm_bo_destroy); + if (err) + return ERR_PTR(err); + + /* + * The VRAM pages underneath are potentially still being accessed by the + * GPU, as per async GPU clearing and async evictions. However TTM makes + * sure to add any corresponding move/clear fences into the objects + * dma-resv using the DMA_RESV_USAGE_KERNEL slot. + * + * For KMD internal buffers we don't care about GPU clearing, however we + * still need to handle async evictions, where the VRAM is still being + * accessed by the GPU. Most internal callers are not expecting this, + * since they are missing the required synchronisation before accessing + * the memory. To keep things simple just sync wait any kernel fences + * here, if the buffer is designated KMD internal. + * + * For normal userspace objects we should already have the required + * pipelining or sync waiting elsewhere, since we already have to deal + * with things like async GPU clearing. 
+ */ + if (type == ttm_bo_type_kernel) { + long timeout = dma_resv_wait_timeout(bo->ttm.base.resv, + DMA_RESV_USAGE_KERNEL, + ctx.interruptible, + MAX_SCHEDULE_TIMEOUT); + + if (timeout < 0) { + if (!resv) + dma_resv_unlock(bo->ttm.base.resv); + xe_bo_put(bo); + return ERR_PTR(timeout); + } + } + + bo->created = true; + if (bulk) + ttm_bo_set_bulk_move(&bo->ttm, bulk); + else + ttm_bo_move_to_lru_tail_unlocked(&bo->ttm); + + return bo; +} + +static int __xe_bo_fixed_placement(struct xe_device *xe, + struct xe_bo *bo, + u32 flags, + u64 start, u64 end, u64 size) +{ + struct ttm_place *place = bo->placements; + + if (flags & (XE_BO_CREATE_USER_BIT|XE_BO_CREATE_SYSTEM_BIT)) + return -EINVAL; + + place->flags = TTM_PL_FLAG_CONTIGUOUS; + place->fpfn = start >> PAGE_SHIFT; + place->lpfn = end >> PAGE_SHIFT; + + switch (flags & (XE_BO_CREATE_STOLEN_BIT | XE_BO_CREATE_VRAM_MASK)) { + case XE_BO_CREATE_VRAM0_BIT: + place->mem_type = XE_PL_VRAM0; + break; + case XE_BO_CREATE_VRAM1_BIT: + place->mem_type = XE_PL_VRAM1; + break; + case XE_BO_CREATE_STOLEN_BIT: + place->mem_type = XE_PL_STOLEN; + break; + + default: + /* 0 or multiple of the above set */ + return -EINVAL; + } + + bo->placement = (struct ttm_placement) { + .num_placement = 1, + .placement = place, + .num_busy_placement = 1, + .busy_placement = place, + }; + + return 0; +} + +static struct xe_bo * +__xe_bo_create_locked(struct xe_device *xe, + struct xe_tile *tile, struct xe_vm *vm, + size_t size, u64 start, u64 end, + u16 cpu_caching, enum ttm_bo_type type, u32 flags) +{ + struct xe_bo *bo = NULL; + int err; + + if (vm) + xe_vm_assert_held(vm); + + if (start || end != ~0ULL) { + bo = xe_bo_alloc(); + if (IS_ERR(bo)) + return bo; + + flags |= XE_BO_FIXED_PLACEMENT_BIT; + err = __xe_bo_fixed_placement(xe, bo, flags, start, end, size); + if (err) { + xe_bo_free(bo); + return ERR_PTR(err); + } + } + + bo = ___xe_bo_create_locked(xe, bo, tile, vm ? xe_vm_resv(vm) : NULL, + vm && !xe_vm_in_fault_mode(vm) && + flags & XE_BO_CREATE_USER_BIT ? + &vm->lru_bulk_move : NULL, size, + cpu_caching, type, flags); + if (IS_ERR(bo)) + return bo; + + /* + * Note that instead of taking a reference no the drm_gpuvm_resv_bo(), + * to ensure the shared resv doesn't disappear under the bo, the bo + * will keep a reference to the vm, and avoid circular references + * by having all the vm's bo refereferences released at vm close + * time. 
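+	 * The xe_vm_get() below is balanced by the xe_vm_put() in
+	 * xe_ttm_bo_destroy().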
+ */ + if (vm && xe_bo_is_user(bo)) + xe_vm_get(vm); + bo->vm = vm; + + if (bo->flags & XE_BO_CREATE_GGTT_BIT) { + if (!tile && flags & XE_BO_CREATE_STOLEN_BIT) + tile = xe_device_get_root_tile(xe); + + xe_assert(xe, tile); + + if (flags & XE_BO_FIXED_PLACEMENT_BIT) { + err = xe_ggtt_insert_bo_at(tile->mem.ggtt, bo, + start + bo->size, U64_MAX); + } else { + err = xe_ggtt_insert_bo(tile->mem.ggtt, bo); + } + if (err) + goto err_unlock_put_bo; + } + + return bo; + +err_unlock_put_bo: + __xe_bo_unset_bulk_move(bo); + xe_bo_unlock_vm_held(bo); + xe_bo_put(bo); + return ERR_PTR(err); +} + +struct xe_bo * +xe_bo_create_locked_range(struct xe_device *xe, + struct xe_tile *tile, struct xe_vm *vm, + size_t size, u64 start, u64 end, + enum ttm_bo_type type, u32 flags) +{ + return __xe_bo_create_locked(xe, tile, vm, size, start, end, 0, type, flags); +} + +struct xe_bo *xe_bo_create_locked(struct xe_device *xe, struct xe_tile *tile, + struct xe_vm *vm, size_t size, + enum ttm_bo_type type, u32 flags) +{ + return __xe_bo_create_locked(xe, tile, vm, size, 0, ~0ULL, 0, type, flags); +} + +struct xe_bo *xe_bo_create_user(struct xe_device *xe, struct xe_tile *tile, + struct xe_vm *vm, size_t size, + u16 cpu_caching, + enum ttm_bo_type type, + u32 flags) +{ + struct xe_bo *bo = __xe_bo_create_locked(xe, tile, vm, size, 0, ~0ULL, + cpu_caching, type, + flags | XE_BO_CREATE_USER_BIT); + if (!IS_ERR(bo)) + xe_bo_unlock_vm_held(bo); + + return bo; +} + +struct xe_bo *xe_bo_create(struct xe_device *xe, struct xe_tile *tile, + struct xe_vm *vm, size_t size, + enum ttm_bo_type type, u32 flags) +{ + struct xe_bo *bo = xe_bo_create_locked(xe, tile, vm, size, type, flags); + + if (!IS_ERR(bo)) + xe_bo_unlock_vm_held(bo); + + return bo; +} + +struct xe_bo *xe_bo_create_pin_map_at(struct xe_device *xe, struct xe_tile *tile, + struct xe_vm *vm, + size_t size, u64 offset, + enum ttm_bo_type type, u32 flags) +{ + struct xe_bo *bo; + int err; + u64 start = offset == ~0ull ? 0 : offset; + u64 end = offset == ~0ull ? 
offset : start + size; + + if (flags & XE_BO_CREATE_STOLEN_BIT && + xe_ttm_stolen_cpu_access_needs_ggtt(xe)) + flags |= XE_BO_CREATE_GGTT_BIT; + + bo = xe_bo_create_locked_range(xe, tile, vm, size, start, end, type, + flags | XE_BO_NEEDS_CPU_ACCESS); + if (IS_ERR(bo)) + return bo; + + err = xe_bo_pin(bo); + if (err) + goto err_put; + + err = xe_bo_vmap(bo); + if (err) + goto err_unpin; + + xe_bo_unlock_vm_held(bo); + + return bo; + +err_unpin: + xe_bo_unpin(bo); +err_put: + xe_bo_unlock_vm_held(bo); + xe_bo_put(bo); + return ERR_PTR(err); +} + +struct xe_bo *xe_bo_create_pin_map(struct xe_device *xe, struct xe_tile *tile, + struct xe_vm *vm, size_t size, + enum ttm_bo_type type, u32 flags) +{ + return xe_bo_create_pin_map_at(xe, tile, vm, size, ~0ull, type, flags); +} + +struct xe_bo *xe_bo_create_from_data(struct xe_device *xe, struct xe_tile *tile, + const void *data, size_t size, + enum ttm_bo_type type, u32 flags) +{ + struct xe_bo *bo = xe_bo_create_pin_map(xe, tile, NULL, + ALIGN(size, PAGE_SIZE), + type, flags); + if (IS_ERR(bo)) + return bo; + + xe_map_memcpy_to(xe, &bo->vmap, 0, data, size); + + return bo; +} + +static void __xe_bo_unpin_map_no_vm(struct drm_device *drm, void *arg) +{ + xe_bo_unpin_map_no_vm(arg); +} + +struct xe_bo *xe_managed_bo_create_pin_map(struct xe_device *xe, struct xe_tile *tile, + size_t size, u32 flags) +{ + struct xe_bo *bo; + int ret; + + bo = xe_bo_create_pin_map(xe, tile, NULL, size, ttm_bo_type_kernel, flags); + if (IS_ERR(bo)) + return bo; + + ret = drmm_add_action_or_reset(&xe->drm, __xe_bo_unpin_map_no_vm, bo); + if (ret) + return ERR_PTR(ret); + + return bo; +} + +struct xe_bo *xe_managed_bo_create_from_data(struct xe_device *xe, struct xe_tile *tile, + const void *data, size_t size, u32 flags) +{ + struct xe_bo *bo = xe_managed_bo_create_pin_map(xe, tile, ALIGN(size, PAGE_SIZE), flags); + + if (IS_ERR(bo)) + return bo; + + xe_map_memcpy_to(xe, &bo->vmap, 0, data, size); + + return bo; +} + +/* + * XXX: This is in the VM bind data path, likely should calculate this once and + * store, with a recalculation if the BO is moved. + */ +uint64_t vram_region_gpu_offset(struct ttm_resource *res) +{ + struct xe_device *xe = ttm_to_xe_device(res->bo->bdev); + + if (res->mem_type == XE_PL_STOLEN) + return xe_ttm_stolen_gpu_offset(xe); + + return res_to_mem_region(res)->dpa_base; +} + +/** + * xe_bo_pin_external - pin an external BO + * @bo: buffer object to be pinned + * + * Pin an external (not tied to a VM, can be exported via dma-buf / prime FD) + * BO. Unique call compared to xe_bo_pin as this function has it own set of + * asserts and code to ensure evict / restore on suspend / resume. + * + * Returns 0 for success, negative error code otherwise. + */ +int xe_bo_pin_external(struct xe_bo *bo) +{ + struct xe_device *xe = xe_bo_device(bo); + int err; + + xe_assert(xe, !bo->vm); + xe_assert(xe, xe_bo_is_user(bo)); + + if (!xe_bo_is_pinned(bo)) { + err = xe_bo_validate(bo, NULL, false); + if (err) + return err; + + if (xe_bo_is_vram(bo)) { + spin_lock(&xe->pinned.lock); + list_add_tail(&bo->pinned_link, + &xe->pinned.external_vram); + spin_unlock(&xe->pinned.lock); + } + } + + ttm_bo_pin(&bo->ttm); + + /* + * FIXME: If we always use the reserve / unreserve functions for locking + * we do not need this. 
+ */ + ttm_bo_move_to_lru_tail_unlocked(&bo->ttm); + + return 0; +} + +int xe_bo_pin(struct xe_bo *bo) +{ + struct xe_device *xe = xe_bo_device(bo); + int err; + + /* We currently don't expect user BO to be pinned */ + xe_assert(xe, !xe_bo_is_user(bo)); + + /* Pinned object must be in GGTT or have pinned flag */ + xe_assert(xe, bo->flags & (XE_BO_CREATE_PINNED_BIT | + XE_BO_CREATE_GGTT_BIT)); + + /* + * No reason we can't support pinning imported dma-bufs we just don't + * expect to pin an imported dma-buf. + */ + xe_assert(xe, !bo->ttm.base.import_attach); + + /* We only expect at most 1 pin */ + xe_assert(xe, !xe_bo_is_pinned(bo)); + + err = xe_bo_validate(bo, NULL, false); + if (err) + return err; + + /* + * For pinned objects in on DGFX, which are also in vram, we expect + * these to be in contiguous VRAM memory. Required eviction / restore + * during suspend / resume (force restore to same physical address). + */ + if (IS_DGFX(xe) && !(IS_ENABLED(CONFIG_DRM_XE_DEBUG) && + bo->flags & XE_BO_INTERNAL_TEST)) { + struct ttm_place *place = &(bo->placements[0]); + + if (mem_type_is_vram(place->mem_type)) { + xe_assert(xe, place->flags & TTM_PL_FLAG_CONTIGUOUS); + + place->fpfn = (xe_bo_addr(bo, 0, PAGE_SIZE) - + vram_region_gpu_offset(bo->ttm.resource)) >> PAGE_SHIFT; + place->lpfn = place->fpfn + (bo->size >> PAGE_SHIFT); + + spin_lock(&xe->pinned.lock); + list_add_tail(&bo->pinned_link, &xe->pinned.kernel_bo_present); + spin_unlock(&xe->pinned.lock); + } + } + + ttm_bo_pin(&bo->ttm); + + /* + * FIXME: If we always use the reserve / unreserve functions for locking + * we do not need this. + */ + ttm_bo_move_to_lru_tail_unlocked(&bo->ttm); + + return 0; +} + +/** + * xe_bo_unpin_external - unpin an external BO + * @bo: buffer object to be unpinned + * + * Unpin an external (not tied to a VM, can be exported via dma-buf / prime FD) + * BO. Unique call compared to xe_bo_unpin as this function has it own set of + * asserts and code to ensure evict / restore on suspend / resume. + * + * Returns 0 for success, negative error code otherwise. + */ +void xe_bo_unpin_external(struct xe_bo *bo) +{ + struct xe_device *xe = xe_bo_device(bo); + + xe_assert(xe, !bo->vm); + xe_assert(xe, xe_bo_is_pinned(bo)); + xe_assert(xe, xe_bo_is_user(bo)); + + if (bo->ttm.pin_count == 1 && !list_empty(&bo->pinned_link)) { + spin_lock(&xe->pinned.lock); + list_del_init(&bo->pinned_link); + spin_unlock(&xe->pinned.lock); + } + + ttm_bo_unpin(&bo->ttm); + + /* + * FIXME: If we always use the reserve / unreserve functions for locking + * we do not need this. + */ + ttm_bo_move_to_lru_tail_unlocked(&bo->ttm); +} + +void xe_bo_unpin(struct xe_bo *bo) +{ + struct xe_device *xe = xe_bo_device(bo); + + xe_assert(xe, !bo->ttm.base.import_attach); + xe_assert(xe, xe_bo_is_pinned(bo)); + + if (IS_DGFX(xe) && !(IS_ENABLED(CONFIG_DRM_XE_DEBUG) && + bo->flags & XE_BO_INTERNAL_TEST)) { + struct ttm_place *place = &(bo->placements[0]); + + if (mem_type_is_vram(place->mem_type)) { + xe_assert(xe, !list_empty(&bo->pinned_link)); + + spin_lock(&xe->pinned.lock); + list_del_init(&bo->pinned_link); + spin_unlock(&xe->pinned.lock); + } + } + + ttm_bo_unpin(&bo->ttm); +} + +/** + * xe_bo_validate() - Make sure the bo is in an allowed placement + * @bo: The bo, + * @vm: Pointer to a the vm the bo shares a locked dma_resv object with, or + * NULL. Used together with @allow_res_evict. + * @allow_res_evict: Whether it's allowed to evict bos sharing @vm's + * reservation object. 
+ * + * Make sure the bo is in allowed placement, migrating it if necessary. If + * needed, other bos will be evicted. If bos selected for eviction shares + * the @vm's reservation object, they can be evicted iff @allow_res_evict is + * set to true, otherwise they will be bypassed. + * + * Return: 0 on success, negative error code on failure. May return + * -EINTR or -ERESTARTSYS if internal waits are interrupted by a signal. + */ +int xe_bo_validate(struct xe_bo *bo, struct xe_vm *vm, bool allow_res_evict) +{ + struct ttm_operation_ctx ctx = { + .interruptible = true, + .no_wait_gpu = false, + }; + + if (vm) { + lockdep_assert_held(&vm->lock); + xe_vm_assert_held(vm); + + ctx.allow_res_evict = allow_res_evict; + ctx.resv = xe_vm_resv(vm); + } + + return ttm_bo_validate(&bo->ttm, &bo->placement, &ctx); +} + +bool xe_bo_is_xe_bo(struct ttm_buffer_object *bo) +{ + if (bo->destroy == &xe_ttm_bo_destroy) + return true; + + return false; +} + +/* + * Resolve a BO address. There is no assert to check if the proper lock is held + * so it should only be used in cases where it is not fatal to get the wrong + * address, such as printing debug information, but not in cases where memory is + * written based on this result. + */ +dma_addr_t __xe_bo_addr(struct xe_bo *bo, u64 offset, size_t page_size) +{ + struct xe_device *xe = xe_bo_device(bo); + struct xe_res_cursor cur; + u64 page; + + xe_assert(xe, page_size <= PAGE_SIZE); + page = offset >> PAGE_SHIFT; + offset &= (PAGE_SIZE - 1); + + if (!xe_bo_is_vram(bo) && !xe_bo_is_stolen(bo)) { + xe_assert(xe, bo->ttm.ttm); + + xe_res_first_sg(xe_bo_sg(bo), page << PAGE_SHIFT, + page_size, &cur); + return xe_res_dma(&cur) + offset; + } else { + struct xe_res_cursor cur; + + xe_res_first(bo->ttm.resource, page << PAGE_SHIFT, + page_size, &cur); + return cur.start + offset + vram_region_gpu_offset(bo->ttm.resource); + } +} + +dma_addr_t xe_bo_addr(struct xe_bo *bo, u64 offset, size_t page_size) +{ + if (!READ_ONCE(bo->ttm.pin_count)) + xe_bo_assert_held(bo); + return __xe_bo_addr(bo, offset, page_size); +} + +int xe_bo_vmap(struct xe_bo *bo) +{ + void *virtual; + bool is_iomem; + int ret; + + xe_bo_assert_held(bo); + + if (!(bo->flags & XE_BO_NEEDS_CPU_ACCESS)) + return -EINVAL; + + if (!iosys_map_is_null(&bo->vmap)) + return 0; + + /* + * We use this more or less deprecated interface for now since + * ttm_bo_vmap() doesn't offer the optimization of kmapping + * single page bos, which is done here. + * TODO: Fix up ttm_bo_vmap to do that, or fix up ttm_bo_kmap + * to use struct iosys_map. 
+ */ + ret = ttm_bo_kmap(&bo->ttm, 0, bo->size >> PAGE_SHIFT, &bo->kmap); + if (ret) + return ret; + + virtual = ttm_kmap_obj_virtual(&bo->kmap, &is_iomem); + if (is_iomem) + iosys_map_set_vaddr_iomem(&bo->vmap, (void __iomem *)virtual); + else + iosys_map_set_vaddr(&bo->vmap, virtual); + + return 0; +} + +static void __xe_bo_vunmap(struct xe_bo *bo) +{ + if (!iosys_map_is_null(&bo->vmap)) { + iosys_map_clear(&bo->vmap); + ttm_bo_kunmap(&bo->kmap); + } +} + +void xe_bo_vunmap(struct xe_bo *bo) +{ + xe_bo_assert_held(bo); + __xe_bo_vunmap(bo); +} + +int xe_gem_create_ioctl(struct drm_device *dev, void *data, + struct drm_file *file) +{ + struct xe_device *xe = to_xe_device(dev); + struct xe_file *xef = to_xe_file(file); + struct drm_xe_gem_create *args = data; + struct xe_vm *vm = NULL; + struct xe_bo *bo; + unsigned int bo_flags; + u32 handle; + int err; + + if (XE_IOCTL_DBG(xe, args->extensions) || + XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] || args->pad[2]) || + XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1])) + return -EINVAL; + + /* at least one valid memory placement must be specified */ + if (XE_IOCTL_DBG(xe, (args->placement & ~xe->info.mem_region_mask) || + !args->placement)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, args->flags & + ~(DRM_XE_GEM_CREATE_FLAG_DEFER_BACKING | + DRM_XE_GEM_CREATE_FLAG_SCANOUT | + DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM))) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, args->handle)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, !args->size)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, args->size > SIZE_MAX)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, args->size & ~PAGE_MASK)) + return -EINVAL; + + bo_flags = 0; + if (args->flags & DRM_XE_GEM_CREATE_FLAG_DEFER_BACKING) + bo_flags |= XE_BO_DEFER_BACKING; + + if (args->flags & DRM_XE_GEM_CREATE_FLAG_SCANOUT) + bo_flags |= XE_BO_SCANOUT_BIT; + + bo_flags |= args->placement << (ffs(XE_BO_CREATE_SYSTEM_BIT) - 1); + + if (args->flags & DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM) { + if (XE_IOCTL_DBG(xe, !(bo_flags & XE_BO_CREATE_VRAM_MASK))) + return -EINVAL; + + bo_flags |= XE_BO_NEEDS_CPU_ACCESS; + } + + if (XE_IOCTL_DBG(xe, !args->cpu_caching || + args->cpu_caching > DRM_XE_GEM_CPU_CACHING_WC)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, bo_flags & XE_BO_CREATE_VRAM_MASK && + args->cpu_caching != DRM_XE_GEM_CPU_CACHING_WC)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, bo_flags & XE_BO_SCANOUT_BIT && + args->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)) + return -EINVAL; + + if (args->vm_id) { + vm = xe_vm_lookup(xef, args->vm_id); + if (XE_IOCTL_DBG(xe, !vm)) + return -ENOENT; + err = xe_vm_lock(vm, true); + if (err) + goto out_vm; + } + + bo = xe_bo_create_user(xe, NULL, vm, args->size, args->cpu_caching, + ttm_bo_type_device, bo_flags); + + if (vm) + xe_vm_unlock(vm); + + if (IS_ERR(bo)) { + err = PTR_ERR(bo); + goto out_vm; + } + + err = drm_gem_handle_create(file, &bo->ttm.base, &handle); + if (err) + goto out_bulk; + + args->handle = handle; + goto out_put; + +out_bulk: + if (vm && !xe_vm_in_fault_mode(vm)) { + xe_vm_lock(vm, false); + __xe_bo_unset_bulk_move(bo); + xe_vm_unlock(vm); + } +out_put: + xe_bo_put(bo); +out_vm: + if (vm) + xe_vm_put(vm); + + return err; +} + +int xe_gem_mmap_offset_ioctl(struct drm_device *dev, void *data, + struct drm_file *file) +{ + struct xe_device *xe = to_xe_device(dev); + struct drm_xe_gem_mmap_offset *args = data; + struct drm_gem_object *gem_obj; + + if (XE_IOCTL_DBG(xe, args->extensions) || + XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1])) + return 
-EINVAL; + + if (XE_IOCTL_DBG(xe, args->flags)) + return -EINVAL; + + gem_obj = drm_gem_object_lookup(file, args->handle); + if (XE_IOCTL_DBG(xe, !gem_obj)) + return -ENOENT; + + /* The mmap offset was set up at BO allocation time. */ + args->offset = drm_vma_node_offset_addr(&gem_obj->vma_node); + + xe_bo_put(gem_to_xe_bo(gem_obj)); + return 0; +} + +/** + * xe_bo_lock() - Lock the buffer object's dma_resv object + * @bo: The struct xe_bo whose lock is to be taken + * @intr: Whether to perform any wait interruptible + * + * Locks the buffer object's dma_resv object. If the buffer object is + * pointing to a shared dma_resv object, that shared lock is locked. + * + * Return: 0 on success, -EINTR if @intr is true and the wait for a + * contended lock was interrupted. If @intr is set to false, the + * function always returns 0. + */ +int xe_bo_lock(struct xe_bo *bo, bool intr) +{ + if (intr) + return dma_resv_lock_interruptible(bo->ttm.base.resv, NULL); + + dma_resv_lock(bo->ttm.base.resv, NULL); + + return 0; +} + +/** + * xe_bo_unlock() - Unlock the buffer object's dma_resv object + * @bo: The struct xe_bo whose lock is to be released. + * + * Unlock a buffer object lock that was locked by xe_bo_lock(). + */ +void xe_bo_unlock(struct xe_bo *bo) +{ + dma_resv_unlock(bo->ttm.base.resv); +} + +/** + * xe_bo_can_migrate - Whether a buffer object likely can be migrated + * @bo: The buffer object to migrate + * @mem_type: The TTM memory type intended to migrate to + * + * Check whether the buffer object supports migration to the + * given memory type. Note that pinning may affect the ability to migrate as + * returned by this function. + * + * This function is primarily intended as a helper for checking the + * possibility to migrate buffer objects and can be called without + * the object lock held. + * + * Return: true if migration is possible, false otherwise. + */ +bool xe_bo_can_migrate(struct xe_bo *bo, u32 mem_type) +{ + unsigned int cur_place; + + if (bo->ttm.type == ttm_bo_type_kernel) + return true; + + if (bo->ttm.type == ttm_bo_type_sg) + return false; + + for (cur_place = 0; cur_place < bo->placement.num_placement; + cur_place++) { + if (bo->placements[cur_place].mem_type == mem_type) + return true; + } + + return false; +} + +static void xe_place_from_ttm_type(u32 mem_type, struct ttm_place *place) +{ + memset(place, 0, sizeof(*place)); + place->mem_type = mem_type; +} + +/** + * xe_bo_migrate - Migrate an object to the desired region id + * @bo: The buffer object to migrate. + * @mem_type: The TTM region type to migrate to. + * + * Attempt to migrate the buffer object to the desired memory region. The + * buffer object may not be pinned, and must be locked. + * On successful completion, the object memory type will be updated, + * but an async migration task may not have completed yet, and to + * accomplish that, the object's kernel fences must be signaled with + * the object lock held. + * + * Return: 0 on success. Negative error code on failure. In particular may + * return -EINTR or -ERESTARTSYS if signal pending. 
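+ *
+ * An illustrative caller sketch (assumes @bo is locked as required, unpinned
+ * and migratable):
+ *
+ *	err = xe_bo_lock(bo, true);
+ *	if (!err) {
+ *		err = xe_bo_migrate(bo, XE_PL_TT);
+ *		xe_bo_unlock(bo);
+ *	}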
+ */ +int xe_bo_migrate(struct xe_bo *bo, u32 mem_type) +{ + struct xe_device *xe = ttm_to_xe_device(bo->ttm.bdev); + struct ttm_operation_ctx ctx = { + .interruptible = true, + .no_wait_gpu = false, + }; + struct ttm_placement placement; + struct ttm_place requested; + + xe_bo_assert_held(bo); + + if (bo->ttm.resource->mem_type == mem_type) + return 0; + + if (xe_bo_is_pinned(bo)) + return -EBUSY; + + if (!xe_bo_can_migrate(bo, mem_type)) + return -EINVAL; + + xe_place_from_ttm_type(mem_type, &requested); + placement.num_placement = 1; + placement.num_busy_placement = 1; + placement.placement = &requested; + placement.busy_placement = &requested; + + /* + * Stolen needs to be handled like below VRAM handling if we ever need + * to support it. + */ + drm_WARN_ON(&xe->drm, mem_type == XE_PL_STOLEN); + + if (mem_type_is_vram(mem_type)) { + u32 c = 0; + + add_vram(xe, bo, &requested, bo->flags, mem_type, &c); + } + + return ttm_bo_validate(&bo->ttm, &placement, &ctx); +} + +/** + * xe_bo_evict - Evict an object to evict placement + * @bo: The buffer object to migrate. + * @force_alloc: Set force_alloc in ttm_operation_ctx + * + * On successful completion, the object memory will be moved to evict + * placement. Ths function blocks until the object has been fully moved. + * + * Return: 0 on success. Negative error code on failure. + */ +int xe_bo_evict(struct xe_bo *bo, bool force_alloc) +{ + struct ttm_operation_ctx ctx = { + .interruptible = false, + .no_wait_gpu = false, + .force_alloc = force_alloc, + }; + struct ttm_placement placement; + int ret; + + xe_evict_flags(&bo->ttm, &placement); + ret = ttm_bo_validate(&bo->ttm, &placement, &ctx); + if (ret) + return ret; + + dma_resv_wait_timeout(bo->ttm.base.resv, DMA_RESV_USAGE_KERNEL, + false, MAX_SCHEDULE_TIMEOUT); + + return 0; +} + +/** + * xe_bo_needs_ccs_pages - Whether a bo needs to back up CCS pages when + * placed in system memory. + * @bo: The xe_bo + * + * Return: true if extra pages need to be allocated, false otherwise. + */ +bool xe_bo_needs_ccs_pages(struct xe_bo *bo) +{ + struct xe_device *xe = xe_bo_device(bo); + + if (!xe_device_has_flat_ccs(xe) || bo->ttm.type != ttm_bo_type_device) + return false; + + /* On discrete GPUs, if the GPU can access this buffer from + * system memory (i.e., it allows XE_PL_TT placement), FlatCCS + * can't be used since there's no CCS storage associated with + * non-VRAM addresses. + */ + if (IS_DGFX(xe) && (bo->flags & XE_BO_CREATE_SYSTEM_BIT)) + return false; + + return true; +} + +/** + * __xe_bo_release_dummy() - Dummy kref release function + * @kref: The embedded struct kref. + * + * Dummy release function for xe_bo_put_deferred(). Keep off. + */ +void __xe_bo_release_dummy(struct kref *kref) +{ +} + +/** + * xe_bo_put_commit() - Put bos whose put was deferred by xe_bo_put_deferred(). + * @deferred: The lockless list used for the call to xe_bo_put_deferred(). + * + * Puts all bos whose put was deferred by xe_bo_put_deferred(). + * The @deferred list can be either an onstack local list or a global + * shared list used by a workqueue. + */ +void xe_bo_put_commit(struct llist_head *deferred) +{ + struct llist_node *freed; + struct xe_bo *bo, *next; + + if (!deferred) + return; + + freed = llist_del_all(deferred); + if (!freed) + return; + + llist_for_each_entry_safe(bo, next, freed, freed) + drm_gem_object_free(&bo->ttm.base.refcount); +} + +/** + * xe_bo_dumb_create - Create a dumb bo as backing for a fb + * @file_priv: ... + * @dev: ... + * @args: ... 
+ * + * See dumb_create() hook in include/drm/drm_drv.h + * + * Return: ... + */ +int xe_bo_dumb_create(struct drm_file *file_priv, + struct drm_device *dev, + struct drm_mode_create_dumb *args) +{ + struct xe_device *xe = to_xe_device(dev); + struct xe_bo *bo; + uint32_t handle; + int cpp = DIV_ROUND_UP(args->bpp, 8); + int err; + u32 page_size = max_t(u32, PAGE_SIZE, + xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K ? SZ_64K : SZ_4K); + + args->pitch = ALIGN(args->width * cpp, 64); + args->size = ALIGN(mul_u32_u32(args->pitch, args->height), + page_size); + + bo = xe_bo_create_user(xe, NULL, NULL, args->size, + DRM_XE_GEM_CPU_CACHING_WC, + ttm_bo_type_device, + XE_BO_CREATE_VRAM_IF_DGFX(xe_device_get_root_tile(xe)) | + XE_BO_CREATE_USER_BIT | XE_BO_SCANOUT_BIT | + XE_BO_NEEDS_CPU_ACCESS); + if (IS_ERR(bo)) + return PTR_ERR(bo); + + err = drm_gem_handle_create(file_priv, &bo->ttm.base, &handle); + /* drop reference from allocate - handle holds it now */ + drm_gem_object_put(&bo->ttm.base); + if (!err) + args->handle = handle; + return err; +} + +#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) +#include "tests/xe_bo.c" +#endif diff --git a/drivers/gpu/drm/xe/xe_bo.h b/drivers/gpu/drm/xe/xe_bo.h new file mode 100644 index 000000000000..9b1279aca127 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_bo.h @@ -0,0 +1,355 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef _XE_BO_H_ +#define _XE_BO_H_ + +#include <drm/ttm/ttm_tt.h> + +#include "xe_bo_types.h" +#include "xe_macros.h" +#include "xe_vm_types.h" +#include "xe_vm.h" + +/** + * xe_vm_assert_held(vm) - Assert that the vm's reservation object is held. + * @vm: The vm + */ +#define xe_vm_assert_held(vm) dma_resv_assert_held(xe_vm_resv(vm)) + + + +#define XE_DEFAULT_GTT_SIZE_MB 3072ULL /* 3GB by default */ + +#define XE_BO_CREATE_USER_BIT BIT(0) +/* The bits below need to be contiguous, or things break */ +#define XE_BO_CREATE_SYSTEM_BIT BIT(1) +#define XE_BO_CREATE_VRAM0_BIT BIT(2) +#define XE_BO_CREATE_VRAM1_BIT BIT(3) +#define XE_BO_CREATE_VRAM_MASK (XE_BO_CREATE_VRAM0_BIT | \ + XE_BO_CREATE_VRAM1_BIT) +/* -- */ +#define XE_BO_CREATE_STOLEN_BIT BIT(4) +#define XE_BO_CREATE_VRAM_IF_DGFX(tile) \ + (IS_DGFX(tile_to_xe(tile)) ? 
XE_BO_CREATE_VRAM0_BIT << (tile)->id : \ + XE_BO_CREATE_SYSTEM_BIT) +#define XE_BO_CREATE_GGTT_BIT BIT(5) +#define XE_BO_CREATE_IGNORE_MIN_PAGE_SIZE_BIT BIT(6) +#define XE_BO_CREATE_PINNED_BIT BIT(7) +#define XE_BO_CREATE_NO_RESV_EVICT BIT(8) +#define XE_BO_DEFER_BACKING BIT(9) +#define XE_BO_SCANOUT_BIT BIT(10) +#define XE_BO_FIXED_PLACEMENT_BIT BIT(11) +#define XE_BO_PAGETABLE BIT(12) +#define XE_BO_NEEDS_CPU_ACCESS BIT(13) +/* this one is trigger internally only */ +#define XE_BO_INTERNAL_TEST BIT(30) +#define XE_BO_INTERNAL_64K BIT(31) + +#define XELPG_PPGTT_PTE_PAT3 BIT_ULL(62) +#define XE2_PPGTT_PTE_PAT4 BIT_ULL(61) +#define XE_PPGTT_PDE_PDPE_PAT2 BIT_ULL(12) +#define XE_PPGTT_PTE_PAT2 BIT_ULL(7) +#define XE_PPGTT_PTE_PAT1 BIT_ULL(4) +#define XE_PPGTT_PTE_PAT0 BIT_ULL(3) + +#define XE_PTE_SHIFT 12 +#define XE_PAGE_SIZE (1 << XE_PTE_SHIFT) +#define XE_PTE_MASK (XE_PAGE_SIZE - 1) +#define XE_PDE_SHIFT (XE_PTE_SHIFT - 3) +#define XE_PDES (1 << XE_PDE_SHIFT) +#define XE_PDE_MASK (XE_PDES - 1) + +#define XE_64K_PTE_SHIFT 16 +#define XE_64K_PAGE_SIZE (1 << XE_64K_PTE_SHIFT) +#define XE_64K_PTE_MASK (XE_64K_PAGE_SIZE - 1) +#define XE_64K_PDE_MASK (XE_PDE_MASK >> 4) + +#define XE_PDE_PS_2M BIT_ULL(7) +#define XE_PDPE_PS_1G BIT_ULL(7) +#define XE_PDE_IPS_64K BIT_ULL(11) + +#define XE_GGTT_PTE_DM BIT_ULL(1) +#define XE_USM_PPGTT_PTE_AE BIT_ULL(10) +#define XE_PPGTT_PTE_DM BIT_ULL(11) +#define XE_PDE_64K BIT_ULL(6) +#define XE_PTE_PS64 BIT_ULL(8) +#define XE_PTE_NULL BIT_ULL(9) + +#define XE_PAGE_PRESENT BIT_ULL(0) +#define XE_PAGE_RW BIT_ULL(1) + +#define XE_PL_SYSTEM TTM_PL_SYSTEM +#define XE_PL_TT TTM_PL_TT +#define XE_PL_VRAM0 TTM_PL_VRAM +#define XE_PL_VRAM1 (XE_PL_VRAM0 + 1) +#define XE_PL_STOLEN (TTM_NUM_MEM_TYPES - 1) + +#define XE_BO_PROPS_INVALID (-1) + +struct sg_table; + +struct xe_bo *xe_bo_alloc(void); +void xe_bo_free(struct xe_bo *bo); + +struct xe_bo *___xe_bo_create_locked(struct xe_device *xe, struct xe_bo *bo, + struct xe_tile *tile, struct dma_resv *resv, + struct ttm_lru_bulk_move *bulk, size_t size, + u16 cpu_caching, enum ttm_bo_type type, + u32 flags); +struct xe_bo * +xe_bo_create_locked_range(struct xe_device *xe, + struct xe_tile *tile, struct xe_vm *vm, + size_t size, u64 start, u64 end, + enum ttm_bo_type type, u32 flags); +struct xe_bo *xe_bo_create_locked(struct xe_device *xe, struct xe_tile *tile, + struct xe_vm *vm, size_t size, + enum ttm_bo_type type, u32 flags); +struct xe_bo *xe_bo_create(struct xe_device *xe, struct xe_tile *tile, + struct xe_vm *vm, size_t size, + enum ttm_bo_type type, u32 flags); +struct xe_bo *xe_bo_create_user(struct xe_device *xe, struct xe_tile *tile, + struct xe_vm *vm, size_t size, + u16 cpu_caching, + enum ttm_bo_type type, + u32 flags); +struct xe_bo *xe_bo_create_pin_map(struct xe_device *xe, struct xe_tile *tile, + struct xe_vm *vm, size_t size, + enum ttm_bo_type type, u32 flags); +struct xe_bo *xe_bo_create_pin_map_at(struct xe_device *xe, struct xe_tile *tile, + struct xe_vm *vm, size_t size, u64 offset, + enum ttm_bo_type type, u32 flags); +struct xe_bo *xe_bo_create_from_data(struct xe_device *xe, struct xe_tile *tile, + const void *data, size_t size, + enum ttm_bo_type type, u32 flags); +struct xe_bo *xe_managed_bo_create_pin_map(struct xe_device *xe, struct xe_tile *tile, + size_t size, u32 flags); +struct xe_bo *xe_managed_bo_create_from_data(struct xe_device *xe, struct xe_tile *tile, + const void *data, size_t size, u32 flags); + +int xe_bo_placement_for_flags(struct xe_device *xe, struct xe_bo *bo, + u32 bo_flags); 
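+
+/*
+ * Illustrative sketch only (xe and size are assumed to be in scope): a pinned,
+ * GGTT-mapped kernel bo on the root tile could be created with the helpers
+ * declared above roughly as follows:
+ *
+ *	struct xe_tile *tile = xe_device_get_root_tile(xe);
+ *	struct xe_bo *bo;
+ *
+ *	bo = xe_bo_create_pin_map(xe, tile, NULL, size, ttm_bo_type_kernel,
+ *				  XE_BO_CREATE_VRAM_IF_DGFX(tile) |
+ *				  XE_BO_CREATE_GGTT_BIT);
+ *	if (IS_ERR(bo))
+ *		return PTR_ERR(bo);
+ */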
+ +static inline struct xe_bo *ttm_to_xe_bo(const struct ttm_buffer_object *bo) +{ + return container_of(bo, struct xe_bo, ttm); +} + +static inline struct xe_bo *gem_to_xe_bo(const struct drm_gem_object *obj) +{ + return container_of(obj, struct xe_bo, ttm.base); +} + +#define xe_bo_device(bo) ttm_to_xe_device((bo)->ttm.bdev) + +static inline struct xe_bo *xe_bo_get(struct xe_bo *bo) +{ + if (bo) + drm_gem_object_get(&bo->ttm.base); + + return bo; +} + +static inline void xe_bo_put(struct xe_bo *bo) +{ + if (bo) + drm_gem_object_put(&bo->ttm.base); +} + +static inline void __xe_bo_unset_bulk_move(struct xe_bo *bo) +{ + if (bo) + ttm_bo_set_bulk_move(&bo->ttm, NULL); +} + +static inline void xe_bo_assert_held(struct xe_bo *bo) +{ + if (bo) + dma_resv_assert_held((bo)->ttm.base.resv); +} + +int xe_bo_lock(struct xe_bo *bo, bool intr); + +void xe_bo_unlock(struct xe_bo *bo); + +static inline void xe_bo_unlock_vm_held(struct xe_bo *bo) +{ + if (bo) { + XE_WARN_ON(bo->vm && bo->ttm.base.resv != xe_vm_resv(bo->vm)); + if (bo->vm) + xe_vm_assert_held(bo->vm); + else + dma_resv_unlock(bo->ttm.base.resv); + } +} + +int xe_bo_pin_external(struct xe_bo *bo); +int xe_bo_pin(struct xe_bo *bo); +void xe_bo_unpin_external(struct xe_bo *bo); +void xe_bo_unpin(struct xe_bo *bo); +int xe_bo_validate(struct xe_bo *bo, struct xe_vm *vm, bool allow_res_evict); + +static inline bool xe_bo_is_pinned(struct xe_bo *bo) +{ + return bo->ttm.pin_count; +} + +static inline void xe_bo_unpin_map_no_vm(struct xe_bo *bo) +{ + if (likely(bo)) { + xe_bo_lock(bo, false); + xe_bo_unpin(bo); + xe_bo_unlock(bo); + + xe_bo_put(bo); + } +} + +bool xe_bo_is_xe_bo(struct ttm_buffer_object *bo); +dma_addr_t __xe_bo_addr(struct xe_bo *bo, u64 offset, size_t page_size); +dma_addr_t xe_bo_addr(struct xe_bo *bo, u64 offset, size_t page_size); + +static inline dma_addr_t +xe_bo_main_addr(struct xe_bo *bo, size_t page_size) +{ + return xe_bo_addr(bo, 0, page_size); +} + +static inline u32 +xe_bo_ggtt_addr(struct xe_bo *bo) +{ + XE_WARN_ON(bo->ggtt_node.size > bo->size); + XE_WARN_ON(bo->ggtt_node.start + bo->ggtt_node.size > (1ull << 32)); + return bo->ggtt_node.start; +} + +int xe_bo_vmap(struct xe_bo *bo); +void xe_bo_vunmap(struct xe_bo *bo); + +bool mem_type_is_vram(u32 mem_type); +bool xe_bo_is_vram(struct xe_bo *bo); +bool xe_bo_is_stolen(struct xe_bo *bo); +bool xe_bo_is_stolen_devmem(struct xe_bo *bo); +uint64_t vram_region_gpu_offset(struct ttm_resource *res); + +bool xe_bo_can_migrate(struct xe_bo *bo, u32 mem_type); + +int xe_bo_migrate(struct xe_bo *bo, u32 mem_type); +int xe_bo_evict(struct xe_bo *bo, bool force_alloc); + +int xe_bo_evict_pinned(struct xe_bo *bo); +int xe_bo_restore_pinned(struct xe_bo *bo); + +extern struct ttm_device_funcs xe_ttm_funcs; + +int xe_gem_create_ioctl(struct drm_device *dev, void *data, + struct drm_file *file); +int xe_gem_mmap_offset_ioctl(struct drm_device *dev, void *data, + struct drm_file *file); +int xe_bo_dumb_create(struct drm_file *file_priv, + struct drm_device *dev, + struct drm_mode_create_dumb *args); + +bool xe_bo_needs_ccs_pages(struct xe_bo *bo); + +static inline size_t xe_bo_ccs_pages_start(struct xe_bo *bo) +{ + return PAGE_ALIGN(bo->ttm.base.size); +} + +static inline bool xe_bo_has_pages(struct xe_bo *bo) +{ + if ((bo->ttm.ttm && ttm_tt_is_populated(bo->ttm.ttm)) || + xe_bo_is_vram(bo)) + return true; + + return false; +} + +void __xe_bo_release_dummy(struct kref *kref); + +/** + * xe_bo_put_deferred() - Put a buffer object with delayed final freeing + * @bo: The bo to put. 
+ * @deferred: List to which to add the buffer object if we cannot put, or + * NULL if the function is to put unconditionally. + * + * Since the final freeing of an object includes both sleeping and (!) + * memory allocation in the dma_resv individualization, it's not ok + * to put an object from atomic context nor from within a held lock + * tainted by reclaim. In such situations we want to defer the final + * freeing until we've exited the restricting context, or in the worst + * case to a workqueue. + * This function either puts the object if possible without the refcount + * reaching zero, or adds it to the @deferred list if that was not possible. + * The caller needs to follow up with a call to xe_bo_put_commit() to actually + * put the bo iff this function returns true. It's safe to always + * follow up with a call to xe_bo_put_commit(). + * TODO: It's TTM that is the villain here. Perhaps TTM should add an + * interface like this. + * + * Return: true if @bo was the first object put on the @freed list, + * false otherwise. + */ +static inline bool +xe_bo_put_deferred(struct xe_bo *bo, struct llist_head *deferred) +{ + if (!deferred) { + xe_bo_put(bo); + return false; + } + + if (!kref_put(&bo->ttm.base.refcount, __xe_bo_release_dummy)) + return false; + + return llist_add(&bo->freed, deferred); +} + +void xe_bo_put_commit(struct llist_head *deferred); + +struct sg_table *xe_bo_sg(struct xe_bo *bo); + +/* + * xe_sg_segment_size() - Provides upper limit for sg segment size. + * @dev: device pointer + * + * Returns the maximum segment size for the 'struct scatterlist' + * elements. + */ +static inline unsigned int xe_sg_segment_size(struct device *dev) +{ + struct scatterlist __maybe_unused sg; + size_t max = BIT_ULL(sizeof(sg.length) * 8) - 1; + + max = min_t(size_t, max, dma_max_mapping_size(dev)); + + /* + * The iommu_dma_map_sg() function ensures iova allocation doesn't + * cross dma segment boundary. It does so by padding some sg elements. + * This can cause overflow, ending up with sg->length being set to 0. + * Avoid this by ensuring maximum segment size is half of 'max' + * rounded down to PAGE_SIZE. + */ + return round_down(max / 2, PAGE_SIZE); +} + +#define i915_gem_object_flush_if_display(obj) ((void)(obj)) + +#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) +/** + * xe_bo_is_mem_type - Whether the bo currently resides in the given + * TTM memory type + * @bo: The bo to check. + * @mem_type: The TTM memory type. + * + * Return: true iff the bo resides in @mem_type, false otherwise. + */ +static inline bool xe_bo_is_mem_type(struct xe_bo *bo, u32 mem_type) +{ + xe_bo_assert_held(bo); + return bo->ttm.resource->mem_type == mem_type; +} +#endif +#endif diff --git a/drivers/gpu/drm/xe/xe_bo_doc.h b/drivers/gpu/drm/xe/xe_bo_doc.h new file mode 100644 index 000000000000..f57d440cc95a --- /dev/null +++ b/drivers/gpu/drm/xe/xe_bo_doc.h @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_BO_DOC_H_ +#define _XE_BO_DOC_H_ + +/** + * DOC: Buffer Objects (BO) + * + * BO management + * ============= + * + * TTM manages (placement, eviction, etc...) all BOs in XE. + * + * BO creation + * =========== + * + * Create a chunk of memory which can be used by the GPU. Placement rules + * (sysmem or vram region) passed in upon creation. TTM handles placement of BO + * and can trigger eviction of other BOs to make space for the new BO. + * + * Kernel BOs + * ---------- + * + * A kernel BO is created as part of driver load (e.g. 
uC firmware images, GuC ADS, etc...) or a BO created as part of a user operation which requires
+ * a kernel BO (e.g. engine state, memory for page tables, etc...). These BOs
+ * are typically mapped in the GGTT (any kernel BOs aside from memory for page
+ * tables are in the GGTT), are pinned (can't move or be evicted at runtime),
+ * have a vmap (XE can access the memory via the xe_map layer) and have
+ * contiguous physical memory.
+ *
+ * More details on why kernel BOs are pinned and contiguous are given below.
+ *
+ * User BOs
+ * --------
+ *
+ * A user BO is created via the DRM_IOCTL_XE_GEM_CREATE IOCTL. Once it is
+ * created the BO can be mmap'd (via DRM_IOCTL_XE_GEM_MMAP_OFFSET) for user
+ * access and it can be bound for GPU access (via DRM_IOCTL_XE_VM_BIND). All
+ * user BOs are evictable and user BOs are never pinned by XE. The allocation of
+ * the backing store can be deferred from creation time until first use, which
+ * is either mmap, bind, or pagefault.
+ *
+ * Private BOs
+ * ~~~~~~~~~~~
+ *
+ * A private BO is a user BO created with a valid VM argument passed into the
+ * create IOCTL. If a BO is private it cannot be exported via prime FD and
+ * mappings can only be created for the BO within the VM it is tied to. Lastly,
+ * the BO dma-resv slots / lock point to the VM's dma-resv slots / lock (all
+ * private BOs tied to a VM share common dma-resv slots / lock).
+ *
+ * External BOs
+ * ~~~~~~~~~~~~
+ *
+ * An external BO is a user BO created with a NULL VM argument passed into the
+ * create IOCTL. An external BO can be shared with different UMDs / devices via
+ * prime FD and the BO can be mapped into multiple VMs. An external BO has its
+ * own unique dma-resv slots / lock. An external BO is tracked in an array by
+ * every VM which has a mapping of it. This allows a VM to look up and lock all
+ * external BOs mapped in the VM as needed.
+ *
+ * BO placement
+ * ~~~~~~~~~~~~
+ *
+ * When a user BO is created, a mask of valid placements is passed indicating
+ * which memory regions are considered valid.
+ *
+ * The memory region information is available via query uAPI (TODO: add link).
+ *
+ * BO validation
+ * =============
+ *
+ * BO validation (ttm_bo_validate) refers to ensuring a BO has a valid
+ * placement. If a BO was swapped out to temporary storage, a validation call
+ * will trigger a move back to a valid placement (a location where the GPU can
+ * access the BO). Validation of a BO may evict other BOs to make room for the
+ * BO being validated.
+ *
+ * BO eviction / moving
+ * ====================
+ *
+ * All eviction (or in other words, moving a BO from one memory location to
+ * another) is routed through TTM with a callback into XE.
+ *
+ * Runtime eviction
+ * ----------------
+ *
+ * Runtime eviction refers to TTM deciding, during normal operation, that it
+ * needs to move a BO. Typically this is because TTM needs to make room for
+ * another BO and the evicted BO is the first BO on the LRU list that is not
+ * locked.
+ *
+ * An example of this is a new BO which can only be placed in VRAM but there is
+ * no space left in VRAM. There could be multiple BOs which have sysmem and VRAM
+ * placement rules and currently reside in VRAM; TTM will trigger a move of one
+ * (or more) of these BOs until there is room in VRAM to place the new BO. The
+ * evicted BO(s) are valid but still need new bindings before they are used
+ * again (exec or compute mode rebind worker).
+ *
+ * Another example would be when TTM can't find a BO to evict which has
+ * another valid placement.
+ * In this case TTM will evict one (or more) unlocked BOs to a temporary,
+ * unreachable (invalid) placement. The evicted BO(s) are invalid and before
+ * their next use need to be moved to a valid placement and rebound.
+ *
+ * In both cases, moves of these BOs are scheduled behind the fences in the BO's
+ * dma-resv slots.
+ *
+ * WW locking tries to ensure that if two VMs each need 51% of the memory,
+ * forward progress is still made on both VMs.
+ *
+ * Runtime eviction uses a per-GT migration engine (TODO: link to migration
+ * engine doc) to do a GPU memcpy from one location to another.
+ *
+ * Rebinds after runtime eviction
+ * ------------------------------
+ *
+ * When BOs are moved, every mapping (VMA) of the BO needs to be rebound before
+ * the BO is used again. Every VMA is added to the evicted list of its VM when
+ * the BO is moved. This is safe because of the VM locking structure (TODO: link
+ * to VM locking doc). On the next use of a VM (exec or compute mode rebind
+ * worker) the evicted VMA list is checked and rebinds are triggered. In the
+ * case of a faulting VM, the rebind is done in the page fault handler.
+ *
+ * Suspend / resume eviction of VRAM
+ * ---------------------------------
+ *
+ * During device suspend / resume VRAM may lose power, which means the contents
+ * of VRAM are blown away. Thus BOs present in VRAM at the time of suspend must
+ * be moved to sysmem in order for their contents to be saved.
+ *
+ * A simple TTM call (ttm_resource_manager_evict_all) can move all non-pinned
+ * (user) BOs to sysmem. External BOs that are pinned need to be manually
+ * evicted with a simple loop + xe_bo_evict call. It gets a little trickier
+ * with kernel BOs.
+ *
+ * Some kernel BOs are used by the GT migration engine to do moves, thus we
+ * can't move all of the BOs via the GT migration engine. For simplicity, use a
+ * TTM memcpy (CPU) to move any kernel (pinned) BO on either suspend or resume.
+ *
+ * Some kernel BOs need to be restored to the exact same physical location. TTM
+ * makes this rather easy but the caveat is the memory must be contiguous. Again
+ * for simplicity, we enforce that all kernel (pinned) BOs are contiguous and
+ * restored to the same physical location.
+ *
+ * Pinned external BOs in VRAM are restored on resume via the GPU.
+ *
+ * Rebinds after suspend / resume
+ * ------------------------------
+ *
+ * Most kernel BOs have GGTT mappings which must be restored during the resume
+ * process. All user BOs are rebound after validation on their next use.
+ *
+ * Future work
+ * ===========
+ *
+ * Trim the list of BOs which is saved / restored via TTM memcpy on suspend /
+ * resume. All we really need to save / restore via TTM memcpy is the memory
+ * required for the GuC to load and the memory for the GT migrate engine to
+ * operate.
+ *
+ * Do not require kernel BOs to be contiguous in physical memory / restored to
+ * the same physical address on resume. In all likelihood the only memory that
+ * needs to be restored to the same physical address is memory used for page
+ * tables. All of that memory is allocated 1 page at a time, so the contiguous
+ * requirement isn't needed. Some work on the vmap code would need to be done if
+ * kernel BOs are not contiguous too.
+ *
+ * Make some kernel BOs evictable rather than pinned. An example of this would
+ * be engine state; in all likelihood, if the dma-resv slots of these BOs were
+ * properly used rather than pinning, we could safely evict + rebind these BOs
+ * as needed.
+ * + * Some kernel BOs do not need to be restored on resume (e.g. GuC ADS as that is + * repopulated on resume), add flag to mark such objects as no save / restore. + */ + +#endif diff --git a/drivers/gpu/drm/xe/xe_bo_evict.c b/drivers/gpu/drm/xe/xe_bo_evict.c new file mode 100644 index 000000000000..7a264a9ca06e --- /dev/null +++ b/drivers/gpu/drm/xe/xe_bo_evict.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_bo_evict.h" + +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_ggtt.h" +#include "xe_tile.h" + +/** + * xe_bo_evict_all - evict all BOs from VRAM + * + * @xe: xe device + * + * Evict non-pinned user BOs first (via GPU), evict pinned external BOs next + * (via GPU), wait for evictions, and finally evict pinned kernel BOs via CPU. + * All eviction magic done via TTM calls. + * + * Evict == move VRAM BOs to temporary (typically system) memory. + * + * This function should be called before the device goes into a suspend state + * where the VRAM loses power. + */ +int xe_bo_evict_all(struct xe_device *xe) +{ + struct ttm_device *bdev = &xe->ttm; + struct xe_bo *bo; + struct xe_tile *tile; + struct list_head still_in_list; + u32 mem_type; + u8 id; + int ret; + + if (!IS_DGFX(xe)) + return 0; + + /* User memory */ + for (mem_type = XE_PL_VRAM0; mem_type <= XE_PL_VRAM1; ++mem_type) { + struct ttm_resource_manager *man = + ttm_manager_type(bdev, mem_type); + + if (man) { + ret = ttm_resource_manager_evict_all(bdev, man); + if (ret) + return ret; + } + } + + /* Pinned user memory in VRAM */ + INIT_LIST_HEAD(&still_in_list); + spin_lock(&xe->pinned.lock); + for (;;) { + bo = list_first_entry_or_null(&xe->pinned.external_vram, + typeof(*bo), pinned_link); + if (!bo) + break; + xe_bo_get(bo); + list_move_tail(&bo->pinned_link, &still_in_list); + spin_unlock(&xe->pinned.lock); + + xe_bo_lock(bo, false); + ret = xe_bo_evict_pinned(bo); + xe_bo_unlock(bo); + xe_bo_put(bo); + if (ret) { + spin_lock(&xe->pinned.lock); + list_splice_tail(&still_in_list, + &xe->pinned.external_vram); + spin_unlock(&xe->pinned.lock); + return ret; + } + + spin_lock(&xe->pinned.lock); + } + list_splice_tail(&still_in_list, &xe->pinned.external_vram); + spin_unlock(&xe->pinned.lock); + + /* + * Wait for all user BO to be evicted as those evictions depend on the + * memory moved below. + */ + for_each_tile(tile, xe, id) + xe_tile_migrate_wait(tile); + + spin_lock(&xe->pinned.lock); + for (;;) { + bo = list_first_entry_or_null(&xe->pinned.kernel_bo_present, + typeof(*bo), pinned_link); + if (!bo) + break; + xe_bo_get(bo); + list_move_tail(&bo->pinned_link, &xe->pinned.evicted); + spin_unlock(&xe->pinned.lock); + + xe_bo_lock(bo, false); + ret = xe_bo_evict_pinned(bo); + xe_bo_unlock(bo); + xe_bo_put(bo); + if (ret) + return ret; + + spin_lock(&xe->pinned.lock); + } + spin_unlock(&xe->pinned.lock); + + return 0; +} + +/** + * xe_bo_restore_kernel - restore kernel BOs to VRAM + * + * @xe: xe device + * + * Move kernel BOs from temporary (typically system) memory to VRAM via CPU. All + * moves done via TTM calls. + * + * This function should be called early, before trying to init the GT, on device + * resume. 
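+ *
+ * A minimal sketch of how the three evict/restore helpers are ordered around
+ * a discrete device's suspend/resume cycle. This is illustrative only;
+ * example_suspend(), example_resume() and example_init_gts() are made-up
+ * names, not the driver's actual PM entry points::
+ *
+ *	int example_suspend(struct xe_device *xe)
+ *	{
+ *		return xe_bo_evict_all(xe);
+ *	}
+ *
+ *	int example_resume(struct xe_device *xe)
+ *	{
+ *		int err;
+ *
+ *		err = xe_bo_restore_kernel(xe);
+ *		if (err)
+ *			return err;
+ *
+ *		err = example_init_gts(xe);
+ *		if (err)
+ *			return err;
+ *
+ *		return xe_bo_restore_user(xe);
+ *	}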
+ */ +int xe_bo_restore_kernel(struct xe_device *xe) +{ + struct xe_bo *bo; + int ret; + + if (!IS_DGFX(xe)) + return 0; + + spin_lock(&xe->pinned.lock); + for (;;) { + bo = list_first_entry_or_null(&xe->pinned.evicted, + typeof(*bo), pinned_link); + if (!bo) + break; + xe_bo_get(bo); + list_move_tail(&bo->pinned_link, &xe->pinned.kernel_bo_present); + spin_unlock(&xe->pinned.lock); + + xe_bo_lock(bo, false); + ret = xe_bo_restore_pinned(bo); + xe_bo_unlock(bo); + if (ret) { + xe_bo_put(bo); + return ret; + } + + if (bo->flags & XE_BO_CREATE_GGTT_BIT) { + struct xe_tile *tile = bo->tile; + + mutex_lock(&tile->mem.ggtt->lock); + xe_ggtt_map_bo(tile->mem.ggtt, bo); + mutex_unlock(&tile->mem.ggtt->lock); + } + + /* + * We expect validate to trigger a move VRAM and our move code + * should setup the iosys map. + */ + xe_assert(xe, !iosys_map_is_null(&bo->vmap)); + xe_assert(xe, xe_bo_is_vram(bo)); + + xe_bo_put(bo); + + spin_lock(&xe->pinned.lock); + } + spin_unlock(&xe->pinned.lock); + + return 0; +} + +/** + * xe_bo_restore_user - restore pinned user BOs to VRAM + * + * @xe: xe device + * + * Move pinned user BOs from temporary (typically system) memory to VRAM via + * CPU. All moves done via TTM calls. + * + * This function should be called late, after GT init, on device resume. + */ +int xe_bo_restore_user(struct xe_device *xe) +{ + struct xe_bo *bo; + struct xe_tile *tile; + struct list_head still_in_list; + u8 id; + int ret; + + if (!IS_DGFX(xe)) + return 0; + + /* Pinned user memory in VRAM should be validated on resume */ + INIT_LIST_HEAD(&still_in_list); + spin_lock(&xe->pinned.lock); + for (;;) { + bo = list_first_entry_or_null(&xe->pinned.external_vram, + typeof(*bo), pinned_link); + if (!bo) + break; + list_move_tail(&bo->pinned_link, &still_in_list); + xe_bo_get(bo); + spin_unlock(&xe->pinned.lock); + + xe_bo_lock(bo, false); + ret = xe_bo_restore_pinned(bo); + xe_bo_unlock(bo); + xe_bo_put(bo); + if (ret) { + spin_lock(&xe->pinned.lock); + list_splice_tail(&still_in_list, + &xe->pinned.external_vram); + spin_unlock(&xe->pinned.lock); + return ret; + } + + spin_lock(&xe->pinned.lock); + } + list_splice_tail(&still_in_list, &xe->pinned.external_vram); + spin_unlock(&xe->pinned.lock); + + /* Wait for validate to complete */ + for_each_tile(tile, xe, id) + xe_tile_migrate_wait(tile); + + return 0; +} diff --git a/drivers/gpu/drm/xe/xe_bo_evict.h b/drivers/gpu/drm/xe/xe_bo_evict.h new file mode 100644 index 000000000000..746894798852 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_bo_evict.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_BO_EVICT_H_ +#define _XE_BO_EVICT_H_ + +struct xe_device; + +int xe_bo_evict_all(struct xe_device *xe); +int xe_bo_restore_kernel(struct xe_device *xe); +int xe_bo_restore_user(struct xe_device *xe); + +#endif diff --git a/drivers/gpu/drm/xe/xe_bo_types.h b/drivers/gpu/drm/xe/xe_bo_types.h new file mode 100644 index 000000000000..64c2249a4e40 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_bo_types.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_BO_TYPES_H_ +#define _XE_BO_TYPES_H_ + +#include <linux/iosys-map.h> + +#include <drm/drm_mm.h> +#include <drm/ttm/ttm_bo.h> +#include <drm/ttm/ttm_device.h> +#include <drm/ttm/ttm_execbuf_util.h> +#include <drm/ttm/ttm_placement.h> + +struct xe_device; +struct xe_vm; + +#define XE_BO_MAX_PLACEMENTS 3 + +/* TODO: To be selected with VM_MADVISE */ +#define XE_BO_PRIORITY_NORMAL 1 + +/** 
@xe_bo: XE buffer object */ +struct xe_bo { + /** @ttm: TTM base buffer object */ + struct ttm_buffer_object ttm; + /** @size: Size of this buffer object */ + size_t size; + /** @flags: flags for this buffer object */ + u32 flags; + /** @vm: VM this BO is attached to, for extobj this will be NULL */ + struct xe_vm *vm; + /** @tile: Tile this BO is attached to (kernel BO only) */ + struct xe_tile *tile; + /** @placements: valid placements for this BO */ + struct ttm_place placements[XE_BO_MAX_PLACEMENTS]; + /** @placement: current placement for this BO */ + struct ttm_placement placement; + /** @ggtt_node: GGTT node if this BO is mapped in the GGTT */ + struct drm_mm_node ggtt_node; + /** @vmap: iosys map of this buffer */ + struct iosys_map vmap; + /** @ttm_kmap: TTM bo kmap object for internal use only. Keep off. */ + struct ttm_bo_kmap_obj kmap; + /** @pinned_link: link to present / evicted list of pinned BO */ + struct list_head pinned_link; +#ifdef CONFIG_PROC_FS + /** + * @client: @xe_drm_client which created the bo + */ + struct xe_drm_client *client; + /** + * @client_link: Link into @xe_drm_client.objects_list + */ + struct list_head client_link; +#endif + /** @props: BO user controlled properties */ + struct { + /** @preferred_mem: preferred memory class for this BO */ + s16 preferred_mem_class; + /** @prefered_gt: preferred GT for this BO */ + s16 preferred_gt; + /** @preferred_mem_type: preferred memory type */ + s32 preferred_mem_type; + /** + * @cpu_atomic: the CPU expects to do atomics operations to + * this BO + */ + bool cpu_atomic; + /** + * @device_atomic: the device expects to do atomics operations + * to this BO + */ + bool device_atomic; + } props; + /** @freed: List node for delayed put. */ + struct llist_node freed; + /** @created: Whether the bo has passed initial creation */ + bool created; + + /** @ccs_cleared */ + bool ccs_cleared; + + /** + * @cpu_caching: CPU caching mode. Currently only used for userspace + * objects. 
+ */ + u16 cpu_caching; +}; + +#define intel_bo_to_drm_bo(bo) (&(bo)->ttm.base) +#define intel_bo_to_i915(bo) to_i915(intel_bo_to_drm_bo(bo)->dev) + +#endif diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c new file mode 100644 index 000000000000..c56fd7d59f05 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_debugfs.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_debugfs.h" + +#include <linux/string_helpers.h> + +#include <drm/drm_debugfs.h> + +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_gt_debugfs.h" +#include "xe_step.h" + +#ifdef CONFIG_DRM_XE_DEBUG +#include "xe_bo_evict.h" +#include "xe_migrate.h" +#include "xe_vm.h" +#endif + +#ifdef CONFIG_FAULT_INJECTION +#include <linux/fault-inject.h> /* XXX: fault-inject.h is broken */ +DECLARE_FAULT_ATTR(gt_reset_failure); +#endif + +static struct xe_device *node_to_xe(struct drm_info_node *node) +{ + return to_xe_device(node->minor->dev); +} + +static int info(struct seq_file *m, void *data) +{ + struct xe_device *xe = node_to_xe(m->private); + struct drm_printer p = drm_seq_file_printer(m); + struct xe_gt *gt; + u8 id; + + drm_printf(&p, "graphics_verx100 %d\n", xe->info.graphics_verx100); + drm_printf(&p, "media_verx100 %d\n", xe->info.media_verx100); + drm_printf(&p, "stepping G:%s M:%s D:%s B:%s\n", + xe_step_name(xe->info.step.graphics), + xe_step_name(xe->info.step.media), + xe_step_name(xe->info.step.display), + xe_step_name(xe->info.step.basedie)); + drm_printf(&p, "is_dgfx %s\n", str_yes_no(xe->info.is_dgfx)); + drm_printf(&p, "platform %d\n", xe->info.platform); + drm_printf(&p, "subplatform %d\n", + xe->info.subplatform > XE_SUBPLATFORM_NONE ? xe->info.subplatform : 0); + drm_printf(&p, "devid 0x%x\n", xe->info.devid); + drm_printf(&p, "revid %d\n", xe->info.revid); + drm_printf(&p, "tile_count %d\n", xe->info.tile_count); + drm_printf(&p, "vm_max_level %d\n", xe->info.vm_max_level); + drm_printf(&p, "force_execlist %s\n", str_yes_no(xe->info.force_execlist)); + drm_printf(&p, "has_flat_ccs %s\n", str_yes_no(xe->info.has_flat_ccs)); + drm_printf(&p, "has_usm %s\n", str_yes_no(xe->info.has_usm)); + for_each_gt(gt, xe, id) { + drm_printf(&p, "gt%d force wake %d\n", id, + xe_force_wake_ref(gt_to_fw(gt), XE_FW_GT)); + drm_printf(&p, "gt%d engine_mask 0x%llx\n", id, + gt->info.engine_mask); + } + + return 0; +} + +static const struct drm_info_list debugfs_list[] = { + {"info", info, 0}, +}; + +static int forcewake_open(struct inode *inode, struct file *file) +{ + struct xe_device *xe = inode->i_private; + struct xe_gt *gt; + u8 id; + + xe_device_mem_access_get(xe); + + for_each_gt(gt, xe, id) + XE_WARN_ON(xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + + return 0; +} + +static int forcewake_release(struct inode *inode, struct file *file) +{ + struct xe_device *xe = inode->i_private; + struct xe_gt *gt; + u8 id; + + for_each_gt(gt, xe, id) + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + + xe_device_mem_access_put(xe); + + return 0; +} + +static const struct file_operations forcewake_all_fops = { + .owner = THIS_MODULE, + .open = forcewake_open, + .release = forcewake_release, +}; + +void xe_debugfs_register(struct xe_device *xe) +{ + struct ttm_device *bdev = &xe->ttm; + struct drm_minor *minor = xe->drm.primary; + struct dentry *root = minor->debugfs_root; + struct ttm_resource_manager *man; + struct xe_gt *gt; + u32 mem_type; + u8 id; + + drm_debugfs_create_files(debugfs_list, + ARRAY_SIZE(debugfs_list), + 
root, minor); + + debugfs_create_file("forcewake_all", 0400, root, xe, + &forcewake_all_fops); + + for (mem_type = XE_PL_VRAM0; mem_type <= XE_PL_VRAM1; ++mem_type) { + man = ttm_manager_type(bdev, mem_type); + + if (man) { + char name[16]; + + sprintf(name, "vram%d_mm", mem_type - XE_PL_VRAM0); + ttm_resource_manager_create_debugfs(man, root, name); + } + } + + man = ttm_manager_type(bdev, XE_PL_TT); + ttm_resource_manager_create_debugfs(man, root, "gtt_mm"); + + man = ttm_manager_type(bdev, XE_PL_STOLEN); + if (man) + ttm_resource_manager_create_debugfs(man, root, "stolen_mm"); + + for_each_gt(gt, xe, id) + xe_gt_debugfs_register(gt); + +#ifdef CONFIG_FAULT_INJECTION + fault_create_debugfs_attr("fail_gt_reset", root, >_reset_failure); +#endif + +} diff --git a/drivers/gpu/drm/xe/xe_debugfs.h b/drivers/gpu/drm/xe/xe_debugfs.h new file mode 100644 index 000000000000..715b8e2e0bd9 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_debugfs.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_DEBUGFS_H_ +#define _XE_DEBUGFS_H_ + +struct xe_device; + +void xe_debugfs_register(struct xe_device *xe); + +#endif diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c new file mode 100644 index 000000000000..68abc0b195be --- /dev/null +++ b/drivers/gpu/drm/xe/xe_devcoredump.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include "xe_devcoredump.h" +#include "xe_devcoredump_types.h" + +#include <linux/devcoredump.h> +#include <generated/utsrelease.h> + +#include "xe_device.h" +#include "xe_exec_queue.h" +#include "xe_force_wake.h" +#include "xe_gt.h" +#include "xe_guc_ct.h" +#include "xe_guc_submit.h" +#include "xe_hw_engine.h" + +/** + * DOC: Xe device coredump + * + * Devices overview: + * Xe uses dev_coredump infrastructure for exposing the crash errors in a + * standardized way. + * devcoredump exposes a temporary device under /sys/class/devcoredump/ + * which is linked with our card device directly. + * The core dump can be accessed either from + * /sys/class/drm/card<n>/device/devcoredump/ or from + * /sys/class/devcoredump/devcd<m> where + * /sys/class/devcoredump/devcd<m>/failing_device is a link to + * /sys/class/drm/card<n>/device/. + * + * Snapshot at hang: + * The 'data' file is printed with a drm_printer pointer at devcoredump read + * time. For this reason, we need to take snapshots from when the hang has + * happened, and not only when the user is reading the file. Otherwise the + * information is outdated since the resets might have happened in between. + * + * 'First' failure snapshot: + * In general, the first hang is the most critical one since the following hangs + * can be a consequence of the initial hang. For this reason we only take the + * snapshot of the 'first' failure and ignore subsequent calls of this function, + * at least while the coredump device is alive. Dev_coredump has a delayed work + * queue that will eventually delete the device and free all the dump + * information. 
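+ *
+ * For example, if the xe device is bound to card0 and its dump was registered
+ * as devcd2, the same dump is readable from both
+ * /sys/class/drm/card0/device/devcoredump/data and
+ * /sys/class/devcoredump/devcd2/data, and writing anything to either data file
+ * releases the dump before the timeout expires.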
+ */ + +#ifdef CONFIG_DEV_COREDUMP + +static struct xe_device *coredump_to_xe(const struct xe_devcoredump *coredump) +{ + return container_of(coredump, struct xe_device, devcoredump); +} + +static struct xe_guc *exec_queue_to_guc(struct xe_exec_queue *q) +{ + return &q->gt->uc.guc; +} + +static ssize_t xe_devcoredump_read(char *buffer, loff_t offset, + size_t count, void *data, size_t datalen) +{ + struct xe_devcoredump *coredump = data; + struct xe_devcoredump_snapshot *ss; + struct drm_printer p; + struct drm_print_iterator iter; + struct timespec64 ts; + int i; + + /* Our device is gone already... */ + if (!data || !coredump_to_xe(coredump)) + return -ENODEV; + + iter.data = buffer; + iter.offset = 0; + iter.start = offset; + iter.remain = count; + + ss = &coredump->snapshot; + p = drm_coredump_printer(&iter); + + drm_printf(&p, "**** Xe Device Coredump ****\n"); + drm_printf(&p, "kernel: " UTS_RELEASE "\n"); + drm_printf(&p, "module: " KBUILD_MODNAME "\n"); + + ts = ktime_to_timespec64(ss->snapshot_time); + drm_printf(&p, "Snapshot time: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec); + ts = ktime_to_timespec64(ss->boot_time); + drm_printf(&p, "Uptime: %lld.%09ld\n", ts.tv_sec, ts.tv_nsec); + + drm_printf(&p, "\n**** GuC CT ****\n"); + xe_guc_ct_snapshot_print(coredump->snapshot.ct, &p); + xe_guc_exec_queue_snapshot_print(coredump->snapshot.ge, &p); + + drm_printf(&p, "\n**** HW Engines ****\n"); + for (i = 0; i < XE_NUM_HW_ENGINES; i++) + if (coredump->snapshot.hwe[i]) + xe_hw_engine_snapshot_print(coredump->snapshot.hwe[i], + &p); + + return count - iter.remain; +} + +static void xe_devcoredump_free(void *data) +{ + struct xe_devcoredump *coredump = data; + int i; + + /* Our device is gone. Nothing to do... */ + if (!data || !coredump_to_xe(coredump)) + return; + + xe_guc_ct_snapshot_free(coredump->snapshot.ct); + xe_guc_exec_queue_snapshot_free(coredump->snapshot.ge); + for (i = 0; i < XE_NUM_HW_ENGINES; i++) + if (coredump->snapshot.hwe[i]) + xe_hw_engine_snapshot_free(coredump->snapshot.hwe[i]); + + coredump->captured = false; + drm_info(&coredump_to_xe(coredump)->drm, + "Xe device coredump has been deleted.\n"); +} + +static void devcoredump_snapshot(struct xe_devcoredump *coredump, + struct xe_exec_queue *q) +{ + struct xe_devcoredump_snapshot *ss = &coredump->snapshot; + struct xe_guc *guc = exec_queue_to_guc(q); + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + u32 adj_logical_mask = q->logical_mask; + u32 width_mask = (0x1 << q->width) - 1; + int i; + bool cookie; + + ss->snapshot_time = ktime_get_real(); + ss->boot_time = ktime_get_boottime(); + + cookie = dma_fence_begin_signalling(); + for (i = 0; q->width > 1 && i < XE_HW_ENGINE_MAX_INSTANCE;) { + if (adj_logical_mask & BIT(i)) { + adj_logical_mask |= width_mask << i; + i += q->width; + } else { + ++i; + } + } + + xe_force_wake_get(gt_to_fw(q->gt), XE_FORCEWAKE_ALL); + + coredump->snapshot.ct = xe_guc_ct_snapshot_capture(&guc->ct, true); + coredump->snapshot.ge = xe_guc_exec_queue_snapshot_capture(q); + + for_each_hw_engine(hwe, q->gt, id) { + if (hwe->class != q->hwe->class || + !(BIT(hwe->logical_instance) & adj_logical_mask)) { + coredump->snapshot.hwe[id] = NULL; + continue; + } + coredump->snapshot.hwe[id] = xe_hw_engine_snapshot_capture(hwe); + } + + xe_force_wake_put(gt_to_fw(q->gt), XE_FORCEWAKE_ALL); + dma_fence_end_signalling(cookie); +} + +/** + * xe_devcoredump - Take the required snapshots and initialize coredump device. + * @q: The faulty xe_exec_queue, where the issue was detected. 
+ * + * This function should be called at the crash time within the serialized + * gt_reset. It is skipped if we still have the core dump device available + * with the information of the 'first' snapshot. + */ +void xe_devcoredump(struct xe_exec_queue *q) +{ + struct xe_device *xe = gt_to_xe(q->gt); + struct xe_devcoredump *coredump = &xe->devcoredump; + + if (coredump->captured) { + drm_dbg(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n"); + return; + } + + coredump->captured = true; + devcoredump_snapshot(coredump, q); + + drm_info(&xe->drm, "Xe device coredump has been created\n"); + drm_info(&xe->drm, "Check your /sys/class/drm/card%d/device/devcoredump/data\n", + xe->drm.primary->index); + + dev_coredumpm(xe->drm.dev, THIS_MODULE, coredump, 0, GFP_KERNEL, + xe_devcoredump_read, xe_devcoredump_free); +} +#endif diff --git a/drivers/gpu/drm/xe/xe_devcoredump.h b/drivers/gpu/drm/xe/xe_devcoredump.h new file mode 100644 index 000000000000..6ac218a5c194 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_devcoredump.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_DEVCOREDUMP_H_ +#define _XE_DEVCOREDUMP_H_ + +struct xe_device; +struct xe_exec_queue; + +#ifdef CONFIG_DEV_COREDUMP +void xe_devcoredump(struct xe_exec_queue *q); +#else +static inline void xe_devcoredump(struct xe_exec_queue *q) +{ +} +#endif + +#endif diff --git a/drivers/gpu/drm/xe/xe_devcoredump_types.h b/drivers/gpu/drm/xe/xe_devcoredump_types.h new file mode 100644 index 000000000000..7fdad9c3d3dd --- /dev/null +++ b/drivers/gpu/drm/xe/xe_devcoredump_types.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_DEVCOREDUMP_TYPES_H_ +#define _XE_DEVCOREDUMP_TYPES_H_ + +#include <linux/ktime.h> +#include <linux/mutex.h> + +#include "xe_hw_engine_types.h" + +struct xe_device; + +/** + * struct xe_devcoredump_snapshot - Crash snapshot + * + * This struct contains all the useful information quickly captured at the time + * of the crash. So, any subsequent reads of the coredump points to a data that + * shows the state of the GPU of when the issue has happened. + */ +struct xe_devcoredump_snapshot { + /** @snapshot_time: Time of this capture. */ + ktime_t snapshot_time; + /** @boot_time: Relative boot time so the uptime can be calculated. */ + ktime_t boot_time; + + /* GuC snapshots */ + /** @ct: GuC CT snapshot */ + struct xe_guc_ct_snapshot *ct; + /** @ge: Guc Engine snapshot */ + struct xe_guc_submit_exec_queue_snapshot *ge; + /** @hwe: HW Engine snapshot array */ + struct xe_hw_engine_snapshot *hwe[XE_NUM_HW_ENGINES]; +}; + +/** + * struct xe_devcoredump - Xe devcoredump main structure + * + * This struct represents the live and active dev_coredump node. + * It is created/populated at the time of a crash/error. Then it + * is read later when user access the device coredump data file + * for reading the information. + */ +struct xe_devcoredump { + /** @xe: Xe device. */ + struct xe_device *xe; + /** @captured: The snapshot of the first hang has already been taken. 
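+	 * While this stays set, later hangs only produce a debug message and
+	 * no new snapshot; it is cleared again when the dump is freed.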
*/ + bool captured; + /** @snapshot: Snapshot is captured at time of the first crash */ + struct xe_devcoredump_snapshot snapshot; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c new file mode 100644 index 000000000000..d9ae77fe7382 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_device.c @@ -0,0 +1,700 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include "xe_device.h" + +#include <linux/units.h> + +#include <drm/drm_aperture.h> +#include <drm/drm_atomic_helper.h> +#include <drm/drm_gem_ttm_helper.h> +#include <drm/drm_ioctl.h> +#include <drm/drm_managed.h> +#include <drm/drm_print.h> +#include <drm/xe_drm.h> + +#include "regs/xe_gt_regs.h" +#include "regs/xe_regs.h" +#include "xe_bo.h" +#include "xe_debugfs.h" +#include "xe_display.h" +#include "xe_dma_buf.h" +#include "xe_drm_client.h" +#include "xe_drv.h" +#include "xe_exec_queue.h" +#include "xe_exec.h" +#include "xe_ggtt.h" +#include "xe_gt.h" +#include "xe_gt_mcr.h" +#include "xe_irq.h" +#include "xe_mmio.h" +#include "xe_module.h" +#include "xe_pat.h" +#include "xe_pcode.h" +#include "xe_pm.h" +#include "xe_query.h" +#include "xe_tile.h" +#include "xe_ttm_stolen_mgr.h" +#include "xe_ttm_sys_mgr.h" +#include "xe_vm.h" +#include "xe_wait_user_fence.h" +#include "xe_hwmon.h" + +#ifdef CONFIG_LOCKDEP +struct lockdep_map xe_device_mem_access_lockdep_map = { + .name = "xe_device_mem_access_lockdep_map" +}; +#endif + +static int xe_file_open(struct drm_device *dev, struct drm_file *file) +{ + struct xe_device *xe = to_xe_device(dev); + struct xe_drm_client *client; + struct xe_file *xef; + int ret = -ENOMEM; + + xef = kzalloc(sizeof(*xef), GFP_KERNEL); + if (!xef) + return ret; + + client = xe_drm_client_alloc(); + if (!client) { + kfree(xef); + return ret; + } + + xef->drm = file; + xef->client = client; + xef->xe = xe; + + mutex_init(&xef->vm.lock); + xa_init_flags(&xef->vm.xa, XA_FLAGS_ALLOC1); + + mutex_init(&xef->exec_queue.lock); + xa_init_flags(&xef->exec_queue.xa, XA_FLAGS_ALLOC1); + + spin_lock(&xe->clients.lock); + xe->clients.count++; + spin_unlock(&xe->clients.lock); + + file->driver_priv = xef; + return 0; +} + +static void device_kill_persistent_exec_queues(struct xe_device *xe, + struct xe_file *xef); + +static void xe_file_close(struct drm_device *dev, struct drm_file *file) +{ + struct xe_device *xe = to_xe_device(dev); + struct xe_file *xef = file->driver_priv; + struct xe_vm *vm; + struct xe_exec_queue *q; + unsigned long idx; + + mutex_lock(&xef->exec_queue.lock); + xa_for_each(&xef->exec_queue.xa, idx, q) { + xe_exec_queue_kill(q); + xe_exec_queue_put(q); + } + mutex_unlock(&xef->exec_queue.lock); + xa_destroy(&xef->exec_queue.xa); + mutex_destroy(&xef->exec_queue.lock); + device_kill_persistent_exec_queues(xe, xef); + + mutex_lock(&xef->vm.lock); + xa_for_each(&xef->vm.xa, idx, vm) + xe_vm_close_and_put(vm); + mutex_unlock(&xef->vm.lock); + xa_destroy(&xef->vm.xa); + mutex_destroy(&xef->vm.lock); + + spin_lock(&xe->clients.lock); + xe->clients.count--; + spin_unlock(&xe->clients.lock); + + xe_drm_client_put(xef->client); + kfree(xef); +} + +static const struct drm_ioctl_desc xe_ioctls[] = { + DRM_IOCTL_DEF_DRV(XE_DEVICE_QUERY, xe_query_ioctl, DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(XE_GEM_CREATE, xe_gem_create_ioctl, DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(XE_GEM_MMAP_OFFSET, xe_gem_mmap_offset_ioctl, + DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(XE_VM_CREATE, xe_vm_create_ioctl, DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(XE_VM_DESTROY, 
xe_vm_destroy_ioctl, DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(XE_VM_BIND, xe_vm_bind_ioctl, DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(XE_EXEC, xe_exec_ioctl, DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(XE_EXEC_QUEUE_CREATE, xe_exec_queue_create_ioctl, + DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(XE_EXEC_QUEUE_DESTROY, xe_exec_queue_destroy_ioctl, + DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(XE_EXEC_QUEUE_GET_PROPERTY, xe_exec_queue_get_property_ioctl, + DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(XE_WAIT_USER_FENCE, xe_wait_user_fence_ioctl, + DRM_RENDER_ALLOW), +}; + +static const struct file_operations xe_driver_fops = { + .owner = THIS_MODULE, + .open = drm_open, + .release = drm_release_noglobal, + .unlocked_ioctl = drm_ioctl, + .mmap = drm_gem_mmap, + .poll = drm_poll, + .read = drm_read, + .compat_ioctl = drm_compat_ioctl, + .llseek = noop_llseek, +#ifdef CONFIG_PROC_FS + .show_fdinfo = drm_show_fdinfo, +#endif +}; + +static void xe_driver_release(struct drm_device *dev) +{ + struct xe_device *xe = to_xe_device(dev); + + pci_set_drvdata(to_pci_dev(xe->drm.dev), NULL); +} + +static struct drm_driver driver = { + /* Don't use MTRRs here; the Xserver or userspace app should + * deal with them for Intel hardware. + */ + .driver_features = + DRIVER_GEM | + DRIVER_RENDER | DRIVER_SYNCOBJ | + DRIVER_SYNCOBJ_TIMELINE | DRIVER_GEM_GPUVA, + .open = xe_file_open, + .postclose = xe_file_close, + + .gem_prime_import = xe_gem_prime_import, + + .dumb_create = xe_bo_dumb_create, + .dumb_map_offset = drm_gem_ttm_dumb_map_offset, +#ifdef CONFIG_PROC_FS + .show_fdinfo = xe_drm_client_fdinfo, +#endif + .release = &xe_driver_release, + + .ioctls = xe_ioctls, + .num_ioctls = ARRAY_SIZE(xe_ioctls), + .fops = &xe_driver_fops, + .name = DRIVER_NAME, + .desc = DRIVER_DESC, + .date = DRIVER_DATE, + .major = DRIVER_MAJOR, + .minor = DRIVER_MINOR, + .patchlevel = DRIVER_PATCHLEVEL, +}; + +static void xe_device_destroy(struct drm_device *dev, void *dummy) +{ + struct xe_device *xe = to_xe_device(dev); + + if (xe->ordered_wq) + destroy_workqueue(xe->ordered_wq); + + if (xe->unordered_wq) + destroy_workqueue(xe->unordered_wq); + + ttm_device_fini(&xe->ttm); +} + +struct xe_device *xe_device_create(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct xe_device *xe; + int err; + + xe_display_driver_set_hooks(&driver); + + err = drm_aperture_remove_conflicting_pci_framebuffers(pdev, &driver); + if (err) + return ERR_PTR(err); + + xe = devm_drm_dev_alloc(&pdev->dev, &driver, struct xe_device, drm); + if (IS_ERR(xe)) + return xe; + + err = ttm_device_init(&xe->ttm, &xe_ttm_funcs, xe->drm.dev, + xe->drm.anon_inode->i_mapping, + xe->drm.vma_offset_manager, false, false); + if (WARN_ON(err)) + goto err; + + err = drmm_add_action_or_reset(&xe->drm, xe_device_destroy, NULL); + if (err) + goto err; + + xe->info.devid = pdev->device; + xe->info.revid = pdev->revision; + xe->info.force_execlist = xe_modparam.force_execlist; + + spin_lock_init(&xe->irq.lock); + spin_lock_init(&xe->clients.lock); + + init_waitqueue_head(&xe->ufence_wq); + + drmm_mutex_init(&xe->drm, &xe->usm.lock); + xa_init_flags(&xe->usm.asid_to_vm, XA_FLAGS_ALLOC); + + if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) { + /* Trigger a large asid and an early asid wrap. 
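+		 * Allocating a single ASID at the very top of the range and
+		 * erasing it again below advances the cyclic cursor, so real
+		 * allocations start out large and exercise the wrap-around
+		 * path early on debug builds.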
*/ + u32 asid; + + BUILD_BUG_ON(XE_MAX_ASID < 2); + err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, NULL, + XA_LIMIT(XE_MAX_ASID - 2, XE_MAX_ASID - 1), + &xe->usm.next_asid, GFP_KERNEL); + drm_WARN_ON(&xe->drm, err); + if (err >= 0) + xa_erase(&xe->usm.asid_to_vm, asid); + } + + drmm_mutex_init(&xe->drm, &xe->persistent_engines.lock); + INIT_LIST_HEAD(&xe->persistent_engines.list); + + spin_lock_init(&xe->pinned.lock); + INIT_LIST_HEAD(&xe->pinned.kernel_bo_present); + INIT_LIST_HEAD(&xe->pinned.external_vram); + INIT_LIST_HEAD(&xe->pinned.evicted); + + xe->ordered_wq = alloc_ordered_workqueue("xe-ordered-wq", 0); + xe->unordered_wq = alloc_workqueue("xe-unordered-wq", 0, 0); + if (!xe->ordered_wq || !xe->unordered_wq) { + drm_err(&xe->drm, "Failed to allocate xe workqueues\n"); + err = -ENOMEM; + goto err; + } + + err = xe_display_create(xe); + if (WARN_ON(err)) + goto err; + + return xe; + +err: + return ERR_PTR(err); +} + +/* + * The driver-initiated FLR is the highest level of reset that we can trigger + * from within the driver. It is different from the PCI FLR in that it doesn't + * fully reset the SGUnit and doesn't modify the PCI config space and therefore + * it doesn't require a re-enumeration of the PCI BARs. However, the + * driver-initiated FLR does still cause a reset of both GT and display and a + * memory wipe of local and stolen memory, so recovery would require a full HW + * re-init and saving/restoring (or re-populating) the wiped memory. Since we + * perform the FLR as the very last action before releasing access to the HW + * during the driver release flow, we don't attempt recovery at all, because + * if/when a new instance of i915 is bound to the device it will do a full + * re-init anyway. + */ +static void xe_driver_flr(struct xe_device *xe) +{ + const unsigned int flr_timeout = 3 * MICRO; /* specs recommend a 3s wait */ + struct xe_gt *gt = xe_root_mmio_gt(xe); + int ret; + + if (xe_mmio_read32(gt, GU_CNTL_PROTECTED) & DRIVERINT_FLR_DIS) { + drm_info_once(&xe->drm, "BIOS Disabled Driver-FLR\n"); + return; + } + + drm_dbg(&xe->drm, "Triggering Driver-FLR\n"); + + /* + * Make sure any pending FLR requests have cleared by waiting for the + * FLR trigger bit to go to zero. Also clear GU_DEBUG's DRIVERFLR_STATUS + * to make sure it's not still set from a prior attempt (it's a write to + * clear bit). + * Note that we should never be in a situation where a previous attempt + * is still pending (unless the HW is totally dead), but better to be + * safe in case something unexpected happens + */ + ret = xe_mmio_wait32(gt, GU_CNTL, DRIVERFLR, 0, flr_timeout, NULL, false); + if (ret) { + drm_err(&xe->drm, "Driver-FLR-prepare wait for ready failed! %d\n", ret); + return; + } + xe_mmio_write32(gt, GU_DEBUG, DRIVERFLR_STATUS); + + /* Trigger the actual Driver-FLR */ + xe_mmio_rmw32(gt, GU_CNTL, 0, DRIVERFLR); + + /* Wait for hardware teardown to complete */ + ret = xe_mmio_wait32(gt, GU_CNTL, DRIVERFLR, 0, flr_timeout, NULL, false); + if (ret) { + drm_err(&xe->drm, "Driver-FLR-teardown wait completion failed! %d\n", ret); + return; + } + + /* Wait for hardware/firmware re-init to complete */ + ret = xe_mmio_wait32(gt, GU_DEBUG, DRIVERFLR_STATUS, DRIVERFLR_STATUS, + flr_timeout, NULL, false); + if (ret) { + drm_err(&xe->drm, "Driver-FLR-reinit wait completion failed! 
%d\n", ret); + return; + } + + /* Clear sticky completion status */ + xe_mmio_write32(gt, GU_DEBUG, DRIVERFLR_STATUS); +} + +static void xe_driver_flr_fini(struct drm_device *drm, void *arg) +{ + struct xe_device *xe = arg; + + if (xe->needs_flr_on_fini) + xe_driver_flr(xe); +} + +static void xe_device_sanitize(struct drm_device *drm, void *arg) +{ + struct xe_device *xe = arg; + struct xe_gt *gt; + u8 id; + + for_each_gt(gt, xe, id) + xe_gt_sanitize(gt); +} + +static int xe_set_dma_info(struct xe_device *xe) +{ + unsigned int mask_size = xe->info.dma_mask_size; + int err; + + dma_set_max_seg_size(xe->drm.dev, xe_sg_segment_size(xe->drm.dev)); + + err = dma_set_mask(xe->drm.dev, DMA_BIT_MASK(mask_size)); + if (err) + goto mask_err; + + err = dma_set_coherent_mask(xe->drm.dev, DMA_BIT_MASK(mask_size)); + if (err) + goto mask_err; + + return 0; + +mask_err: + drm_err(&xe->drm, "Can't set DMA mask/consistent mask (%d)\n", err); + return err; +} + +/* + * Initialize MMIO resources that don't require any knowledge about tile count. + */ +int xe_device_probe_early(struct xe_device *xe) +{ + int err; + + err = xe_mmio_init(xe); + if (err) + return err; + + err = xe_mmio_root_tile_init(xe); + if (err) + return err; + + return 0; +} + +static int xe_device_set_has_flat_ccs(struct xe_device *xe) +{ + u32 reg; + int err; + + if (GRAPHICS_VER(xe) < 20 || !xe->info.has_flat_ccs) + return 0; + + struct xe_gt *gt = xe_root_mmio_gt(xe); + + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + if (err) + return err; + + reg = xe_gt_mcr_unicast_read_any(gt, XE2_FLAT_CCS_BASE_RANGE_LOWER); + xe->info.has_flat_ccs = (reg & XE2_FLAT_CCS_ENABLE); + + if (!xe->info.has_flat_ccs) + drm_dbg(&xe->drm, + "Flat CCS has been disabled in bios, May lead to performance impact"); + + return xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); +} + +int xe_device_probe(struct xe_device *xe) +{ + struct xe_tile *tile; + struct xe_gt *gt; + int err; + u8 id; + + xe_pat_init_early(xe); + + xe->info.mem_region_mask = 1; + err = xe_display_init_nommio(xe); + if (err) + return err; + + err = xe_set_dma_info(xe); + if (err) + return err; + + xe_mmio_probe_tiles(xe); + + xe_ttm_sys_mgr_init(xe); + + for_each_gt(gt, xe, id) + xe_force_wake_init_gt(gt, gt_to_fw(gt)); + + for_each_tile(tile, xe, id) { + err = xe_ggtt_init_early(tile->mem.ggtt); + if (err) + return err; + } + + err = drmm_add_action_or_reset(&xe->drm, xe_driver_flr_fini, xe); + if (err) + return err; + + for_each_gt(gt, xe, id) { + err = xe_pcode_probe(gt); + if (err) + return err; + } + + err = xe_display_init_noirq(xe); + if (err) + return err; + + err = xe_irq_install(xe); + if (err) + goto err; + + for_each_gt(gt, xe, id) { + err = xe_gt_init_early(gt); + if (err) + goto err_irq_shutdown; + } + + err = xe_device_set_has_flat_ccs(xe); + if (err) + return err; + + err = xe_mmio_probe_vram(xe); + if (err) + goto err_irq_shutdown; + + for_each_tile(tile, xe, id) { + err = xe_tile_init_noalloc(tile); + if (err) + goto err_irq_shutdown; + } + + /* Allocate and map stolen after potential VRAM resize */ + xe_ttm_stolen_mgr_init(xe); + + /* + * Now that GT is initialized (TTM in particular), + * we can try to init display, and inherit the initial fb. + * This is the reason the first allocation needs to be done + * inside display. 
+ */ + err = xe_display_init_noaccel(xe); + if (err) + goto err_irq_shutdown; + + for_each_gt(gt, xe, id) { + err = xe_gt_init(gt); + if (err) + goto err_irq_shutdown; + } + + xe_heci_gsc_init(xe); + + err = xe_display_init(xe); + if (err) + goto err_irq_shutdown; + + err = drm_dev_register(&xe->drm, 0); + if (err) + goto err_fini_display; + + xe_display_register(xe); + + xe_debugfs_register(xe); + + xe_hwmon_register(xe); + + err = drmm_add_action_or_reset(&xe->drm, xe_device_sanitize, xe); + if (err) + return err; + + return 0; + +err_fini_display: + xe_display_driver_remove(xe); + +err_irq_shutdown: + xe_irq_shutdown(xe); +err: + xe_display_fini(xe); + return err; +} + +static void xe_device_remove_display(struct xe_device *xe) +{ + xe_display_unregister(xe); + + drm_dev_unplug(&xe->drm); + xe_display_driver_remove(xe); +} + +void xe_device_remove(struct xe_device *xe) +{ + xe_device_remove_display(xe); + + xe_display_fini(xe); + + xe_heci_gsc_fini(xe); + + xe_irq_shutdown(xe); +} + +void xe_device_shutdown(struct xe_device *xe) +{ +} + +void xe_device_add_persistent_exec_queues(struct xe_device *xe, struct xe_exec_queue *q) +{ + mutex_lock(&xe->persistent_engines.lock); + list_add_tail(&q->persistent.link, &xe->persistent_engines.list); + mutex_unlock(&xe->persistent_engines.lock); +} + +void xe_device_remove_persistent_exec_queues(struct xe_device *xe, + struct xe_exec_queue *q) +{ + mutex_lock(&xe->persistent_engines.lock); + if (!list_empty(&q->persistent.link)) + list_del(&q->persistent.link); + mutex_unlock(&xe->persistent_engines.lock); +} + +static void device_kill_persistent_exec_queues(struct xe_device *xe, + struct xe_file *xef) +{ + struct xe_exec_queue *q, *next; + + mutex_lock(&xe->persistent_engines.lock); + list_for_each_entry_safe(q, next, &xe->persistent_engines.list, + persistent.link) + if (q->persistent.xef == xef) { + xe_exec_queue_kill(q); + list_del_init(&q->persistent.link); + } + mutex_unlock(&xe->persistent_engines.lock); +} + +void xe_device_wmb(struct xe_device *xe) +{ + struct xe_gt *gt = xe_root_mmio_gt(xe); + + wmb(); + if (IS_DGFX(xe)) + xe_mmio_write32(gt, SOFTWARE_FLAGS_SPR33, 0); +} + +u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size) +{ + return xe_device_has_flat_ccs(xe) ? + DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE(xe)) : 0; +} + +bool xe_device_mem_access_ongoing(struct xe_device *xe) +{ + if (xe_pm_read_callback_task(xe) != NULL) + return true; + + return atomic_read(&xe->mem_access.ref); +} + +void xe_device_assert_mem_access(struct xe_device *xe) +{ + XE_WARN_ON(!xe_device_mem_access_ongoing(xe)); +} + +bool xe_device_mem_access_get_if_ongoing(struct xe_device *xe) +{ + bool active; + + if (xe_pm_read_callback_task(xe) == current) + return true; + + active = xe_pm_runtime_get_if_active(xe); + if (active) { + int ref = atomic_inc_return(&xe->mem_access.ref); + + xe_assert(xe, ref != S32_MAX); + } + + return active; +} + +void xe_device_mem_access_get(struct xe_device *xe) +{ + int ref; + + /* + * This looks racy, but should be fine since the pm_callback_task only + * transitions from NULL -> current (and back to NULL again), during the + * runtime_resume() or runtime_suspend() callbacks, for which there can + * only be a single one running for our device. We only need to prevent + * recursively calling the runtime_get or runtime_put from those + * callbacks, as well as preventing triggering any access_ongoing + * asserts. 
+ */ + if (xe_pm_read_callback_task(xe) == current) + return; + + /* + * Since the resume here is synchronous it can be quite easy to deadlock + * if we are not careful. Also in practice it might be quite timing + * sensitive to ever see the 0 -> 1 transition with the callers locks + * held, so deadlocks might exist but are hard for lockdep to ever see. + * With this in mind, help lockdep learn about the potentially scary + * stuff that can happen inside the runtime_resume callback by acquiring + * a dummy lock (it doesn't protect anything and gets compiled out on + * non-debug builds). Lockdep then only needs to see the + * mem_access_lockdep_map -> runtime_resume callback once, and then can + * hopefully validate all the (callers_locks) -> mem_access_lockdep_map. + * For example if the (callers_locks) are ever grabbed in the + * runtime_resume callback, lockdep should give us a nice splat. + */ + lock_map_acquire(&xe_device_mem_access_lockdep_map); + lock_map_release(&xe_device_mem_access_lockdep_map); + + xe_pm_runtime_get(xe); + ref = atomic_inc_return(&xe->mem_access.ref); + + xe_assert(xe, ref != S32_MAX); + +} + +void xe_device_mem_access_put(struct xe_device *xe) +{ + int ref; + + if (xe_pm_read_callback_task(xe) == current) + return; + + ref = atomic_dec_return(&xe->mem_access.ref); + xe_pm_runtime_put(xe); + + xe_assert(xe, ref >= 0); +} diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h new file mode 100644 index 000000000000..3da83b233206 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_device.h @@ -0,0 +1,173 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef _XE_DEVICE_H_ +#define _XE_DEVICE_H_ + +struct xe_exec_queue; +struct xe_file; + +#include <drm/drm_util.h> + +#include "regs/xe_gpu_commands.h" +#include "xe_device_types.h" +#include "xe_force_wake.h" +#include "xe_macros.h" + +#ifdef CONFIG_LOCKDEP +extern struct lockdep_map xe_device_mem_access_lockdep_map; +#endif + +static inline struct xe_device *to_xe_device(const struct drm_device *dev) +{ + return container_of(dev, struct xe_device, drm); +} + +static inline struct xe_device *pdev_to_xe_device(struct pci_dev *pdev) +{ + return pci_get_drvdata(pdev); +} + +static inline struct xe_device *ttm_to_xe_device(struct ttm_device *ttm) +{ + return container_of(ttm, struct xe_device, ttm); +} + +struct xe_device *xe_device_create(struct pci_dev *pdev, + const struct pci_device_id *ent); +int xe_device_probe_early(struct xe_device *xe); +int xe_device_probe(struct xe_device *xe); +void xe_device_remove(struct xe_device *xe); +void xe_device_shutdown(struct xe_device *xe); + +void xe_device_add_persistent_exec_queues(struct xe_device *xe, struct xe_exec_queue *q); +void xe_device_remove_persistent_exec_queues(struct xe_device *xe, + struct xe_exec_queue *q); + +void xe_device_wmb(struct xe_device *xe); + +static inline struct xe_file *to_xe_file(const struct drm_file *file) +{ + return file->driver_priv; +} + +static inline struct xe_tile *xe_device_get_root_tile(struct xe_device *xe) +{ + return &xe->tiles[0]; +} + +#define XE_MAX_GT_PER_TILE 2 + +static inline struct xe_gt *xe_tile_get_gt(struct xe_tile *tile, u8 gt_id) +{ + if (drm_WARN_ON(&tile_to_xe(tile)->drm, gt_id > XE_MAX_GT_PER_TILE)) + gt_id = 0; + + return gt_id ? 
tile->media_gt : tile->primary_gt; +} + +static inline struct xe_gt *xe_device_get_gt(struct xe_device *xe, u8 gt_id) +{ + struct xe_tile *root_tile = xe_device_get_root_tile(xe); + struct xe_gt *gt; + + /* + * FIXME: This only works for now because multi-tile and standalone + * media are mutually exclusive on the platforms we have today. + * + * id => GT mapping may change once we settle on how we want to handle + * our UAPI. + */ + if (MEDIA_VER(xe) >= 13) { + gt = xe_tile_get_gt(root_tile, gt_id); + } else { + if (drm_WARN_ON(&xe->drm, gt_id > XE_MAX_TILES_PER_DEVICE)) + gt_id = 0; + + gt = xe->tiles[gt_id].primary_gt; + } + + if (!gt) + return NULL; + + drm_WARN_ON(&xe->drm, gt->info.id != gt_id); + drm_WARN_ON(&xe->drm, gt->info.type == XE_GT_TYPE_UNINITIALIZED); + + return gt; +} + +/* + * Provide a GT structure suitable for performing non-GT MMIO operations against + * the primary tile. Primarily intended for early tile initialization, display + * handling, top-most interrupt enable/disable, etc. Since anything using the + * MMIO handle returned by this function doesn't need GSI offset translation, + * we'll return the primary GT from the root tile. + * + * FIXME: Fix the driver design so that 'gt' isn't the target of all MMIO + * operations. + * + * Returns the primary gt of the root tile. + */ +static inline struct xe_gt *xe_root_mmio_gt(struct xe_device *xe) +{ + return xe_device_get_root_tile(xe)->primary_gt; +} + +static inline bool xe_device_uc_enabled(struct xe_device *xe) +{ + return !xe->info.force_execlist; +} + +#define for_each_tile(tile__, xe__, id__) \ + for ((id__) = 0; (id__) < (xe__)->info.tile_count; (id__)++) \ + for_each_if((tile__) = &(xe__)->tiles[(id__)]) + +#define for_each_remote_tile(tile__, xe__, id__) \ + for ((id__) = 1; (id__) < (xe__)->info.tile_count; (id__)++) \ + for_each_if((tile__) = &(xe__)->tiles[(id__)]) + +/* + * FIXME: This only works for now since multi-tile and standalone media + * happen to be mutually exclusive. Future platforms may change this... 
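+ *
+ * Illustrative usage, given a struct xe_device *xe (this is the same pattern
+ * xe_device_sanitize() in xe_device.c uses)::
+ *
+ *	struct xe_gt *gt;
+ *	u8 id;
+ *
+ *	for_each_gt(gt, xe, id)
+ *		xe_gt_sanitize(gt);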
+ */ +#define for_each_gt(gt__, xe__, id__) \ + for ((id__) = 0; (id__) < (xe__)->info.gt_count; (id__)++) \ + for_each_if((gt__) = xe_device_get_gt((xe__), (id__))) + +static inline struct xe_force_wake *gt_to_fw(struct xe_gt *gt) +{ + return >->mmio.fw; +} + +void xe_device_mem_access_get(struct xe_device *xe); +bool xe_device_mem_access_get_if_ongoing(struct xe_device *xe); +void xe_device_mem_access_put(struct xe_device *xe); + +void xe_device_assert_mem_access(struct xe_device *xe); +bool xe_device_mem_access_ongoing(struct xe_device *xe); + +static inline bool xe_device_in_fault_mode(struct xe_device *xe) +{ + return xe->usm.num_vm_in_fault_mode != 0; +} + +static inline bool xe_device_in_non_fault_mode(struct xe_device *xe) +{ + return xe->usm.num_vm_in_non_fault_mode != 0; +} + +static inline bool xe_device_has_flat_ccs(struct xe_device *xe) +{ + return xe->info.has_flat_ccs; +} + +static inline bool xe_device_has_sriov(struct xe_device *xe) +{ + return xe->info.has_sriov; +} + +u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size); + +#endif diff --git a/drivers/gpu/drm/xe/xe_device_sysfs.c b/drivers/gpu/drm/xe/xe_device_sysfs.c new file mode 100644 index 000000000000..99113a5a2b84 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_device_sysfs.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include <linux/kobject.h> +#include <linux/pci.h> +#include <linux/sysfs.h> + +#include <drm/drm_managed.h> + +#include "xe_device.h" +#include "xe_device_sysfs.h" +#include "xe_pm.h" + +/** + * DOC: Xe device sysfs + * Xe driver requires exposing certain tunable knobs controlled by user space for + * each graphics device. Considering this, we need to add sysfs attributes at device + * level granularity. + * These sysfs attributes will be available under pci device kobj directory. + * + * vram_d3cold_threshold - Report/change vram used threshold(in MB) below + * which vram save/restore is permissible during runtime D3cold entry/exit. 
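+ *
+ * For example, writing 300 to vram_d3cold_threshold permits runtime D3cold
+ * entry only while less than 300 MB of VRAM is in use; reading the attribute
+ * returns the currently programmed threshold.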
+ */ + +static ssize_t +vram_d3cold_threshold_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct xe_device *xe = pdev_to_xe_device(pdev); + int ret; + + if (!xe) + return -EINVAL; + + ret = sysfs_emit(buf, "%d\n", xe->d3cold.vram_threshold); + + return ret; +} + +static ssize_t +vram_d3cold_threshold_store(struct device *dev, struct device_attribute *attr, + const char *buff, size_t count) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct xe_device *xe = pdev_to_xe_device(pdev); + u32 vram_d3cold_threshold; + int ret; + + if (!xe) + return -EINVAL; + + ret = kstrtou32(buff, 0, &vram_d3cold_threshold); + if (ret) + return ret; + + drm_dbg(&xe->drm, "vram_d3cold_threshold: %u\n", vram_d3cold_threshold); + + ret = xe_pm_set_vram_threshold(xe, vram_d3cold_threshold); + + return ret ?: count; +} + +static DEVICE_ATTR_RW(vram_d3cold_threshold); + +static void xe_device_sysfs_fini(struct drm_device *drm, void *arg) +{ + struct xe_device *xe = arg; + + sysfs_remove_file(&xe->drm.dev->kobj, &dev_attr_vram_d3cold_threshold.attr); +} + +void xe_device_sysfs_init(struct xe_device *xe) +{ + struct device *dev = xe->drm.dev; + int ret; + + ret = sysfs_create_file(&dev->kobj, &dev_attr_vram_d3cold_threshold.attr); + if (ret) { + drm_warn(&xe->drm, "Failed to create sysfs file\n"); + return; + } + + ret = drmm_add_action_or_reset(&xe->drm, xe_device_sysfs_fini, xe); + if (ret) + drm_warn(&xe->drm, "Failed to add sysfs fini drm action\n"); +} diff --git a/drivers/gpu/drm/xe/xe_device_sysfs.h b/drivers/gpu/drm/xe/xe_device_sysfs.h new file mode 100644 index 000000000000..38b240684bee --- /dev/null +++ b/drivers/gpu/drm/xe/xe_device_sysfs.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_DEVICE_SYSFS_H_ +#define _XE_DEVICE_SYSFS_H_ + +struct xe_device; + +void xe_device_sysfs_init(struct xe_device *xe); + +#endif diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h new file mode 100644 index 000000000000..c45ef17b3473 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -0,0 +1,545 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022-2023 Intel Corporation + */ + +#ifndef _XE_DEVICE_TYPES_H_ +#define _XE_DEVICE_TYPES_H_ + +#include <linux/pci.h> + +#include <drm/drm_device.h> +#include <drm/drm_file.h> +#include <drm/ttm/ttm_device.h> + +#include "xe_devcoredump_types.h" +#include "xe_heci_gsc.h" +#include "xe_gt_types.h" +#include "xe_lmtt_types.h" +#include "xe_platform_types.h" +#include "xe_pt_types.h" +#include "xe_sriov_types.h" +#include "xe_step_types.h" + +#if IS_ENABLED(CONFIG_DRM_XE_DISPLAY) +#include "soc/intel_pch.h" +#include "intel_display_core.h" +#include "intel_display_device.h" +#endif + +struct xe_ggtt; +struct xe_pat_ops; + +#define XE_BO_INVALID_OFFSET LONG_MAX + +#define GRAPHICS_VER(xe) ((xe)->info.graphics_verx100 / 100) +#define MEDIA_VER(xe) ((xe)->info.media_verx100 / 100) +#define GRAPHICS_VERx100(xe) ((xe)->info.graphics_verx100) +#define MEDIA_VERx100(xe) ((xe)->info.media_verx100) +#define IS_DGFX(xe) ((xe)->info.is_dgfx) +#define HAS_HECI_GSCFI(xe) ((xe)->info.has_heci_gscfi) + +#define XE_VRAM_FLAGS_NEED64K BIT(0) + +#define XE_GT0 0 +#define XE_GT1 1 +#define XE_MAX_TILES_PER_DEVICE (XE_GT1 + 1) + +#define XE_MAX_ASID (BIT(20)) + +#define IS_PLATFORM_STEP(_xe, _platform, min_step, max_step) \ + ((_xe)->info.platform == (_platform) && \ + (_xe)->info.step.graphics >= (min_step) && 
\ + (_xe)->info.step.graphics < (max_step)) +#define IS_SUBPLATFORM_STEP(_xe, _platform, sub, min_step, max_step) \ + ((_xe)->info.platform == (_platform) && \ + (_xe)->info.subplatform == (sub) && \ + (_xe)->info.step.graphics >= (min_step) && \ + (_xe)->info.step.graphics < (max_step)) + +#define tile_to_xe(tile__) \ + _Generic(tile__, \ + const struct xe_tile * : (const struct xe_device *)((tile__)->xe), \ + struct xe_tile * : (tile__)->xe) + +/** + * struct xe_mem_region - memory region structure + * This is used to describe a memory region in xe + * device, such as HBM memory or CXL extension memory. + */ +struct xe_mem_region { + /** @io_start: IO start address of this VRAM instance */ + resource_size_t io_start; + /** + * @io_size: IO size of this VRAM instance + * + * This represents how much of this VRAM we can access + * via the CPU through the VRAM BAR. This can be smaller + * than @usable_size, in which case only part of VRAM is CPU + * accessible (typically the first 256M). This + * configuration is known as small-bar. + */ + resource_size_t io_size; + /** @dpa_base: This memory regions's DPA (device physical address) base */ + resource_size_t dpa_base; + /** + * @usable_size: usable size of VRAM + * + * Usable size of VRAM excluding reserved portions + * (e.g stolen mem) + */ + resource_size_t usable_size; + /** + * @actual_physical_size: Actual VRAM size + * + * Actual VRAM size including reserved portions + * (e.g stolen mem) + */ + resource_size_t actual_physical_size; + /** @mapping: pointer to VRAM mappable space */ + void *__iomem mapping; +}; + +/** + * struct xe_tile - hardware tile structure + * + * From a driver perspective, a "tile" is effectively a complete GPU, containing + * an SGunit, 1-2 GTs, and (for discrete platforms) VRAM. + * + * Multi-tile platforms effectively bundle multiple GPUs behind a single PCI + * device and designate one "root" tile as being responsible for external PCI + * communication. PCI BAR0 exposes the GGTT and MMIO register space for each + * tile in a stacked layout, and PCI BAR2 exposes the local memory associated + * with each tile similarly. Device-wide interrupts can be enabled/disabled + * at the root tile, and the MSTR_TILE_INTR register will report which tiles + * have interrupts that need servicing. + */ +struct xe_tile { + /** @xe: Backpointer to tile's PCI device */ + struct xe_device *xe; + + /** @id: ID of the tile */ + u8 id; + + /** + * @primary_gt: Primary GT + */ + struct xe_gt *primary_gt; + + /** + * @media_gt: Media GT + * + * Only present on devices with media version >= 13. + */ + struct xe_gt *media_gt; + + /** + * @mmio: MMIO info for a tile. + * + * Each tile has its own 16MB space in BAR0, laid out as: + * * 0-4MB: registers + * * 4MB-8MB: reserved + * * 8MB-16MB: global GTT + */ + struct { + /** @size: size of tile's MMIO space */ + size_t size; + + /** @regs: pointer to tile's MMIO space (starting with registers) */ + void *regs; + } mmio; + + /** + * @mmio_ext: MMIO-extension info for a tile. + * + * Each tile has its own additional 256MB (28-bit) MMIO-extension space. + */ + struct { + /** @size: size of tile's additional MMIO-extension space */ + size_t size; + + /** @regs: pointer to tile's additional MMIO-extension space */ + void *regs; + } mmio_ext; + + /** @mem: memory management info for tile */ + struct { + /** + * @vram: VRAM info for tile. + * + * Although VRAM is associated with a specific tile, it can + * still be accessed by all tiles' GTs. 
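+ * See &struct xe_mem_region above for the split between the CPU-visible (BAR) window and the total usable size.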
+ */ + struct xe_mem_region vram; + + /** @vram_mgr: VRAM TTM manager */ + struct xe_ttm_vram_mgr *vram_mgr; + + /** @ggtt: Global graphics translation table */ + struct xe_ggtt *ggtt; + + /** + * @kernel_bb_pool: Pool from which batchbuffers are allocated. + * + * Media GT shares a pool with its primary GT. + */ + struct xe_sa_manager *kernel_bb_pool; + } mem; + + /** @sriov: tile level virtualization data */ + union { + struct { + /** @sriov.pf.lmtt: Local Memory Translation Table. */ + struct xe_lmtt lmtt; + } pf; + } sriov; + + /** @migrate: Migration helper for vram blits and clearing */ + struct xe_migrate *migrate; + + /** @sysfs: sysfs' kobj used by xe_tile_sysfs */ + struct kobject *sysfs; +}; + +/** + * struct xe_device - Top level struct of XE device + */ +struct xe_device { + /** @drm: drm device */ + struct drm_device drm; + + /** @devcoredump: device coredump */ + struct xe_devcoredump devcoredump; + + /** @info: device info */ + struct intel_device_info { + /** @graphics_name: graphics IP name */ + const char *graphics_name; + /** @media_name: media IP name */ + const char *media_name; + /** @tile_mmio_ext_size: size of MMIO extension space, per-tile */ + u32 tile_mmio_ext_size; + /** @graphics_verx100: graphics IP version */ + u32 graphics_verx100; + /** @media_verx100: media IP version */ + u32 media_verx100; + /** @mem_region_mask: mask of valid memory regions */ + u32 mem_region_mask; + /** @platform: XE platform enum */ + enum xe_platform platform; + /** @subplatform: XE subplatform enum */ + enum xe_subplatform subplatform; + /** @devid: device ID */ + u16 devid; + /** @revid: device revision */ + u8 revid; + /** @step: stepping information for each IP */ + struct xe_step_info step; + /** @dma_mask_size: DMA address bits */ + u8 dma_mask_size; + /** @vram_flags: Vram flags */ + u8 vram_flags; + /** @tile_count: Number of tiles */ + u8 tile_count; + /** @gt_count: Total number of GTs for entire device */ + u8 gt_count; + /** @vm_max_level: Max VM level */ + u8 vm_max_level; + /** @va_bits: Maximum bits of a virtual address */ + u8 va_bits; + + /** @is_dgfx: is discrete device */ + u8 is_dgfx:1; + /** @has_asid: Has address space ID */ + u8 has_asid:1; + /** @force_execlist: Forced execlist submission */ + u8 force_execlist:1; + /** @has_flat_ccs: Whether flat CCS metadata is used */ + u8 has_flat_ccs:1; + /** @has_llc: Device has a shared CPU+GPU last level cache */ + u8 has_llc:1; + /** @has_mmio_ext: Device has extra MMIO address range */ + u8 has_mmio_ext:1; + /** @has_range_tlb_invalidation: Has range based TLB invalidations */ + u8 has_range_tlb_invalidation:1; + /** @has_sriov: Supports SR-IOV */ + u8 has_sriov:1; + /** @has_usm: Device has unified shared memory support */ + u8 has_usm:1; + /** @enable_display: display enabled */ + u8 enable_display:1; + /** @skip_mtcfg: skip Multi-Tile configuration from MTCFG register */ + u8 skip_mtcfg:1; + /** @skip_pcode: skip access to PCODE uC */ + u8 skip_pcode:1; + /** @has_heci_gscfi: device has heci gscfi */ + u8 has_heci_gscfi:1; + /** @skip_guc_pc: Skip GuC based PM feature init */ + u8 skip_guc_pc:1; + +#if IS_ENABLED(CONFIG_DRM_XE_DISPLAY) + struct { + u32 rawclk_freq; + } i915_runtime; +#endif + } info; + + /** @irq: device interrupt state */ + struct { + /** @lock: lock for processing irq's on this device */ + spinlock_t lock; + + /** @enabled: interrupts enabled on this device */ + bool enabled; + } irq; + + /** @ttm: ttm device */ + struct ttm_device ttm; + + /** @mmio: mmio info for device */ + struct { + /** 
@size: size of MMIO space for device */ + size_t size; + /** @regs: pointer to MMIO space for device */ + void *regs; + } mmio; + + /** @mem: memory info for device */ + struct { + /** @vram: VRAM info for device */ + struct xe_mem_region vram; + /** @sys_mgr: system TTM manager */ + struct ttm_resource_manager sys_mgr; + } mem; + + /** @sriov: device level virtualization data */ + struct { + /** @sriov.__mode: SR-IOV mode (Don't access directly!) */ + enum xe_sriov_mode __mode; + } sriov; + + /** @clients: drm clients info */ + struct { + /** @lock: Protects drm clients info */ + spinlock_t lock; + + /** @count: number of drm clients */ + u64 count; + } clients; + + /** @usm: unified memory state */ + struct { + /** @asid: convert a ASID to VM */ + struct xarray asid_to_vm; + /** @next_asid: next ASID, used to cyclical alloc asids */ + u32 next_asid; + /** @num_vm_in_fault_mode: number of VM in fault mode */ + u32 num_vm_in_fault_mode; + /** @num_vm_in_non_fault_mode: number of VM in non-fault mode */ + u32 num_vm_in_non_fault_mode; + /** @lock: protects UM state */ + struct mutex lock; + } usm; + + /** @persistent_engines: engines that are closed but still running */ + struct { + /** @lock: protects persistent engines */ + struct mutex lock; + /** @list: list of persistent engines */ + struct list_head list; + } persistent_engines; + + /** @pinned: pinned BO state */ + struct { + /** @lock: protected pinned BO list state */ + spinlock_t lock; + /** @evicted: pinned kernel BO that are present */ + struct list_head kernel_bo_present; + /** @evicted: pinned BO that have been evicted */ + struct list_head evicted; + /** @external_vram: pinned external BO in vram*/ + struct list_head external_vram; + } pinned; + + /** @ufence_wq: user fence wait queue */ + wait_queue_head_t ufence_wq; + + /** @ordered_wq: used to serialize compute mode resume */ + struct workqueue_struct *ordered_wq; + + /** @unordered_wq: used to serialize unordered work, mostly display */ + struct workqueue_struct *unordered_wq; + + /** @tiles: device tiles */ + struct xe_tile tiles[XE_MAX_TILES_PER_DEVICE]; + + /** + * @mem_access: keep track of memory access in the device, possibly + * triggering additional actions when they occur. + */ + struct { + /** @ref: ref count of memory accesses */ + atomic_t ref; + } mem_access; + + /** + * @pat: Encapsulate PAT related stuff + */ + struct { + /** Internal operations to abstract platforms */ + const struct xe_pat_ops *ops; + /** PAT table to program in the HW */ + const struct xe_pat_table_entry *table; + /** Number of PAT entries */ + int n_entries; + u32 idx[__XE_CACHE_LEVEL_COUNT]; + } pat; + + /** @d3cold: Encapsulate d3cold related stuff */ + struct { + /** capable: Indicates if root port is d3cold capable */ + bool capable; + + /** @allowed: Indicates if d3cold is a valid device state */ + bool allowed; + + /** @power_lost: Indicates if card has really lost power. */ + bool power_lost; + + /** + * @vram_threshold: + * + * This represents the permissible threshold(in megabytes) + * for vram save/restore. d3cold will be disallowed, + * when vram_usages is above or equals the threshold value + * to avoid the vram save/restore latency. + * Default threshold value is 300mb. + */ + u32 vram_threshold; + /** @lock: protect vram_threshold */ + struct mutex lock; + } d3cold; + + /** + * @pm_callback_task: Track the active task that is running in either + * the runtime_suspend or runtime_resume callbacks. 
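+ * Presumably this lets code invoked from those callbacks detect that it is already running in the PM path; the consumers live outside this header.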
+ */ + struct task_struct *pm_callback_task; + + /** @hwmon: hwmon subsystem integration */ + struct xe_hwmon *hwmon; + + /** @heci_gsc: graphics security controller */ + struct xe_heci_gsc heci_gsc; + + /** @needs_flr_on_fini: requests function-reset on fini */ + bool needs_flr_on_fini; + + /* private: */ + +#if IS_ENABLED(CONFIG_DRM_XE_DISPLAY) + /* + * Any fields below this point are the ones used by display. + * They are temporarily added here so xe_device can be desguised as + * drm_i915_private during build. After cleanup these should go away, + * migrating to the right sub-structs + */ + struct intel_display display; + enum intel_pch pch_type; + u16 pch_id; + + struct dram_info { + bool wm_lv_0_adjust_needed; + u8 num_channels; + bool symmetric_memory; + enum intel_dram_type { + INTEL_DRAM_UNKNOWN, + INTEL_DRAM_DDR3, + INTEL_DRAM_DDR4, + INTEL_DRAM_LPDDR3, + INTEL_DRAM_LPDDR4, + INTEL_DRAM_DDR5, + INTEL_DRAM_LPDDR5, + } type; + u8 num_qgv_points; + u8 num_psf_gv_points; + } dram_info; + + /* + * edram size in MB. + * Cannot be determined by PCIID. You must always read a register. + */ + u32 edram_size_mb; + + /* To shut up runtime pm macros.. */ + struct xe_runtime_pm {} runtime_pm; + + /* For pcode */ + struct mutex sb_lock; + + /* Should be in struct intel_display */ + u32 skl_preferred_vco_freq, max_dotclk_freq, hti_state; + u8 snps_phy_failed_calibration; + struct drm_atomic_state *modeset_restore_state; + struct list_head global_obj_list; + + union { + /* only to allow build, not used functionally */ + u32 irq_mask; + u32 de_irq_mask[I915_MAX_PIPES]; + }; + u32 pipestat_irq_mask[I915_MAX_PIPES]; + + bool display_irqs_enabled; + u32 enabled_irq_mask; + + struct intel_uncore { + spinlock_t lock; + } uncore; + + /* only to allow build, not used functionally */ + struct { + unsigned int hpll_freq; + unsigned int czclk_freq; + unsigned int fsb_freq, mem_freq, is_ddr3; + u8 vblank_enabled; + }; + struct { + const char *dmc_firmware_path; + } params; + + void *pxp; +#endif +}; + +/** + * struct xe_file - file handle for XE driver + */ +struct xe_file { + /** @xe: xe DEVICE **/ + struct xe_device *xe; + + /** @drm: base DRM file */ + struct drm_file *drm; + + /** @vm: VM state for file */ + struct { + /** @xe: xarray to store VMs */ + struct xarray xa; + /** @lock: protects file VM state */ + struct mutex lock; + } vm; + + /** @exec_queue: Submission exec queue state for file */ + struct { + /** @xe: xarray to store engines */ + struct xarray xa; + /** @lock: protects file engine state */ + struct mutex lock; + } exec_queue; + + /** @client: drm client */ + struct xe_drm_client *client; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_display.c b/drivers/gpu/drm/xe/xe_display.c new file mode 100644 index 000000000000..74391d9b11ae --- /dev/null +++ b/drivers/gpu/drm/xe/xe_display.c @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include "xe_display.h" +#include "regs/xe_regs.h" + +#include <linux/fb.h> + +#include <drm/drm_drv.h> +#include <drm/drm_managed.h> +#include <drm/xe_drm.h> + +#include "soc/intel_dram.h" +#include "i915_drv.h" /* FIXME: HAS_DISPLAY() depends on this */ +#include "intel_acpi.h" +#include "intel_audio.h" +#include "intel_bw.h" +#include "intel_display.h" +#include "intel_display_driver.h" +#include "intel_display_irq.h" +#include "intel_display_types.h" +#include "intel_dmc.h" +#include "intel_dp.h" +#include "intel_fbdev.h" +#include "intel_hdcp.h" +#include "intel_hotplug.h" +#include "intel_opregion.h" 
+#include "xe_module.h" + +/* Xe device functions */ + +static bool has_display(struct xe_device *xe) +{ + return HAS_DISPLAY(xe); +} + +/** + * xe_display_driver_probe_defer - Detect if we need to wait for other drivers + * early on + * @pdev: PCI device + * + * Returns: true if probe needs to be deferred, false otherwise + */ +bool xe_display_driver_probe_defer(struct pci_dev *pdev) +{ + if (!xe_modparam.enable_display) + return 0; + + return intel_display_driver_probe_defer(pdev); +} + +static void xe_display_last_close(struct drm_device *dev) +{ + struct xe_device *xe = to_xe_device(dev); + + if (xe->info.enable_display) + intel_fbdev_restore_mode(to_xe_device(dev)); +} + +/** + * xe_display_driver_set_hooks - Add driver flags and hooks for display + * @driver: DRM device driver + * + * Set features and function hooks in @driver that are needed for driving the + * display IP. This sets the driver's capability of driving display, regardless + * if the device has it enabled + */ +void xe_display_driver_set_hooks(struct drm_driver *driver) +{ + if (!xe_modparam.enable_display) + return; + + driver->driver_features |= DRIVER_MODESET | DRIVER_ATOMIC; + driver->lastclose = xe_display_last_close; +} + +static void unset_display_features(struct xe_device *xe) +{ + xe->drm.driver_features &= ~(DRIVER_MODESET | DRIVER_ATOMIC); +} + +static void display_destroy(struct drm_device *dev, void *dummy) +{ + struct xe_device *xe = to_xe_device(dev); + + destroy_workqueue(xe->display.hotplug.dp_wq); +} + +/** + * xe_display_create - create display struct + * @xe: XE device instance + * + * Initialize all fields used by the display part. + * + * TODO: once everything can be inside a single struct, make the struct opaque + * to the rest of xe and return it to be xe->display. + * + * Returns: 0 on success + */ +int xe_display_create(struct xe_device *xe) +{ + int err; + + spin_lock_init(&xe->display.fb_tracking.lock); + + xe->display.hotplug.dp_wq = alloc_ordered_workqueue("xe-dp", 0); + + drmm_mutex_init(&xe->drm, &xe->sb_lock); + drmm_mutex_init(&xe->drm, &xe->display.backlight.lock); + drmm_mutex_init(&xe->drm, &xe->display.audio.mutex); + drmm_mutex_init(&xe->drm, &xe->display.wm.wm_mutex); + drmm_mutex_init(&xe->drm, &xe->display.pps.mutex); + drmm_mutex_init(&xe->drm, &xe->display.hdcp.hdcp_mutex); + xe->enabled_irq_mask = ~0; + + err = drmm_add_action_or_reset(&xe->drm, display_destroy, NULL); + if (err) + return err; + + return 0; +} + +static void xe_display_fini_nommio(struct drm_device *dev, void *dummy) +{ + struct xe_device *xe = to_xe_device(dev); + + if (!xe->info.enable_display) + return; + + intel_power_domains_cleanup(xe); +} + +int xe_display_init_nommio(struct xe_device *xe) +{ + int err; + + if (!xe->info.enable_display) + return 0; + + /* Fake uncore lock */ + spin_lock_init(&xe->uncore.lock); + + /* This must be called before any calls to HAS_PCH_* */ + intel_detect_pch(xe); + + err = intel_power_domains_init(xe); + if (err) + return err; + + return drmm_add_action_or_reset(&xe->drm, xe_display_fini_nommio, xe); +} + +static void xe_display_fini_noirq(struct drm_device *dev, void *dummy) +{ + struct xe_device *xe = to_xe_device(dev); + + if (!xe->info.enable_display) + return; + + intel_display_driver_remove_noirq(xe); + intel_power_domains_driver_remove(xe); +} + +int xe_display_init_noirq(struct xe_device *xe) +{ + int err; + + if (!xe->info.enable_display) + return 0; + + intel_display_driver_early_probe(xe); + + /* Early display init.. 
*/ + intel_opregion_setup(xe); + + /* + * Fill the dram structure to get the system dram info. This will be + * used for memory latency calculation. + */ + intel_dram_detect(xe); + + intel_bw_init_hw(xe); + + intel_display_device_info_runtime_init(xe); + + err = intel_display_driver_probe_noirq(xe); + if (err) + return err; + + return drmm_add_action_or_reset(&xe->drm, xe_display_fini_noirq, NULL); +} + +static void xe_display_fini_noaccel(struct drm_device *dev, void *dummy) +{ + struct xe_device *xe = to_xe_device(dev); + + if (!xe->info.enable_display) + return; + + intel_display_driver_remove_nogem(xe); +} + +int xe_display_init_noaccel(struct xe_device *xe) +{ + int err; + + if (!xe->info.enable_display) + return 0; + + err = intel_display_driver_probe_nogem(xe); + if (err) + return err; + + return drmm_add_action_or_reset(&xe->drm, xe_display_fini_noaccel, NULL); +} + +int xe_display_init(struct xe_device *xe) +{ + if (!xe->info.enable_display) + return 0; + + return intel_display_driver_probe(xe); +} + +void xe_display_fini(struct xe_device *xe) +{ + if (!xe->info.enable_display) + return; + + /* poll work can call into fbdev, hence clean that up afterwards */ + intel_hpd_poll_fini(xe); + intel_fbdev_fini(xe); + + intel_hdcp_component_fini(xe); + intel_audio_deinit(xe); +} + +void xe_display_register(struct xe_device *xe) +{ + if (!xe->info.enable_display) + return; + + intel_display_driver_register(xe); + intel_register_dsm_handler(); + intel_power_domains_enable(xe); +} + +void xe_display_unregister(struct xe_device *xe) +{ + if (!xe->info.enable_display) + return; + + intel_unregister_dsm_handler(); + intel_power_domains_disable(xe); + intel_display_driver_unregister(xe); +} + +void xe_display_driver_remove(struct xe_device *xe) +{ + if (!xe->info.enable_display) + return; + + intel_display_driver_remove(xe); + + intel_display_device_remove(xe); +} + +/* IRQ-related functions */ + +void xe_display_irq_handler(struct xe_device *xe, u32 master_ctl) +{ + if (!xe->info.enable_display) + return; + + if (master_ctl & DISPLAY_IRQ) + gen11_display_irq_handler(xe); +} + +void xe_display_irq_enable(struct xe_device *xe, u32 gu_misc_iir) +{ + if (!xe->info.enable_display) + return; + + if (gu_misc_iir & GU_MISC_GSE) + intel_opregion_asle_intr(xe); +} + +void xe_display_irq_reset(struct xe_device *xe) +{ + if (!xe->info.enable_display) + return; + + gen11_display_irq_reset(xe); +} + +void xe_display_irq_postinstall(struct xe_device *xe, struct xe_gt *gt) +{ + if (!xe->info.enable_display) + return; + + if (gt->info.id == XE_GT0) + gen11_de_irq_postinstall(xe); +} + +static void intel_suspend_encoders(struct xe_device *xe) +{ + struct drm_device *dev = &xe->drm; + struct intel_encoder *encoder; + + if (has_display(xe)) + return; + + drm_modeset_lock_all(dev); + for_each_intel_encoder(dev, encoder) + if (encoder->suspend) + encoder->suspend(encoder); + drm_modeset_unlock_all(dev); +} + +static bool suspend_to_idle(void) +{ +#if IS_ENABLED(CONFIG_ACPI_SLEEP) + if (acpi_target_system_state() < ACPI_STATE_S3) + return true; +#endif + return false; +} + +void xe_display_pm_suspend(struct xe_device *xe) +{ + bool s2idle = suspend_to_idle(); + if (!xe->info.enable_display) + return; + + /* + * We do a lot of poking in a lot of registers, make sure they work + * properly. 
+ */ + intel_power_domains_disable(xe); + if (has_display(xe)) + drm_kms_helper_poll_disable(&xe->drm); + + intel_display_driver_suspend(xe); + + intel_dp_mst_suspend(xe); + + intel_hpd_cancel_work(xe); + + intel_suspend_encoders(xe); + + intel_opregion_suspend(xe, s2idle ? PCI_D1 : PCI_D3cold); + + intel_fbdev_set_suspend(&xe->drm, FBINFO_STATE_SUSPENDED, true); + + intel_dmc_suspend(xe); +} + +void xe_display_pm_suspend_late(struct xe_device *xe) +{ + bool s2idle = suspend_to_idle(); + if (!xe->info.enable_display) + return; + + intel_power_domains_suspend(xe, s2idle); + + intel_display_power_suspend_late(xe); +} + +void xe_display_pm_resume_early(struct xe_device *xe) +{ + if (!xe->info.enable_display) + return; + + intel_display_power_resume_early(xe); + + intel_power_domains_resume(xe); +} + +void xe_display_pm_resume(struct xe_device *xe) +{ + if (!xe->info.enable_display) + return; + + intel_dmc_resume(xe); + + if (has_display(xe)) + drm_mode_config_reset(&xe->drm); + + intel_display_driver_init_hw(xe); + intel_hpd_init(xe); + + /* MST sideband requires HPD interrupts enabled */ + intel_dp_mst_resume(xe); + intel_display_driver_resume(xe); + + intel_hpd_poll_disable(xe); + if (has_display(xe)) + drm_kms_helper_poll_enable(&xe->drm); + + intel_opregion_resume(xe); + + intel_fbdev_set_suspend(&xe->drm, FBINFO_STATE_RUNNING, false); + + intel_power_domains_enable(xe); +} + +void xe_display_probe(struct xe_device *xe) +{ + if (!xe->info.enable_display) + goto no_display; + + intel_display_device_probe(xe); + + if (has_display(xe)) + return; + +no_display: + xe->info.enable_display = false; + unset_display_features(xe); +} diff --git a/drivers/gpu/drm/xe/xe_display.h b/drivers/gpu/drm/xe/xe_display.h new file mode 100644 index 000000000000..710e56180b52 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_display.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_DISPLAY_H_ +#define _XE_DISPLAY_H_ + +#include "xe_device.h" + +struct drm_driver; + +#if IS_ENABLED(CONFIG_DRM_XE_DISPLAY) + +bool xe_display_driver_probe_defer(struct pci_dev *pdev); +void xe_display_driver_set_hooks(struct drm_driver *driver); +void xe_display_driver_remove(struct xe_device *xe); + +int xe_display_create(struct xe_device *xe); + +void xe_display_probe(struct xe_device *xe); + +int xe_display_init_nommio(struct xe_device *xe); +int xe_display_init_noirq(struct xe_device *xe); +int xe_display_init_noaccel(struct xe_device *xe); +int xe_display_init(struct xe_device *xe); +void xe_display_fini(struct xe_device *xe); + +void xe_display_register(struct xe_device *xe); +void xe_display_unregister(struct xe_device *xe); + +void xe_display_irq_handler(struct xe_device *xe, u32 master_ctl); +void xe_display_irq_enable(struct xe_device *xe, u32 gu_misc_iir); +void xe_display_irq_reset(struct xe_device *xe); +void xe_display_irq_postinstall(struct xe_device *xe, struct xe_gt *gt); + +void xe_display_pm_suspend(struct xe_device *xe); +void xe_display_pm_suspend_late(struct xe_device *xe); +void xe_display_pm_resume_early(struct xe_device *xe); +void xe_display_pm_resume(struct xe_device *xe); + +#else + +static inline int xe_display_driver_probe_defer(struct pci_dev *pdev) { return 0; } +static inline void xe_display_driver_set_hooks(struct drm_driver *driver) { } +static inline void xe_display_driver_remove(struct xe_device *xe) {} + +static inline int xe_display_create(struct xe_device *xe) { return 0; } + +static inline void xe_display_probe(struct xe_device *xe) 
{ } + +static inline int xe_display_init_nommio(struct xe_device *xe) { return 0; } +static inline int xe_display_init_noirq(struct xe_device *xe) { return 0; } +static inline int xe_display_init_noaccel(struct xe_device *xe) { return 0; } +static inline int xe_display_init(struct xe_device *xe) { return 0; } +static inline void xe_display_fini(struct xe_device *xe) {} + +static inline void xe_display_register(struct xe_device *xe) {} +static inline void xe_display_unregister(struct xe_device *xe) {} + +static inline void xe_display_irq_handler(struct xe_device *xe, u32 master_ctl) {} +static inline void xe_display_irq_enable(struct xe_device *xe, u32 gu_misc_iir) {} +static inline void xe_display_irq_reset(struct xe_device *xe) {} +static inline void xe_display_irq_postinstall(struct xe_device *xe, struct xe_gt *gt) {} + +static inline void xe_display_pm_suspend(struct xe_device *xe) {} +static inline void xe_display_pm_suspend_late(struct xe_device *xe) {} +static inline void xe_display_pm_resume_early(struct xe_device *xe) {} +static inline void xe_display_pm_resume(struct xe_device *xe) {} + +#endif /* CONFIG_DRM_XE_DISPLAY */ +#endif /* _XE_DISPLAY_H_ */ diff --git a/drivers/gpu/drm/xe/xe_dma_buf.c b/drivers/gpu/drm/xe/xe_dma_buf.c new file mode 100644 index 000000000000..64ed303728fd --- /dev/null +++ b/drivers/gpu/drm/xe/xe_dma_buf.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_dma_buf.h" + +#include <kunit/test.h> +#include <linux/dma-buf.h> +#include <linux/pci-p2pdma.h> + +#include <drm/drm_device.h> +#include <drm/drm_prime.h> +#include <drm/ttm/ttm_tt.h> + +#include "tests/xe_test.h" +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_ttm_vram_mgr.h" +#include "xe_vm.h" + +MODULE_IMPORT_NS(DMA_BUF); + +static int xe_dma_buf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attach) +{ + struct drm_gem_object *obj = attach->dmabuf->priv; + + if (attach->peer2peer && + pci_p2pdma_distance(to_pci_dev(obj->dev->dev), attach->dev, false) < 0) + attach->peer2peer = false; + + if (!attach->peer2peer && !xe_bo_can_migrate(gem_to_xe_bo(obj), XE_PL_TT)) + return -EOPNOTSUPP; + + xe_device_mem_access_get(to_xe_device(obj->dev)); + return 0; +} + +static void xe_dma_buf_detach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attach) +{ + struct drm_gem_object *obj = attach->dmabuf->priv; + + xe_device_mem_access_put(to_xe_device(obj->dev)); +} + +static int xe_dma_buf_pin(struct dma_buf_attachment *attach) +{ + struct drm_gem_object *obj = attach->dmabuf->priv; + struct xe_bo *bo = gem_to_xe_bo(obj); + struct xe_device *xe = xe_bo_device(bo); + int ret; + + /* + * For now only support pinning in TT memory, for two reasons: + * 1) Avoid pinning in a placement not accessible to some importers. + * 2) Pinning in VRAM requires PIN accounting which is a to-do. 
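+ * Hence the xe_bo_migrate() call below moves the BO to TT before it is pinned.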
+ */ + if (xe_bo_is_pinned(bo) && bo->ttm.resource->placement != XE_PL_TT) { + drm_dbg(&xe->drm, "Can't migrate pinned bo for dma-buf pin.\n"); + return -EINVAL; + } + + ret = xe_bo_migrate(bo, XE_PL_TT); + if (ret) { + if (ret != -EINTR && ret != -ERESTARTSYS) + drm_dbg(&xe->drm, + "Failed migrating dma-buf to TT memory: %pe\n", + ERR_PTR(ret)); + return ret; + } + + ret = xe_bo_pin_external(bo); + xe_assert(xe, !ret); + + return 0; +} + +static void xe_dma_buf_unpin(struct dma_buf_attachment *attach) +{ + struct drm_gem_object *obj = attach->dmabuf->priv; + struct xe_bo *bo = gem_to_xe_bo(obj); + + xe_bo_unpin_external(bo); +} + +static struct sg_table *xe_dma_buf_map(struct dma_buf_attachment *attach, + enum dma_data_direction dir) +{ + struct dma_buf *dma_buf = attach->dmabuf; + struct drm_gem_object *obj = dma_buf->priv; + struct xe_bo *bo = gem_to_xe_bo(obj); + struct sg_table *sgt; + int r = 0; + + if (!attach->peer2peer && !xe_bo_can_migrate(bo, XE_PL_TT)) + return ERR_PTR(-EOPNOTSUPP); + + if (!xe_bo_is_pinned(bo)) { + if (!attach->peer2peer) + r = xe_bo_migrate(bo, XE_PL_TT); + else + r = xe_bo_validate(bo, NULL, false); + if (r) + return ERR_PTR(r); + } + + switch (bo->ttm.resource->mem_type) { + case XE_PL_TT: + sgt = drm_prime_pages_to_sg(obj->dev, + bo->ttm.ttm->pages, + bo->ttm.ttm->num_pages); + if (IS_ERR(sgt)) + return sgt; + + if (dma_map_sgtable(attach->dev, sgt, dir, + DMA_ATTR_SKIP_CPU_SYNC)) + goto error_free; + break; + + case XE_PL_VRAM0: + case XE_PL_VRAM1: + r = xe_ttm_vram_mgr_alloc_sgt(xe_bo_device(bo), + bo->ttm.resource, 0, + bo->ttm.base.size, attach->dev, + dir, &sgt); + if (r) + return ERR_PTR(r); + break; + default: + return ERR_PTR(-EINVAL); + } + + return sgt; + +error_free: + sg_free_table(sgt); + kfree(sgt); + return ERR_PTR(-EBUSY); +} + +static void xe_dma_buf_unmap(struct dma_buf_attachment *attach, + struct sg_table *sgt, + enum dma_data_direction dir) +{ + struct dma_buf *dma_buf = attach->dmabuf; + struct xe_bo *bo = gem_to_xe_bo(dma_buf->priv); + + if (!xe_bo_is_vram(bo)) { + dma_unmap_sgtable(attach->dev, sgt, dir, 0); + sg_free_table(sgt); + kfree(sgt); + } else { + xe_ttm_vram_mgr_free_sgt(attach->dev, dir, sgt); + } +} + +static int xe_dma_buf_begin_cpu_access(struct dma_buf *dma_buf, + enum dma_data_direction direction) +{ + struct drm_gem_object *obj = dma_buf->priv; + struct xe_bo *bo = gem_to_xe_bo(obj); + bool reads = (direction == DMA_BIDIRECTIONAL || + direction == DMA_FROM_DEVICE); + + if (!reads) + return 0; + + /* Can we do interruptible lock here? 
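+ * For now take the lock non-interruptibly and migrate the BO to TT so that CPU reads see up-to-date contents.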
*/ + xe_bo_lock(bo, false); + (void)xe_bo_migrate(bo, XE_PL_TT); + xe_bo_unlock(bo); + + return 0; +} + +const struct dma_buf_ops xe_dmabuf_ops = { + .attach = xe_dma_buf_attach, + .detach = xe_dma_buf_detach, + .pin = xe_dma_buf_pin, + .unpin = xe_dma_buf_unpin, + .map_dma_buf = xe_dma_buf_map, + .unmap_dma_buf = xe_dma_buf_unmap, + .release = drm_gem_dmabuf_release, + .begin_cpu_access = xe_dma_buf_begin_cpu_access, + .mmap = drm_gem_dmabuf_mmap, + .vmap = drm_gem_dmabuf_vmap, + .vunmap = drm_gem_dmabuf_vunmap, +}; + +struct dma_buf *xe_gem_prime_export(struct drm_gem_object *obj, int flags) +{ + struct xe_bo *bo = gem_to_xe_bo(obj); + struct dma_buf *buf; + + if (bo->vm) + return ERR_PTR(-EPERM); + + buf = drm_gem_prime_export(obj, flags); + if (!IS_ERR(buf)) + buf->ops = &xe_dmabuf_ops; + + return buf; +} + +static struct drm_gem_object * +xe_dma_buf_init_obj(struct drm_device *dev, struct xe_bo *storage, + struct dma_buf *dma_buf) +{ + struct dma_resv *resv = dma_buf->resv; + struct xe_device *xe = to_xe_device(dev); + struct xe_bo *bo; + int ret; + + dma_resv_lock(resv, NULL); + bo = ___xe_bo_create_locked(xe, storage, NULL, resv, NULL, dma_buf->size, + 0, /* Will require 1way or 2way for vm_bind */ + ttm_bo_type_sg, XE_BO_CREATE_SYSTEM_BIT); + if (IS_ERR(bo)) { + ret = PTR_ERR(bo); + goto error; + } + dma_resv_unlock(resv); + + return &bo->ttm.base; + +error: + dma_resv_unlock(resv); + return ERR_PTR(ret); +} + +static void xe_dma_buf_move_notify(struct dma_buf_attachment *attach) +{ + struct drm_gem_object *obj = attach->importer_priv; + struct xe_bo *bo = gem_to_xe_bo(obj); + + XE_WARN_ON(xe_bo_evict(bo, false)); +} + +static const struct dma_buf_attach_ops xe_dma_buf_attach_ops = { + .allow_peer2peer = true, + .move_notify = xe_dma_buf_move_notify +}; + +#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) + +struct dma_buf_test_params { + struct xe_test_priv base; + const struct dma_buf_attach_ops *attach_ops; + bool force_different_devices; + u32 mem_mask; +}; + +#define to_dma_buf_test_params(_priv) \ + container_of(_priv, struct dma_buf_test_params, base) +#endif + +struct drm_gem_object *xe_gem_prime_import(struct drm_device *dev, + struct dma_buf *dma_buf) +{ + XE_TEST_DECLARE(struct dma_buf_test_params *test = + to_dma_buf_test_params + (xe_cur_kunit_priv(XE_TEST_LIVE_DMA_BUF));) + const struct dma_buf_attach_ops *attach_ops; + struct dma_buf_attachment *attach; + struct drm_gem_object *obj; + struct xe_bo *bo; + + if (dma_buf->ops == &xe_dmabuf_ops) { + obj = dma_buf->priv; + if (obj->dev == dev && + !XE_TEST_ONLY(test && test->force_different_devices)) { + /* + * Importing dmabuf exported from out own gem increases + * refcount on gem itself instead of f_count of dmabuf. + */ + drm_gem_object_get(obj); + return obj; + } + } + + /* + * Don't publish the bo until we have a valid attachment, and a + * valid attachment needs the bo address. So pre-create a bo before + * creating the attachment and publish. + */ + bo = xe_bo_alloc(); + if (IS_ERR(bo)) + return ERR_CAST(bo); + + attach_ops = &xe_dma_buf_attach_ops; +#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) + if (test) + attach_ops = test->attach_ops; +#endif + + attach = dma_buf_dynamic_attach(dma_buf, dev->dev, attach_ops, &bo->ttm.base); + if (IS_ERR(attach)) { + obj = ERR_CAST(attach); + goto out_err; + } + + /* Errors here will take care of freeing the bo. 
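+ * On failure the error is therefore returned directly instead of jumping to out_err, which would free the bo again.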
*/ + obj = xe_dma_buf_init_obj(dev, bo, dma_buf); + if (IS_ERR(obj)) + return obj; + + + get_dma_buf(dma_buf); + obj->import_attach = attach; + return obj; + +out_err: + xe_bo_free(bo); + + return obj; +} + +#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) +#include "tests/xe_dma_buf.c" +#endif diff --git a/drivers/gpu/drm/xe/xe_dma_buf.h b/drivers/gpu/drm/xe/xe_dma_buf.h new file mode 100644 index 000000000000..861dd28a862c --- /dev/null +++ b/drivers/gpu/drm/xe/xe_dma_buf.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_DMA_BUF_H_ +#define _XE_DMA_BUF_H_ + +#include <drm/drm_gem.h> + +struct dma_buf *xe_gem_prime_export(struct drm_gem_object *obj, int flags); +struct drm_gem_object *xe_gem_prime_import(struct drm_device *dev, + struct dma_buf *dma_buf); + +#endif diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c new file mode 100644 index 000000000000..82d1305e831f --- /dev/null +++ b/drivers/gpu/drm/xe/xe_drm_client.c @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include <drm/drm_print.h> +#include <drm/xe_drm.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/types.h> + +#include "xe_bo.h" +#include "xe_bo_types.h" +#include "xe_device_types.h" +#include "xe_drm_client.h" +#include "xe_trace.h" + +/** + * xe_drm_client_alloc() - Allocate drm client + * @void: No arg + * + * Allocate drm client struct to track client memory against + * same till client life. Call this API whenever new client + * has opened xe device. + * + * Return: pointer to client struct or NULL if can't allocate + */ +struct xe_drm_client *xe_drm_client_alloc(void) +{ + struct xe_drm_client *client; + + client = kzalloc(sizeof(*client), GFP_KERNEL); + if (!client) + return NULL; + + kref_init(&client->kref); + +#ifdef CONFIG_PROC_FS + spin_lock_init(&client->bos_lock); + INIT_LIST_HEAD(&client->bos_list); +#endif + return client; +} + +/** + * __xe_drm_client_free() - Free client struct + * @kref: The reference + * + * This frees client struct. Call this API when xe device is closed + * by drm client. + * + * Return: void + */ +void __xe_drm_client_free(struct kref *kref) +{ + struct xe_drm_client *client = + container_of(kref, typeof(*client), kref); + + kfree(client); +} + +#ifdef CONFIG_PROC_FS +/** + * xe_drm_client_add_bo() - Add BO for tracking client mem usage + * @client: The drm client ptr + * @bo: The xe BO ptr + * + * Add all BO created by individual drm client by calling this function. + * This helps in tracking client memory usage. + * + * Return: void + */ +void xe_drm_client_add_bo(struct xe_drm_client *client, + struct xe_bo *bo) +{ + XE_WARN_ON(bo->client); + XE_WARN_ON(!list_empty(&bo->client_link)); + + spin_lock(&client->bos_lock); + bo->client = xe_drm_client_get(client); + list_add_tail_rcu(&bo->client_link, &client->bos_list); + spin_unlock(&client->bos_lock); +} + +/** + * xe_drm_client_remove_bo() - Remove BO for tracking client mem usage + * @bo: The xe BO ptr + * + * Remove all BO removed by individual drm client by calling this function. + * This helps in tracking client memory usage. 
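+ * It also drops the client reference taken in xe_drm_client_add_bo().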
+ * + * Return: void + */ +void xe_drm_client_remove_bo(struct xe_bo *bo) +{ + struct xe_drm_client *client = bo->client; + + spin_lock(&client->bos_lock); + list_del_rcu(&bo->client_link); + spin_unlock(&client->bos_lock); + + xe_drm_client_put(client); +} + +static void bo_meminfo(struct xe_bo *bo, + struct drm_memory_stats stats[TTM_NUM_MEM_TYPES]) +{ + u64 sz = bo->size; + u32 mem_type; + + if (bo->placement.placement) + mem_type = bo->placement.placement->mem_type; + else + mem_type = XE_PL_TT; + + if (bo->ttm.base.handle_count > 1) + stats[mem_type].shared += sz; + else + stats[mem_type].private += sz; + + if (xe_bo_has_pages(bo)) { + stats[mem_type].resident += sz; + + if (!dma_resv_test_signaled(bo->ttm.base.resv, + DMA_RESV_USAGE_BOOKKEEP)) + stats[mem_type].active += sz; + else if (mem_type == XE_PL_SYSTEM) + stats[mem_type].purgeable += sz; + } +} + +static void show_meminfo(struct drm_printer *p, struct drm_file *file) +{ + static const char *const mem_type_to_name[TTM_NUM_MEM_TYPES] = { + [XE_PL_SYSTEM] = "system", + [XE_PL_TT] = "gtt", + [XE_PL_VRAM0] = "vram0", + [XE_PL_VRAM1] = "vram1", + [4 ... 6] = NULL, + [XE_PL_STOLEN] = "stolen" + }; + struct drm_memory_stats stats[TTM_NUM_MEM_TYPES] = {}; + struct xe_file *xef = file->driver_priv; + struct ttm_device *bdev = &xef->xe->ttm; + struct ttm_resource_manager *man; + struct xe_drm_client *client; + struct drm_gem_object *obj; + struct xe_bo *bo; + unsigned int id; + u32 mem_type; + + client = xef->client; + + /* Public objects. */ + spin_lock(&file->table_lock); + idr_for_each_entry(&file->object_idr, obj, id) { + struct xe_bo *bo = gem_to_xe_bo(obj); + + bo_meminfo(bo, stats); + } + spin_unlock(&file->table_lock); + + /* Internal objects. */ + spin_lock(&client->bos_lock); + list_for_each_entry_rcu(bo, &client->bos_list, client_link) { + if (!bo || !kref_get_unless_zero(&bo->ttm.base.refcount)) + continue; + bo_meminfo(bo, stats); + xe_bo_put(bo); + } + spin_unlock(&client->bos_lock); + + for (mem_type = XE_PL_SYSTEM; mem_type < TTM_NUM_MEM_TYPES; ++mem_type) { + if (!mem_type_to_name[mem_type]) + continue; + + man = ttm_manager_type(bdev, mem_type); + + if (man) { + drm_print_memory_stats(p, + &stats[mem_type], + DRM_GEM_OBJECT_RESIDENT | + (mem_type != XE_PL_SYSTEM ? 0 : + DRM_GEM_OBJECT_PURGEABLE), + mem_type_to_name[mem_type]); + } + } +} + +/** + * xe_drm_client_fdinfo() - Callback for fdinfo interface + * @p: The drm_printer ptr + * @file: The drm_file ptr + * + * This is callabck for drm fdinfo interface. Register this callback + * in drm driver ops for show_fdinfo. + * + * Return: void + */ +void xe_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file) +{ + show_meminfo(p, file); +} +#endif diff --git a/drivers/gpu/drm/xe/xe_drm_client.h b/drivers/gpu/drm/xe/xe_drm_client.h new file mode 100644 index 000000000000..a9649aa36011 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_drm_client.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_DRM_CLIENT_H_ +#define _XE_DRM_CLIENT_H_ + +#include <linux/kref.h> +#include <linux/list.h> +#include <linux/pid.h> +#include <linux/rcupdate.h> +#include <linux/sched.h> +#include <linux/spinlock.h> + +struct drm_file; +struct drm_printer; +struct xe_bo; + +struct xe_drm_client { + struct kref kref; + unsigned int id; +#ifdef CONFIG_PROC_FS + /** + * @bos_lock: lock protecting @bos_list + */ + spinlock_t bos_lock; + /** + * @bos_list: list of bos created by this client + * + * Protected by @bos_lock. 
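+ * Entries are added by xe_drm_client_add_bo() and removed by xe_drm_client_remove_bo().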
+ */ + struct list_head bos_list; +#endif +}; + + static inline struct xe_drm_client * +xe_drm_client_get(struct xe_drm_client *client) +{ + kref_get(&client->kref); + return client; +} + +void __xe_drm_client_free(struct kref *kref); + +static inline void xe_drm_client_put(struct xe_drm_client *client) +{ + kref_put(&client->kref, __xe_drm_client_free); +} + +struct xe_drm_client *xe_drm_client_alloc(void); +static inline struct xe_drm_client * +xe_drm_client_get(struct xe_drm_client *client); +static inline void xe_drm_client_put(struct xe_drm_client *client); +#ifdef CONFIG_PROC_FS +void xe_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file); +void xe_drm_client_add_bo(struct xe_drm_client *client, + struct xe_bo *bo); +void xe_drm_client_remove_bo(struct xe_bo *bo); +#else +static inline void xe_drm_client_add_bo(struct xe_drm_client *client, + struct xe_bo *bo) +{ +} + +static inline void xe_drm_client_remove_bo(struct xe_bo *bo) +{ +} +#endif +#endif diff --git a/drivers/gpu/drm/xe/xe_drv.h b/drivers/gpu/drm/xe/xe_drv.h new file mode 100644 index 000000000000..d45b71426cc8 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_drv.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef _XE_DRV_H_ +#define _XE_DRV_H_ + +#include <drm/drm_drv.h> + +#define DRIVER_NAME "xe" +#define DRIVER_DESC "Intel Xe Graphics" +#define DRIVER_DATE "20201103" + +/* Interface history: + * + * 1.1: Original. + */ +#define DRIVER_MAJOR 1 +#define DRIVER_MINOR 1 +#define DRIVER_PATCHLEVEL 0 + +#endif diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c new file mode 100644 index 000000000000..d30c0d0689bc --- /dev/null +++ b/drivers/gpu/drm/xe/xe_exec.c @@ -0,0 +1,350 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_exec.h" + +#include <drm/drm_device.h> +#include <drm/drm_exec.h> +#include <drm/drm_file.h> +#include <drm/xe_drm.h> +#include <linux/delay.h> + +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_exec_queue.h" +#include "xe_macros.h" +#include "xe_ring_ops_types.h" +#include "xe_sched_job.h" +#include "xe_sync.h" +#include "xe_vm.h" + +/** + * DOC: Execbuf (User GPU command submission) + * + * Execs have historically been rather complicated in DRM drivers (at least in + * the i915) because a few things: + * + * - Passing in a list BO which are read / written to creating implicit syncs + * - Binding at exec time + * - Flow controlling the ring at exec time + * + * In XE we avoid all of this complication by not allowing a BO list to be + * passed into an exec, using the dma-buf implicit sync uAPI, have binds as + * seperate operations, and using the DRM scheduler to flow control the ring. + * Let's deep dive on each of these. + * + * We can get away from a BO list by forcing the user to use in / out fences on + * every exec rather than the kernel tracking dependencies of BO (e.g. if the + * user knows an exec writes to a BO and reads from the BO in the next exec, it + * is the user's responsibility to pass in / out fence between the two execs). + * + * Implicit dependencies for external BOs are handled by using the dma-buf + * implicit dependency uAPI (TODO: add link). To make this works each exec must + * install the job's fence into the DMA_RESV_USAGE_WRITE slot of every external + * BO mapped in the VM. + * + * We do not allow a user to trigger a bind at exec time rather we have a VM + * bind IOCTL which uses the same in / out fence interface as exec. 
In that + * sense, a VM bind is basically the same operation as an exec from the user + * perspective. e.g. If an exec depends on a VM bind use the in / out fence + * interface (struct drm_xe_sync) to synchronize like syncing between two + * dependent execs. + * + * Although a user cannot trigger a bind, we still have to rebind userptrs in + * the VM that have been invalidated since the last exec, likewise we also have + * to rebind BOs that have been evicted by the kernel. We schedule these rebinds + * behind any pending kernel operations on any external BOs in VM or any BOs + * private to the VM. This is accomplished by the rebinds waiting on BOs + * DMA_RESV_USAGE_KERNEL slot (kernel ops) and kernel ops waiting on all BOs + * slots (inflight execs are in the DMA_RESV_USAGE_BOOKING for private BOs and + * in DMA_RESV_USAGE_WRITE for external BOs). + * + * Rebinds / dma-resv usage applies to non-compute mode VMs only as for compute + * mode VMs we use preempt fences and a rebind worker (TODO: add link). + * + * There is no need to flow control the ring in the exec as we write the ring at + * submission time and set the DRM scheduler max job limit SIZE_OF_RING / + * MAX_JOB_SIZE. The DRM scheduler will then hold all jobs until space in the + * ring is available. + * + * All of this results in a rather simple exec implementation. + * + * Flow + * ~~~~ + * + * .. code-block:: + * + * Parse input arguments + * Wait for any async VM bind passed as in-fences to start + * <----------------------------------------------------------------------| + * Lock global VM lock in read mode | + * Pin userptrs (also finds userptr invalidated since last exec) | + * Lock exec (VM dma-resv lock, external BOs dma-resv locks) | + * Validate BOs that have been evicted | + * Create job | + * Rebind invalidated userptrs + evicted BOs (non-compute-mode) | + * Add rebind fence dependency to job | + * Add job VM dma-resv bookkeeping slot (non-compute mode) | + * Add job to external BOs dma-resv write slots (non-compute mode) | + * Check if any userptrs invalidated since pin ------ Drop locks ---------| + * Install in / out fences for job + * Submit job + * Unlock all + */ + +static int xe_exec_fn(struct drm_gpuvm_exec *vm_exec) +{ + return drm_gpuvm_validate(vm_exec->vm, &vm_exec->exec); +} + +int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file) +{ + struct xe_device *xe = to_xe_device(dev); + struct xe_file *xef = to_xe_file(file); + struct drm_xe_exec *args = data; + struct drm_xe_sync __user *syncs_user = u64_to_user_ptr(args->syncs); + u64 __user *addresses_user = u64_to_user_ptr(args->address); + struct xe_exec_queue *q; + struct xe_sync_entry *syncs = NULL; + u64 addresses[XE_HW_ENGINE_MAX_INSTANCE]; + struct drm_gpuvm_exec vm_exec = {.extra.fn = xe_exec_fn}; + struct drm_exec *exec = &vm_exec.exec; + u32 i, num_syncs = 0; + struct xe_sched_job *job; + struct dma_fence *rebind_fence; + struct xe_vm *vm; + bool write_locked; + ktime_t end = 0; + int err = 0; + + if (XE_IOCTL_DBG(xe, args->extensions) || + XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] || args->pad[2]) || + XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1])) + return -EINVAL; + + q = xe_exec_queue_lookup(xef, args->exec_queue_id); + if (XE_IOCTL_DBG(xe, !q)) + return -ENOENT; + + if (XE_IOCTL_DBG(xe, q->flags & EXEC_QUEUE_FLAG_VM)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, args->num_batch_buffer && + q->width != args->num_batch_buffer)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, q->flags & EXEC_QUEUE_FLAG_BANNED)) { + 
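+ /* Banned queues no longer accept new submissions */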
err = -ECANCELED; + goto err_exec_queue; + } + + if (args->num_syncs) { + syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL); + if (!syncs) { + err = -ENOMEM; + goto err_exec_queue; + } + } + + vm = q->vm; + + for (i = 0; i < args->num_syncs; i++) { + err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs++], + &syncs_user[i], SYNC_PARSE_FLAG_EXEC | + (xe_vm_in_lr_mode(vm) ? + SYNC_PARSE_FLAG_LR_MODE : 0)); + if (err) + goto err_syncs; + } + + if (xe_exec_queue_is_parallel(q)) { + err = __copy_from_user(addresses, addresses_user, sizeof(u64) * + q->width); + if (err) { + err = -EFAULT; + goto err_syncs; + } + } + +retry: + if (!xe_vm_in_lr_mode(vm) && xe_vm_userptr_check_repin(vm)) { + err = down_write_killable(&vm->lock); + write_locked = true; + } else { + /* We don't allow execs while the VM is in error state */ + err = down_read_interruptible(&vm->lock); + write_locked = false; + } + if (err) + goto err_syncs; + + if (write_locked) { + err = xe_vm_userptr_pin(vm); + downgrade_write(&vm->lock); + write_locked = false; + if (err) + goto err_unlock_list; + } + + vm_exec.vm = &vm->gpuvm; + vm_exec.num_fences = 1 + vm->xe->info.tile_count; + vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT; + if (xe_vm_in_lr_mode(vm)) { + drm_exec_init(exec, vm_exec.flags, 0); + } else { + err = drm_gpuvm_exec_lock(&vm_exec); + if (err) { + if (xe_vm_validate_should_retry(exec, err, &end)) + err = -EAGAIN; + goto err_unlock_list; + } + } + + if (xe_vm_is_closed_or_banned(q->vm)) { + drm_warn(&xe->drm, "Trying to schedule after vm is closed or banned\n"); + err = -ECANCELED; + goto err_exec; + } + + if (!args->num_batch_buffer) { + if (!xe_vm_in_lr_mode(vm)) { + struct dma_fence *fence; + + fence = xe_sync_in_fence_get(syncs, num_syncs, q, vm); + if (IS_ERR(fence)) { + err = PTR_ERR(fence); + goto err_exec; + } + for (i = 0; i < num_syncs; i++) + xe_sync_entry_signal(&syncs[i], NULL, fence); + xe_exec_queue_last_fence_set(q, vm, fence); + dma_fence_put(fence); + } + + goto err_exec; + } + + if (xe_exec_queue_is_lr(q) && xe_exec_queue_ring_full(q)) { + err = -EWOULDBLOCK; + goto err_exec; + } + + job = xe_sched_job_create(q, xe_exec_queue_is_parallel(q) ? + addresses : &args->address); + if (IS_ERR(job)) { + err = PTR_ERR(job); + goto err_exec; + } + + /* + * Rebind any invalidated userptr or evicted BOs in the VM, non-compute + * VM mode only. + */ + rebind_fence = xe_vm_rebind(vm, false); + if (IS_ERR(rebind_fence)) { + err = PTR_ERR(rebind_fence); + goto err_put_job; + } + + /* + * We store the rebind_fence in the VM so subsequent execs don't get + * scheduled before the rebinds of userptrs / evicted BOs is complete. 
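+ * The job below only takes a dependency on it if it has not already signaled.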
+ */ + if (rebind_fence) { + dma_fence_put(vm->rebind_fence); + vm->rebind_fence = rebind_fence; + } + if (vm->rebind_fence) { + if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, + &vm->rebind_fence->flags)) { + dma_fence_put(vm->rebind_fence); + vm->rebind_fence = NULL; + } else { + dma_fence_get(vm->rebind_fence); + err = drm_sched_job_add_dependency(&job->drm, + vm->rebind_fence); + if (err) + goto err_put_job; + } + } + + /* Wait behind munmap style rebinds */ + if (!xe_vm_in_lr_mode(vm)) { + err = drm_sched_job_add_resv_dependencies(&job->drm, + xe_vm_resv(vm), + DMA_RESV_USAGE_KERNEL); + if (err) + goto err_put_job; + } + + for (i = 0; i < num_syncs && !err; i++) + err = xe_sync_entry_add_deps(&syncs[i], job); + if (err) + goto err_put_job; + + if (!xe_vm_in_lr_mode(vm)) { + err = xe_sched_job_last_fence_add_dep(job, vm); + if (err) + goto err_put_job; + + err = down_read_interruptible(&vm->userptr.notifier_lock); + if (err) + goto err_put_job; + + err = __xe_vm_userptr_needs_repin(vm); + if (err) + goto err_repin; + } + + /* + * Point of no return, if we error after this point just set an error on + * the job and let the DRM scheduler / backend clean up the job. + */ + xe_sched_job_arm(job); + if (!xe_vm_in_lr_mode(vm)) + drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, &job->drm.s_fence->finished, + DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_WRITE); + + for (i = 0; i < num_syncs; i++) + xe_sync_entry_signal(&syncs[i], job, + &job->drm.s_fence->finished); + + if (xe_exec_queue_is_lr(q)) + q->ring_ops->emit_job(job); + if (!xe_vm_in_lr_mode(vm)) + xe_exec_queue_last_fence_set(q, vm, &job->drm.s_fence->finished); + xe_sched_job_push(job); + xe_vm_reactivate_rebind(vm); + + if (!err && !xe_vm_in_lr_mode(vm)) { + spin_lock(&xe->ttm.lru_lock); + ttm_lru_bulk_move_tail(&vm->lru_bulk_move); + spin_unlock(&xe->ttm.lru_lock); + } + +err_repin: + if (!xe_vm_in_lr_mode(vm)) + up_read(&vm->userptr.notifier_lock); +err_put_job: + if (err) + xe_sched_job_put(job); +err_exec: + drm_exec_fini(exec); +err_unlock_list: + if (write_locked) + up_write(&vm->lock); + else + up_read(&vm->lock); + if (err == -EAGAIN) + goto retry; +err_syncs: + for (i = 0; i < num_syncs; i++) + xe_sync_entry_cleanup(&syncs[i]); + kfree(syncs); +err_exec_queue: + xe_exec_queue_put(q); + + return err; +} diff --git a/drivers/gpu/drm/xe/xe_exec.h b/drivers/gpu/drm/xe/xe_exec.h new file mode 100644 index 000000000000..e4932494cea3 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_exec.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_EXEC_H_ +#define _XE_EXEC_H_ + +struct drm_device; +struct drm_file; + +int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file); + +#endif diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c new file mode 100644 index 000000000000..44fe8097b7cd --- /dev/null +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -0,0 +1,956 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include "xe_exec_queue.h" + +#include <linux/nospec.h> + +#include <drm/drm_device.h> +#include <drm/drm_file.h> +#include <drm/xe_drm.h> + +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_hw_engine_class_sysfs.h" +#include "xe_hw_fence.h" +#include "xe_lrc.h" +#include "xe_macros.h" +#include "xe_migrate.h" +#include "xe_pm.h" +#include "xe_ring_ops_types.h" +#include "xe_trace.h" +#include "xe_vm.h" + +enum xe_exec_queue_sched_prop { + XE_EXEC_QUEUE_JOB_TIMEOUT = 0, + XE_EXEC_QUEUE_TIMESLICE = 1, 
+ XE_EXEC_QUEUE_PREEMPT_TIMEOUT = 2, + XE_EXEC_QUEUE_SCHED_PROP_MAX = 3, +}; + +static struct xe_exec_queue *__xe_exec_queue_create(struct xe_device *xe, + struct xe_vm *vm, + u32 logical_mask, + u16 width, struct xe_hw_engine *hwe, + u32 flags) +{ + struct xe_exec_queue *q; + struct xe_gt *gt = hwe->gt; + int err; + int i; + + /* only kernel queues can be permanent */ + XE_WARN_ON((flags & EXEC_QUEUE_FLAG_PERMANENT) && !(flags & EXEC_QUEUE_FLAG_KERNEL)); + + q = kzalloc(sizeof(*q) + sizeof(struct xe_lrc) * width, GFP_KERNEL); + if (!q) + return ERR_PTR(-ENOMEM); + + kref_init(&q->refcount); + q->flags = flags; + q->hwe = hwe; + q->gt = gt; + if (vm) + q->vm = xe_vm_get(vm); + q->class = hwe->class; + q->width = width; + q->logical_mask = logical_mask; + q->fence_irq = >->fence_irq[hwe->class]; + q->ring_ops = gt->ring_ops[hwe->class]; + q->ops = gt->exec_queue_ops; + INIT_LIST_HEAD(&q->persistent.link); + INIT_LIST_HEAD(&q->compute.link); + INIT_LIST_HEAD(&q->multi_gt_link); + + q->sched_props.timeslice_us = hwe->eclass->sched_props.timeslice_us; + q->sched_props.preempt_timeout_us = + hwe->eclass->sched_props.preempt_timeout_us; + + if (xe_exec_queue_is_parallel(q)) { + q->parallel.composite_fence_ctx = dma_fence_context_alloc(1); + q->parallel.composite_fence_seqno = XE_FENCE_INITIAL_SEQNO; + } + if (q->flags & EXEC_QUEUE_FLAG_VM) { + q->bind.fence_ctx = dma_fence_context_alloc(1); + q->bind.fence_seqno = XE_FENCE_INITIAL_SEQNO; + } + + for (i = 0; i < width; ++i) { + err = xe_lrc_init(q->lrc + i, hwe, q, vm, SZ_16K); + if (err) + goto err_lrc; + } + + err = q->ops->init(q); + if (err) + goto err_lrc; + + /* + * Normally the user vm holds an rpm ref to keep the device + * awake, and the context holds a ref for the vm, however for + * some engines we use the kernels migrate vm underneath which offers no + * such rpm ref, or we lack a vm. Make sure we keep a ref here, so we + * can perform GuC CT actions when needed. Caller is expected to have + * already grabbed the rpm ref outside any sensitive locks. 
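+ * The matching xe_device_mem_access_put() is done in xe_exec_queue_fini().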
+ */ + if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && (q->flags & EXEC_QUEUE_FLAG_VM || !vm)) + drm_WARN_ON(&xe->drm, !xe_device_mem_access_get_if_ongoing(xe)); + + return q; + +err_lrc: + for (i = i - 1; i >= 0; --i) + xe_lrc_finish(q->lrc + i); + kfree(q); + return ERR_PTR(err); +} + +struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *vm, + u32 logical_mask, u16 width, + struct xe_hw_engine *hwe, u32 flags) +{ + struct xe_exec_queue *q; + int err; + + if (vm) { + err = xe_vm_lock(vm, true); + if (err) + return ERR_PTR(err); + } + q = __xe_exec_queue_create(xe, vm, logical_mask, width, hwe, flags); + if (vm) + xe_vm_unlock(vm); + + return q; +} + +struct xe_exec_queue *xe_exec_queue_create_class(struct xe_device *xe, struct xe_gt *gt, + struct xe_vm *vm, + enum xe_engine_class class, u32 flags) +{ + struct xe_hw_engine *hwe, *hwe0 = NULL; + enum xe_hw_engine_id id; + u32 logical_mask = 0; + + for_each_hw_engine(hwe, gt, id) { + if (xe_hw_engine_is_reserved(hwe)) + continue; + + if (hwe->class == class) { + logical_mask |= BIT(hwe->logical_instance); + if (!hwe0) + hwe0 = hwe; + } + } + + if (!logical_mask) + return ERR_PTR(-ENODEV); + + return xe_exec_queue_create(xe, vm, logical_mask, 1, hwe0, flags); +} + +void xe_exec_queue_destroy(struct kref *ref) +{ + struct xe_exec_queue *q = container_of(ref, struct xe_exec_queue, refcount); + struct xe_exec_queue *eq, *next; + + xe_exec_queue_last_fence_put_unlocked(q); + if (!(q->flags & EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD)) { + list_for_each_entry_safe(eq, next, &q->multi_gt_list, + multi_gt_link) + xe_exec_queue_put(eq); + } + + q->ops->fini(q); +} + +void xe_exec_queue_fini(struct xe_exec_queue *q) +{ + int i; + + for (i = 0; i < q->width; ++i) + xe_lrc_finish(q->lrc + i); + if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && (q->flags & EXEC_QUEUE_FLAG_VM || !q->vm)) + xe_device_mem_access_put(gt_to_xe(q->gt)); + if (q->vm) + xe_vm_put(q->vm); + + kfree(q); +} + +void xe_exec_queue_assign_name(struct xe_exec_queue *q, u32 instance) +{ + switch (q->class) { + case XE_ENGINE_CLASS_RENDER: + sprintf(q->name, "rcs%d", instance); + break; + case XE_ENGINE_CLASS_VIDEO_DECODE: + sprintf(q->name, "vcs%d", instance); + break; + case XE_ENGINE_CLASS_VIDEO_ENHANCE: + sprintf(q->name, "vecs%d", instance); + break; + case XE_ENGINE_CLASS_COPY: + sprintf(q->name, "bcs%d", instance); + break; + case XE_ENGINE_CLASS_COMPUTE: + sprintf(q->name, "ccs%d", instance); + break; + case XE_ENGINE_CLASS_OTHER: + sprintf(q->name, "gsccs%d", instance); + break; + default: + XE_WARN_ON(q->class); + } +} + +struct xe_exec_queue *xe_exec_queue_lookup(struct xe_file *xef, u32 id) +{ + struct xe_exec_queue *q; + + mutex_lock(&xef->exec_queue.lock); + q = xa_load(&xef->exec_queue.xa, id); + if (q) + xe_exec_queue_get(q); + mutex_unlock(&xef->exec_queue.lock); + + return q; +} + +enum xe_exec_queue_priority +xe_exec_queue_device_get_max_priority(struct xe_device *xe) +{ + return capable(CAP_SYS_NICE) ? 
XE_EXEC_QUEUE_PRIORITY_HIGH : + XE_EXEC_QUEUE_PRIORITY_NORMAL; +} + +static int exec_queue_set_priority(struct xe_device *xe, struct xe_exec_queue *q, + u64 value, bool create) +{ + if (XE_IOCTL_DBG(xe, value > XE_EXEC_QUEUE_PRIORITY_HIGH)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, value > xe_exec_queue_device_get_max_priority(xe))) + return -EPERM; + + return q->ops->set_priority(q, value); +} + +static bool xe_exec_queue_enforce_schedule_limit(void) +{ +#if IS_ENABLED(CONFIG_DRM_XE_ENABLE_SCHEDTIMEOUT_LIMIT) + return true; +#else + return !capable(CAP_SYS_NICE); +#endif +} + +static void +xe_exec_queue_get_prop_minmax(struct xe_hw_engine_class_intf *eclass, + enum xe_exec_queue_sched_prop prop, + u32 *min, u32 *max) +{ + switch (prop) { + case XE_EXEC_QUEUE_JOB_TIMEOUT: + *min = eclass->sched_props.job_timeout_min; + *max = eclass->sched_props.job_timeout_max; + break; + case XE_EXEC_QUEUE_TIMESLICE: + *min = eclass->sched_props.timeslice_min; + *max = eclass->sched_props.timeslice_max; + break; + case XE_EXEC_QUEUE_PREEMPT_TIMEOUT: + *min = eclass->sched_props.preempt_timeout_min; + *max = eclass->sched_props.preempt_timeout_max; + break; + default: + break; + } +#if IS_ENABLED(CONFIG_DRM_XE_ENABLE_SCHEDTIMEOUT_LIMIT) + if (capable(CAP_SYS_NICE)) { + switch (prop) { + case XE_EXEC_QUEUE_JOB_TIMEOUT: + *min = XE_HW_ENGINE_JOB_TIMEOUT_MIN; + *max = XE_HW_ENGINE_JOB_TIMEOUT_MAX; + break; + case XE_EXEC_QUEUE_TIMESLICE: + *min = XE_HW_ENGINE_TIMESLICE_MIN; + *max = XE_HW_ENGINE_TIMESLICE_MAX; + break; + case XE_EXEC_QUEUE_PREEMPT_TIMEOUT: + *min = XE_HW_ENGINE_PREEMPT_TIMEOUT_MIN; + *max = XE_HW_ENGINE_PREEMPT_TIMEOUT_MAX; + break; + default: + break; + } + } +#endif +} + +static int exec_queue_set_timeslice(struct xe_device *xe, struct xe_exec_queue *q, + u64 value, bool create) +{ + u32 min = 0, max = 0; + + xe_exec_queue_get_prop_minmax(q->hwe->eclass, + XE_EXEC_QUEUE_TIMESLICE, &min, &max); + + if (xe_exec_queue_enforce_schedule_limit() && + !xe_hw_engine_timeout_in_range(value, min, max)) + return -EINVAL; + + return q->ops->set_timeslice(q, value); +} + +static int exec_queue_set_preemption_timeout(struct xe_device *xe, + struct xe_exec_queue *q, u64 value, + bool create) +{ + u32 min = 0, max = 0; + + xe_exec_queue_get_prop_minmax(q->hwe->eclass, + XE_EXEC_QUEUE_PREEMPT_TIMEOUT, &min, &max); + + if (xe_exec_queue_enforce_schedule_limit() && + !xe_hw_engine_timeout_in_range(value, min, max)) + return -EINVAL; + + return q->ops->set_preempt_timeout(q, value); +} + +static int exec_queue_set_persistence(struct xe_device *xe, struct xe_exec_queue *q, + u64 value, bool create) +{ + if (XE_IOCTL_DBG(xe, !create)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, xe_vm_in_preempt_fence_mode(q->vm))) + return -EINVAL; + + if (value) + q->flags |= EXEC_QUEUE_FLAG_PERSISTENT; + else + q->flags &= ~EXEC_QUEUE_FLAG_PERSISTENT; + + return 0; +} + +static int exec_queue_set_job_timeout(struct xe_device *xe, struct xe_exec_queue *q, + u64 value, bool create) +{ + u32 min = 0, max = 0; + + if (XE_IOCTL_DBG(xe, !create)) + return -EINVAL; + + xe_exec_queue_get_prop_minmax(q->hwe->eclass, + XE_EXEC_QUEUE_JOB_TIMEOUT, &min, &max); + + if (xe_exec_queue_enforce_schedule_limit() && + !xe_hw_engine_timeout_in_range(value, min, max)) + return -EINVAL; + + return q->ops->set_job_timeout(q, value); +} + +static int exec_queue_set_acc_trigger(struct xe_device *xe, struct xe_exec_queue *q, + u64 value, bool create) +{ + if (XE_IOCTL_DBG(xe, !create)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, !xe->info.has_usm)) + 
return -EINVAL; + + q->usm.acc_trigger = value; + + return 0; +} + +static int exec_queue_set_acc_notify(struct xe_device *xe, struct xe_exec_queue *q, + u64 value, bool create) +{ + if (XE_IOCTL_DBG(xe, !create)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, !xe->info.has_usm)) + return -EINVAL; + + q->usm.acc_notify = value; + + return 0; +} + +static int exec_queue_set_acc_granularity(struct xe_device *xe, struct xe_exec_queue *q, + u64 value, bool create) +{ + if (XE_IOCTL_DBG(xe, !create)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, !xe->info.has_usm)) + return -EINVAL; + + if (value > DRM_XE_ACC_GRANULARITY_64M) + return -EINVAL; + + q->usm.acc_granularity = value; + + return 0; +} + +typedef int (*xe_exec_queue_set_property_fn)(struct xe_device *xe, + struct xe_exec_queue *q, + u64 value, bool create); + +static const xe_exec_queue_set_property_fn exec_queue_set_property_funcs[] = { + [DRM_XE_EXEC_QUEUE_SET_PROPERTY_PRIORITY] = exec_queue_set_priority, + [DRM_XE_EXEC_QUEUE_SET_PROPERTY_TIMESLICE] = exec_queue_set_timeslice, + [DRM_XE_EXEC_QUEUE_SET_PROPERTY_PREEMPTION_TIMEOUT] = exec_queue_set_preemption_timeout, + [DRM_XE_EXEC_QUEUE_SET_PROPERTY_PERSISTENCE] = exec_queue_set_persistence, + [DRM_XE_EXEC_QUEUE_SET_PROPERTY_JOB_TIMEOUT] = exec_queue_set_job_timeout, + [DRM_XE_EXEC_QUEUE_SET_PROPERTY_ACC_TRIGGER] = exec_queue_set_acc_trigger, + [DRM_XE_EXEC_QUEUE_SET_PROPERTY_ACC_NOTIFY] = exec_queue_set_acc_notify, + [DRM_XE_EXEC_QUEUE_SET_PROPERTY_ACC_GRANULARITY] = exec_queue_set_acc_granularity, +}; + +static int exec_queue_user_ext_set_property(struct xe_device *xe, + struct xe_exec_queue *q, + u64 extension, + bool create) +{ + u64 __user *address = u64_to_user_ptr(extension); + struct drm_xe_ext_set_property ext; + int err; + u32 idx; + + err = __copy_from_user(&ext, address, sizeof(ext)); + if (XE_IOCTL_DBG(xe, err)) + return -EFAULT; + + if (XE_IOCTL_DBG(xe, ext.property >= + ARRAY_SIZE(exec_queue_set_property_funcs)) || + XE_IOCTL_DBG(xe, ext.pad)) + return -EINVAL; + + idx = array_index_nospec(ext.property, ARRAY_SIZE(exec_queue_set_property_funcs)); + return exec_queue_set_property_funcs[idx](xe, q, ext.value, create); +} + +typedef int (*xe_exec_queue_user_extension_fn)(struct xe_device *xe, + struct xe_exec_queue *q, + u64 extension, + bool create); + +static const xe_exec_queue_set_property_fn exec_queue_user_extension_funcs[] = { + [DRM_XE_EXEC_QUEUE_EXTENSION_SET_PROPERTY] = exec_queue_user_ext_set_property, +}; + +#define MAX_USER_EXTENSIONS 16 +static int exec_queue_user_extensions(struct xe_device *xe, struct xe_exec_queue *q, + u64 extensions, int ext_number, bool create) +{ + u64 __user *address = u64_to_user_ptr(extensions); + struct drm_xe_user_extension ext; + int err; + u32 idx; + + if (XE_IOCTL_DBG(xe, ext_number >= MAX_USER_EXTENSIONS)) + return -E2BIG; + + err = __copy_from_user(&ext, address, sizeof(ext)); + if (XE_IOCTL_DBG(xe, err)) + return -EFAULT; + + if (XE_IOCTL_DBG(xe, ext.pad) || + XE_IOCTL_DBG(xe, ext.name >= + ARRAY_SIZE(exec_queue_user_extension_funcs))) + return -EINVAL; + + idx = array_index_nospec(ext.name, + ARRAY_SIZE(exec_queue_user_extension_funcs)); + err = exec_queue_user_extension_funcs[idx](xe, q, extensions, create); + if (XE_IOCTL_DBG(xe, err)) + return err; + + if (ext.next_extension) + return exec_queue_user_extensions(xe, q, ext.next_extension, + ++ext_number, create); + + return 0; +} + +static const enum xe_engine_class user_to_xe_engine_class[] = { + [DRM_XE_ENGINE_CLASS_RENDER] = XE_ENGINE_CLASS_RENDER, + 
[DRM_XE_ENGINE_CLASS_COPY] = XE_ENGINE_CLASS_COPY, + [DRM_XE_ENGINE_CLASS_VIDEO_DECODE] = XE_ENGINE_CLASS_VIDEO_DECODE, + [DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE] = XE_ENGINE_CLASS_VIDEO_ENHANCE, + [DRM_XE_ENGINE_CLASS_COMPUTE] = XE_ENGINE_CLASS_COMPUTE, +}; + +static struct xe_hw_engine * +find_hw_engine(struct xe_device *xe, + struct drm_xe_engine_class_instance eci) +{ + u32 idx; + + if (eci.engine_class > ARRAY_SIZE(user_to_xe_engine_class)) + return NULL; + + if (eci.gt_id >= xe->info.gt_count) + return NULL; + + idx = array_index_nospec(eci.engine_class, + ARRAY_SIZE(user_to_xe_engine_class)); + + return xe_gt_hw_engine(xe_device_get_gt(xe, eci.gt_id), + user_to_xe_engine_class[idx], + eci.engine_instance, true); +} + +static u32 bind_exec_queue_logical_mask(struct xe_device *xe, struct xe_gt *gt, + struct drm_xe_engine_class_instance *eci, + u16 width, u16 num_placements) +{ + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + u32 logical_mask = 0; + + if (XE_IOCTL_DBG(xe, width != 1)) + return 0; + if (XE_IOCTL_DBG(xe, num_placements != 1)) + return 0; + if (XE_IOCTL_DBG(xe, eci[0].engine_instance != 0)) + return 0; + + eci[0].engine_class = DRM_XE_ENGINE_CLASS_COPY; + + for_each_hw_engine(hwe, gt, id) { + if (xe_hw_engine_is_reserved(hwe)) + continue; + + if (hwe->class == + user_to_xe_engine_class[DRM_XE_ENGINE_CLASS_COPY]) + logical_mask |= BIT(hwe->logical_instance); + } + + return logical_mask; +} + +static u32 calc_validate_logical_mask(struct xe_device *xe, struct xe_gt *gt, + struct drm_xe_engine_class_instance *eci, + u16 width, u16 num_placements) +{ + int len = width * num_placements; + int i, j, n; + u16 class; + u16 gt_id; + u32 return_mask = 0, prev_mask; + + if (XE_IOCTL_DBG(xe, !xe_device_uc_enabled(xe) && + len > 1)) + return 0; + + for (i = 0; i < width; ++i) { + u32 current_mask = 0; + + for (j = 0; j < num_placements; ++j) { + struct xe_hw_engine *hwe; + + n = j * width + i; + + hwe = find_hw_engine(xe, eci[n]); + if (XE_IOCTL_DBG(xe, !hwe)) + return 0; + + if (XE_IOCTL_DBG(xe, xe_hw_engine_is_reserved(hwe))) + return 0; + + if (XE_IOCTL_DBG(xe, n && eci[n].gt_id != gt_id) || + XE_IOCTL_DBG(xe, n && eci[n].engine_class != class)) + return 0; + + class = eci[n].engine_class; + gt_id = eci[n].gt_id; + + if (width == 1 || !i) + return_mask |= BIT(eci[n].engine_instance); + current_mask |= BIT(eci[n].engine_instance); + } + + /* Parallel submissions must be logically contiguous */ + if (i && XE_IOCTL_DBG(xe, current_mask != prev_mask << 1)) + return 0; + + prev_mask = current_mask; + } + + return return_mask; +} + +int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data, + struct drm_file *file) +{ + struct xe_device *xe = to_xe_device(dev); + struct xe_file *xef = to_xe_file(file); + struct drm_xe_exec_queue_create *args = data; + struct drm_xe_engine_class_instance eci[XE_HW_ENGINE_MAX_INSTANCE]; + struct drm_xe_engine_class_instance __user *user_eci = + u64_to_user_ptr(args->instances); + struct xe_hw_engine *hwe; + struct xe_vm *vm, *migrate_vm; + struct xe_gt *gt; + struct xe_exec_queue *q = NULL; + u32 logical_mask; + u32 id; + u32 len; + int err; + + if (XE_IOCTL_DBG(xe, args->flags) || + XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1])) + return -EINVAL; + + len = args->width * args->num_placements; + if (XE_IOCTL_DBG(xe, !len || len > XE_HW_ENGINE_MAX_INSTANCE)) + return -EINVAL; + + err = __copy_from_user(eci, user_eci, + sizeof(struct drm_xe_engine_class_instance) * + len); + if (XE_IOCTL_DBG(xe, err)) + return -EFAULT; + + if 
(XE_IOCTL_DBG(xe, eci[0].gt_id >= xe->info.gt_count)) + return -EINVAL; + + if (eci[0].engine_class == DRM_XE_ENGINE_CLASS_VM_BIND) { + for_each_gt(gt, xe, id) { + struct xe_exec_queue *new; + + if (xe_gt_is_media_type(gt)) + continue; + + eci[0].gt_id = gt->info.id; + logical_mask = bind_exec_queue_logical_mask(xe, gt, eci, + args->width, + args->num_placements); + if (XE_IOCTL_DBG(xe, !logical_mask)) + return -EINVAL; + + hwe = find_hw_engine(xe, eci[0]); + if (XE_IOCTL_DBG(xe, !hwe)) + return -EINVAL; + + /* The migration vm doesn't hold rpm ref */ + xe_device_mem_access_get(xe); + + migrate_vm = xe_migrate_get_vm(gt_to_tile(gt)->migrate); + new = xe_exec_queue_create(xe, migrate_vm, logical_mask, + args->width, hwe, + EXEC_QUEUE_FLAG_PERSISTENT | + EXEC_QUEUE_FLAG_VM | + (id ? + EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD : + 0)); + + xe_device_mem_access_put(xe); /* now held by engine */ + + xe_vm_put(migrate_vm); + if (IS_ERR(new)) { + err = PTR_ERR(new); + if (q) + goto put_exec_queue; + return err; + } + if (id == 0) + q = new; + else + list_add_tail(&new->multi_gt_list, + &q->multi_gt_link); + } + } else { + gt = xe_device_get_gt(xe, eci[0].gt_id); + logical_mask = calc_validate_logical_mask(xe, gt, eci, + args->width, + args->num_placements); + if (XE_IOCTL_DBG(xe, !logical_mask)) + return -EINVAL; + + hwe = find_hw_engine(xe, eci[0]); + if (XE_IOCTL_DBG(xe, !hwe)) + return -EINVAL; + + vm = xe_vm_lookup(xef, args->vm_id); + if (XE_IOCTL_DBG(xe, !vm)) + return -ENOENT; + + err = down_read_interruptible(&vm->lock); + if (err) { + xe_vm_put(vm); + return err; + } + + if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) { + up_read(&vm->lock); + xe_vm_put(vm); + return -ENOENT; + } + + q = xe_exec_queue_create(xe, vm, logical_mask, + args->width, hwe, + xe_vm_in_lr_mode(vm) ? 0 : + EXEC_QUEUE_FLAG_PERSISTENT); + up_read(&vm->lock); + xe_vm_put(vm); + if (IS_ERR(q)) + return PTR_ERR(q); + + if (xe_vm_in_preempt_fence_mode(vm)) { + q->compute.context = dma_fence_context_alloc(1); + spin_lock_init(&q->compute.lock); + + err = xe_vm_add_compute_exec_queue(vm, q); + if (XE_IOCTL_DBG(xe, err)) + goto put_exec_queue; + } + } + + if (args->extensions) { + err = exec_queue_user_extensions(xe, q, args->extensions, 0, true); + if (XE_IOCTL_DBG(xe, err)) + goto kill_exec_queue; + } + + q->persistent.xef = xef; + + mutex_lock(&xef->exec_queue.lock); + err = xa_alloc(&xef->exec_queue.xa, &id, q, xa_limit_32b, GFP_KERNEL); + mutex_unlock(&xef->exec_queue.lock); + if (err) + goto kill_exec_queue; + + args->exec_queue_id = id; + + return 0; + +kill_exec_queue: + xe_exec_queue_kill(q); +put_exec_queue: + xe_exec_queue_put(q); + return err; +} + +int xe_exec_queue_get_property_ioctl(struct drm_device *dev, void *data, + struct drm_file *file) +{ + struct xe_device *xe = to_xe_device(dev); + struct xe_file *xef = to_xe_file(file); + struct drm_xe_exec_queue_get_property *args = data; + struct xe_exec_queue *q; + int ret; + + if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1])) + return -EINVAL; + + q = xe_exec_queue_lookup(xef, args->exec_queue_id); + if (XE_IOCTL_DBG(xe, !q)) + return -ENOENT; + + switch (args->property) { + case DRM_XE_EXEC_QUEUE_GET_PROPERTY_BAN: + args->value = !!(q->flags & EXEC_QUEUE_FLAG_BANNED); + ret = 0; + break; + default: + ret = -EINVAL; + } + + xe_exec_queue_put(q); + + return ret; +} + +/** + * xe_exec_queue_is_lr() - Whether an exec_queue is long-running + * @q: The exec_queue + * + * Return: True if the exec_queue is long-running, false otherwise. 
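+ * (i.e. the queue's VM is in LR mode and the queue itself is not a VM-bind
+ * queue flagged with EXEC_QUEUE_FLAG_VM).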
+ */ +bool xe_exec_queue_is_lr(struct xe_exec_queue *q) +{ + return q->vm && xe_vm_in_lr_mode(q->vm) && + !(q->flags & EXEC_QUEUE_FLAG_VM); +} + +static s32 xe_exec_queue_num_job_inflight(struct xe_exec_queue *q) +{ + return q->lrc->fence_ctx.next_seqno - xe_lrc_seqno(q->lrc) - 1; +} + +/** + * xe_exec_queue_ring_full() - Whether an exec_queue's ring is full + * @q: The exec_queue + * + * Return: True if the exec_queue's ring is full, false otherwise. + */ +bool xe_exec_queue_ring_full(struct xe_exec_queue *q) +{ + struct xe_lrc *lrc = q->lrc; + s32 max_job = lrc->ring.size / MAX_JOB_SIZE_BYTES; + + return xe_exec_queue_num_job_inflight(q) >= max_job; +} + +/** + * xe_exec_queue_is_idle() - Whether an exec_queue is idle. + * @q: The exec_queue + * + * FIXME: Need to determine what to use as the short-lived + * timeline lock for the exec_queues, so that the return value + * of this function becomes more than just an advisory + * snapshot in time. The timeline lock must protect the + * seqno from racing submissions on the same exec_queue. + * Typically vm->resv, but user-created timeline locks use the migrate vm + * and never grabs the migrate vm->resv so we have a race there. + * + * Return: True if the exec_queue is idle, false otherwise. + */ +bool xe_exec_queue_is_idle(struct xe_exec_queue *q) +{ + if (xe_exec_queue_is_parallel(q)) { + int i; + + for (i = 0; i < q->width; ++i) { + if (xe_lrc_seqno(&q->lrc[i]) != + q->lrc[i].fence_ctx.next_seqno - 1) + return false; + } + + return true; + } + + return xe_lrc_seqno(&q->lrc[0]) == + q->lrc[0].fence_ctx.next_seqno - 1; +} + +void xe_exec_queue_kill(struct xe_exec_queue *q) +{ + struct xe_exec_queue *eq = q, *next; + + list_for_each_entry_safe(eq, next, &eq->multi_gt_list, + multi_gt_link) { + q->ops->kill(eq); + xe_vm_remove_compute_exec_queue(q->vm, eq); + } + + q->ops->kill(q); + xe_vm_remove_compute_exec_queue(q->vm, q); +} + +int xe_exec_queue_destroy_ioctl(struct drm_device *dev, void *data, + struct drm_file *file) +{ + struct xe_device *xe = to_xe_device(dev); + struct xe_file *xef = to_xe_file(file); + struct drm_xe_exec_queue_destroy *args = data; + struct xe_exec_queue *q; + + if (XE_IOCTL_DBG(xe, args->pad) || + XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1])) + return -EINVAL; + + mutex_lock(&xef->exec_queue.lock); + q = xa_erase(&xef->exec_queue.xa, args->exec_queue_id); + mutex_unlock(&xef->exec_queue.lock); + if (XE_IOCTL_DBG(xe, !q)) + return -ENOENT; + + if (!(q->flags & EXEC_QUEUE_FLAG_PERSISTENT)) + xe_exec_queue_kill(q); + else + xe_device_add_persistent_exec_queues(xe, q); + + trace_xe_exec_queue_close(q); + xe_exec_queue_put(q); + + return 0; +} + +static void xe_exec_queue_last_fence_lockdep_assert(struct xe_exec_queue *q, + struct xe_vm *vm) +{ + if (q->flags & EXEC_QUEUE_FLAG_VM) + lockdep_assert_held(&vm->lock); + else + xe_vm_assert_held(vm); +} + +/** + * xe_exec_queue_last_fence_put() - Drop ref to last fence + * @q: The exec queue + * @vm: The VM the engine does a bind or exec for + */ +void xe_exec_queue_last_fence_put(struct xe_exec_queue *q, struct xe_vm *vm) +{ + xe_exec_queue_last_fence_lockdep_assert(q, vm); + + if (q->last_fence) { + dma_fence_put(q->last_fence); + q->last_fence = NULL; + } +} + +/** + * xe_exec_queue_last_fence_put_unlocked() - Drop ref to last fence unlocked + * @q: The exec queue + * + * Only safe to be called from xe_exec_queue_destroy(). 
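+ * At that point the queue's refcount has dropped to zero, so nothing else
+ * can race on q->last_fence and the usual VM locking is not required.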
+ */ +void xe_exec_queue_last_fence_put_unlocked(struct xe_exec_queue *q) +{ + if (q->last_fence) { + dma_fence_put(q->last_fence); + q->last_fence = NULL; + } +} + +/** + * xe_exec_queue_last_fence_get() - Get last fence + * @q: The exec queue + * @vm: The VM the engine does a bind or exec for + * + * Get last fence, does not take a ref + * + * Returns: last fence if not signaled, dma fence stub if signaled + */ +struct dma_fence *xe_exec_queue_last_fence_get(struct xe_exec_queue *q, + struct xe_vm *vm) +{ + xe_exec_queue_last_fence_lockdep_assert(q, vm); + + if (q->last_fence && + test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &q->last_fence->flags)) + xe_exec_queue_last_fence_put(q, vm); + + return q->last_fence ? q->last_fence : dma_fence_get_stub(); +} + +/** + * xe_exec_queue_last_fence_set() - Set last fence + * @q: The exec queue + * @vm: The VM the engine does a bind or exec for + * @fence: The fence + * + * Set the last fence for the engine. Increases reference count for fence, when + * closing engine xe_exec_queue_last_fence_put should be called. + */ +void xe_exec_queue_last_fence_set(struct xe_exec_queue *q, struct xe_vm *vm, + struct dma_fence *fence) +{ + xe_exec_queue_last_fence_lockdep_assert(q, vm); + + xe_exec_queue_last_fence_put(q, vm); + q->last_fence = dma_fence_get(fence); +} diff --git a/drivers/gpu/drm/xe/xe_exec_queue.h b/drivers/gpu/drm/xe/xe_exec_queue.h new file mode 100644 index 000000000000..d959cc4a1a82 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_exec_queue.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef _XE_EXEC_QUEUE_H_ +#define _XE_EXEC_QUEUE_H_ + +#include "xe_exec_queue_types.h" +#include "xe_vm_types.h" + +struct drm_device; +struct drm_file; +struct xe_device; +struct xe_file; + +struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *vm, + u32 logical_mask, u16 width, + struct xe_hw_engine *hw_engine, u32 flags); +struct xe_exec_queue *xe_exec_queue_create_class(struct xe_device *xe, struct xe_gt *gt, + struct xe_vm *vm, + enum xe_engine_class class, u32 flags); + +void xe_exec_queue_fini(struct xe_exec_queue *q); +void xe_exec_queue_destroy(struct kref *ref); +void xe_exec_queue_assign_name(struct xe_exec_queue *q, u32 instance); + +struct xe_exec_queue *xe_exec_queue_lookup(struct xe_file *xef, u32 id); + +static inline struct xe_exec_queue *xe_exec_queue_get(struct xe_exec_queue *q) +{ + kref_get(&q->refcount); + return q; +} + +static inline void xe_exec_queue_put(struct xe_exec_queue *q) +{ + kref_put(&q->refcount, xe_exec_queue_destroy); +} + +static inline bool xe_exec_queue_is_parallel(struct xe_exec_queue *q) +{ + return q->width > 1; +} + +bool xe_exec_queue_is_lr(struct xe_exec_queue *q); + +bool xe_exec_queue_ring_full(struct xe_exec_queue *q); + +bool xe_exec_queue_is_idle(struct xe_exec_queue *q); + +void xe_exec_queue_kill(struct xe_exec_queue *q); + +int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data, + struct drm_file *file); +int xe_exec_queue_destroy_ioctl(struct drm_device *dev, void *data, + struct drm_file *file); +int xe_exec_queue_get_property_ioctl(struct drm_device *dev, void *data, + struct drm_file *file); +enum xe_exec_queue_priority xe_exec_queue_device_get_max_priority(struct xe_device *xe); + +void xe_exec_queue_last_fence_put(struct xe_exec_queue *e, struct xe_vm *vm); +void xe_exec_queue_last_fence_put_unlocked(struct xe_exec_queue *e); +struct dma_fence *xe_exec_queue_last_fence_get(struct xe_exec_queue *e, + struct xe_vm 
*vm); +void xe_exec_queue_last_fence_set(struct xe_exec_queue *e, struct xe_vm *vm, + struct dma_fence *fence); + +#endif diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h new file mode 100644 index 000000000000..3d7e704ec3d9 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -0,0 +1,222 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_EXEC_QUEUE_TYPES_H_ +#define _XE_EXEC_QUEUE_TYPES_H_ + +#include <linux/kref.h> + +#include <drm/gpu_scheduler.h> + +#include "xe_gpu_scheduler_types.h" +#include "xe_hw_engine_types.h" +#include "xe_hw_fence_types.h" +#include "xe_lrc_types.h" + +struct xe_execlist_exec_queue; +struct xe_gt; +struct xe_guc_exec_queue; +struct xe_hw_engine; +struct xe_vm; + +enum xe_exec_queue_priority { + XE_EXEC_QUEUE_PRIORITY_UNSET = -2, /* For execlist usage only */ + XE_EXEC_QUEUE_PRIORITY_LOW = 0, + XE_EXEC_QUEUE_PRIORITY_NORMAL, + XE_EXEC_QUEUE_PRIORITY_HIGH, + XE_EXEC_QUEUE_PRIORITY_KERNEL, + + XE_EXEC_QUEUE_PRIORITY_COUNT +}; + +/** + * struct xe_exec_queue - Execution queue + * + * Contains all state necessary for submissions. Can either be a user object or + * a kernel object. + */ +struct xe_exec_queue { + /** @gt: graphics tile this exec queue can submit to */ + struct xe_gt *gt; + /** + * @hwe: A hardware of the same class. May (physical engine) or may not + * (virtual engine) be where jobs actual engine up running. Should never + * really be used for submissions. + */ + struct xe_hw_engine *hwe; + /** @refcount: ref count of this exec queue */ + struct kref refcount; + /** @vm: VM (address space) for this exec queue */ + struct xe_vm *vm; + /** @class: class of this exec queue */ + enum xe_engine_class class; + /** @priority: priority of this exec queue */ + enum xe_exec_queue_priority priority; + /** + * @logical_mask: logical mask of where job submitted to exec queue can run + */ + u32 logical_mask; + /** @name: name of this exec queue */ + char name[MAX_FENCE_NAME_LEN]; + /** @width: width (number BB submitted per exec) of this exec queue */ + u16 width; + /** @fence_irq: fence IRQ used to signal job completion */ + struct xe_hw_fence_irq *fence_irq; + + /** + * @last_fence: last fence on exec queue, protected by vm->lock in write + * mode if bind exec queue, protected by dma resv lock if non-bind exec + * queue + */ + struct dma_fence *last_fence; + +/* queue no longer allowed to submit */ +#define EXEC_QUEUE_FLAG_BANNED BIT(0) +/* queue used for kernel submission only */ +#define EXEC_QUEUE_FLAG_KERNEL BIT(1) +/* kernel engine only destroyed at driver unload */ +#define EXEC_QUEUE_FLAG_PERMANENT BIT(2) +/* queue keeps running pending jobs after destroy ioctl */ +#define EXEC_QUEUE_FLAG_PERSISTENT BIT(3) +/* for VM jobs. 
Caller needs to hold rpm ref when creating queue with this flag */ +#define EXEC_QUEUE_FLAG_VM BIT(4) +/* child of VM queue for multi-tile VM jobs */ +#define EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD BIT(5) + + /** + * @flags: flags for this exec queue, should statically setup aside from ban + * bit + */ + unsigned long flags; + + union { + /** @multi_gt_list: list head for VM bind engines if multi-GT */ + struct list_head multi_gt_list; + /** @multi_gt_link: link for VM bind engines if multi-GT */ + struct list_head multi_gt_link; + }; + + union { + /** @execlist: execlist backend specific state for exec queue */ + struct xe_execlist_exec_queue *execlist; + /** @guc: GuC backend specific state for exec queue */ + struct xe_guc_exec_queue *guc; + }; + + /** + * @persistent: persistent exec queue state + */ + struct { + /** @xef: file which this exec queue belongs to */ + struct xe_file *xef; + /** @link: link in list of persistent exec queues */ + struct list_head link; + } persistent; + + union { + /** + * @parallel: parallel submission state + */ + struct { + /** @composite_fence_ctx: context composite fence */ + u64 composite_fence_ctx; + /** @composite_fence_seqno: seqno for composite fence */ + u32 composite_fence_seqno; + } parallel; + /** + * @bind: bind submission state + */ + struct { + /** @fence_ctx: context bind fence */ + u64 fence_ctx; + /** @fence_seqno: seqno for bind fence */ + u32 fence_seqno; + } bind; + }; + + /** @sched_props: scheduling properties */ + struct { + /** @timeslice_us: timeslice period in micro-seconds */ + u32 timeslice_us; + /** @preempt_timeout_us: preemption timeout in micro-seconds */ + u32 preempt_timeout_us; + } sched_props; + + /** @compute: compute exec queue state */ + struct { + /** @pfence: preemption fence */ + struct dma_fence *pfence; + /** @context: preemption fence context */ + u64 context; + /** @seqno: preemption fence seqno */ + u32 seqno; + /** @link: link into VM's list of exec queues */ + struct list_head link; + /** @lock: preemption fences lock */ + spinlock_t lock; + } compute; + + /** @usm: unified shared memory state */ + struct { + /** @acc_trigger: access counter trigger */ + u32 acc_trigger; + /** @acc_notify: access counter notify */ + u32 acc_notify; + /** @acc_granularity: access counter granularity */ + u32 acc_granularity; + } usm; + + /** @ops: submission backend exec queue operations */ + const struct xe_exec_queue_ops *ops; + + /** @ring_ops: ring operations for this exec queue */ + const struct xe_ring_ops *ring_ops; + /** @entity: DRM sched entity for this exec queue (1 to 1 relationship) */ + struct drm_sched_entity *entity; + /** @lrc: logical ring context for this exec queue */ + struct xe_lrc lrc[]; +}; + +/** + * struct xe_exec_queue_ops - Submission backend exec queue operations + */ +struct xe_exec_queue_ops { + /** @init: Initialize exec queue for submission backend */ + int (*init)(struct xe_exec_queue *q); + /** @kill: Kill inflight submissions for backend */ + void (*kill)(struct xe_exec_queue *q); + /** @fini: Fini exec queue for submission backend */ + void (*fini)(struct xe_exec_queue *q); + /** @set_priority: Set priority for exec queue */ + int (*set_priority)(struct xe_exec_queue *q, + enum xe_exec_queue_priority priority); + /** @set_timeslice: Set timeslice for exec queue */ + int (*set_timeslice)(struct xe_exec_queue *q, u32 timeslice_us); + /** @set_preempt_timeout: Set preemption timeout for exec queue */ + int (*set_preempt_timeout)(struct xe_exec_queue *q, u32 preempt_timeout_us); + /** 
@set_job_timeout: Set job timeout for exec queue */ + int (*set_job_timeout)(struct xe_exec_queue *q, u32 job_timeout_ms); + /** + * @suspend: Suspend exec queue from executing, allowed to be called + * multiple times in a row before resume with the caveat that + * suspend_wait returns before calling suspend again. + */ + int (*suspend)(struct xe_exec_queue *q); + /** + * @suspend_wait: Wait for an exec queue to suspend executing, should be + * call after suspend. + */ + void (*suspend_wait)(struct xe_exec_queue *q); + /** + * @resume: Resume exec queue execution, exec queue must be in a suspended + * state and dma fence returned from most recent suspend call must be + * signalled when this function is called. + */ + void (*resume)(struct xe_exec_queue *q); + /** @reset_status: check exec queue reset status */ + bool (*reset_status)(struct xe_exec_queue *q); +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_execlist.c b/drivers/gpu/drm/xe/xe_execlist.c new file mode 100644 index 000000000000..96b5224eb478 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_execlist.c @@ -0,0 +1,474 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include "xe_execlist.h" + +#include <drm/drm_managed.h> + +#include "instructions/xe_mi_commands.h" +#include "regs/xe_engine_regs.h" +#include "regs/xe_gpu_commands.h" +#include "regs/xe_gt_regs.h" +#include "regs/xe_lrc_layout.h" +#include "xe_assert.h" +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_exec_queue.h" +#include "xe_gt.h" +#include "xe_hw_fence.h" +#include "xe_lrc.h" +#include "xe_macros.h" +#include "xe_mmio.h" +#include "xe_mocs.h" +#include "xe_ring_ops_types.h" +#include "xe_sched_job.h" + +#define XE_EXECLIST_HANG_LIMIT 1 + +#define SW_CTX_ID_SHIFT 37 +#define SW_CTX_ID_WIDTH 11 +#define XEHP_SW_CTX_ID_SHIFT 39 +#define XEHP_SW_CTX_ID_WIDTH 16 + +#define SW_CTX_ID \ + GENMASK_ULL(SW_CTX_ID_WIDTH + SW_CTX_ID_SHIFT - 1, \ + SW_CTX_ID_SHIFT) + +#define XEHP_SW_CTX_ID \ + GENMASK_ULL(XEHP_SW_CTX_ID_WIDTH + XEHP_SW_CTX_ID_SHIFT - 1, \ + XEHP_SW_CTX_ID_SHIFT) + + +static void __start_lrc(struct xe_hw_engine *hwe, struct xe_lrc *lrc, + u32 ctx_id) +{ + struct xe_gt *gt = hwe->gt; + struct xe_device *xe = gt_to_xe(gt); + u64 lrc_desc; + + lrc_desc = xe_lrc_descriptor(lrc); + + if (GRAPHICS_VERx100(xe) >= 1250) { + xe_gt_assert(hwe->gt, FIELD_FIT(XEHP_SW_CTX_ID, ctx_id)); + lrc_desc |= FIELD_PREP(XEHP_SW_CTX_ID, ctx_id); + } else { + xe_gt_assert(hwe->gt, FIELD_FIT(SW_CTX_ID, ctx_id)); + lrc_desc |= FIELD_PREP(SW_CTX_ID, ctx_id); + } + + if (hwe->class == XE_ENGINE_CLASS_COMPUTE) + xe_mmio_write32(hwe->gt, RCU_MODE, + _MASKED_BIT_ENABLE(RCU_MODE_CCS_ENABLE)); + + xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail); + lrc->ring.old_tail = lrc->ring.tail; + + /* + * Make sure the context image is complete before we submit it to HW. + * + * Ostensibly, writes (including the WCB) should be flushed prior to + * an uncached write such as our mmio register access, the empirical + * evidence (esp. on Braswell) suggests that the WC write into memory + * may not be visible to the HW prior to the completion of the UC + * register write and that we may begin execution from the context + * before its image is complete leading to invalid PD chasing. 
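+ * Hence the wmb() below, issued before the MMIO writes that hand the
+ * context over to the hardware.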
+ */ + wmb(); + + xe_mmio_write32(gt, RING_HWS_PGA(hwe->mmio_base), + xe_bo_ggtt_addr(hwe->hwsp)); + xe_mmio_read32(gt, RING_HWS_PGA(hwe->mmio_base)); + xe_mmio_write32(gt, RING_MODE(hwe->mmio_base), + _MASKED_BIT_ENABLE(GFX_DISABLE_LEGACY_MODE)); + + xe_mmio_write32(gt, RING_EXECLIST_SQ_CONTENTS_LO(hwe->mmio_base), + lower_32_bits(lrc_desc)); + xe_mmio_write32(gt, RING_EXECLIST_SQ_CONTENTS_HI(hwe->mmio_base), + upper_32_bits(lrc_desc)); + xe_mmio_write32(gt, RING_EXECLIST_CONTROL(hwe->mmio_base), + EL_CTRL_LOAD); +} + +static void __xe_execlist_port_start(struct xe_execlist_port *port, + struct xe_execlist_exec_queue *exl) +{ + struct xe_device *xe = gt_to_xe(port->hwe->gt); + int max_ctx = FIELD_MAX(SW_CTX_ID); + + if (GRAPHICS_VERx100(xe) >= 1250) + max_ctx = FIELD_MAX(XEHP_SW_CTX_ID); + + xe_execlist_port_assert_held(port); + + if (port->running_exl != exl || !exl->has_run) { + port->last_ctx_id++; + + /* 0 is reserved for the kernel context */ + if (port->last_ctx_id > max_ctx) + port->last_ctx_id = 1; + } + + __start_lrc(port->hwe, exl->q->lrc, port->last_ctx_id); + port->running_exl = exl; + exl->has_run = true; +} + +static void __xe_execlist_port_idle(struct xe_execlist_port *port) +{ + u32 noop[2] = { MI_NOOP, MI_NOOP }; + + xe_execlist_port_assert_held(port); + + if (!port->running_exl) + return; + + xe_lrc_write_ring(&port->hwe->kernel_lrc, noop, sizeof(noop)); + __start_lrc(port->hwe, &port->hwe->kernel_lrc, 0); + port->running_exl = NULL; +} + +static bool xe_execlist_is_idle(struct xe_execlist_exec_queue *exl) +{ + struct xe_lrc *lrc = exl->q->lrc; + + return lrc->ring.tail == lrc->ring.old_tail; +} + +static void __xe_execlist_port_start_next_active(struct xe_execlist_port *port) +{ + struct xe_execlist_exec_queue *exl = NULL; + int i; + + xe_execlist_port_assert_held(port); + + for (i = ARRAY_SIZE(port->active) - 1; i >= 0; i--) { + while (!list_empty(&port->active[i])) { + exl = list_first_entry(&port->active[i], + struct xe_execlist_exec_queue, + active_link); + list_del(&exl->active_link); + + if (xe_execlist_is_idle(exl)) { + exl->active_priority = XE_EXEC_QUEUE_PRIORITY_UNSET; + continue; + } + + list_add_tail(&exl->active_link, &port->active[i]); + __xe_execlist_port_start(port, exl); + return; + } + } + + __xe_execlist_port_idle(port); +} + +static u64 read_execlist_status(struct xe_hw_engine *hwe) +{ + struct xe_gt *gt = hwe->gt; + u32 hi, lo; + + lo = xe_mmio_read32(gt, RING_EXECLIST_STATUS_LO(hwe->mmio_base)); + hi = xe_mmio_read32(gt, RING_EXECLIST_STATUS_HI(hwe->mmio_base)); + + return lo | (u64)hi << 32; +} + +static void xe_execlist_port_irq_handler_locked(struct xe_execlist_port *port) +{ + u64 status; + + xe_execlist_port_assert_held(port); + + status = read_execlist_status(port->hwe); + if (status & BIT(7)) + return; + + __xe_execlist_port_start_next_active(port); +} + +static void xe_execlist_port_irq_handler(struct xe_hw_engine *hwe, + u16 intr_vec) +{ + struct xe_execlist_port *port = hwe->exl_port; + + spin_lock(&port->lock); + xe_execlist_port_irq_handler_locked(port); + spin_unlock(&port->lock); +} + +static void xe_execlist_port_wake_locked(struct xe_execlist_port *port, + enum xe_exec_queue_priority priority) +{ + xe_execlist_port_assert_held(port); + + if (port->running_exl && port->running_exl->active_priority >= priority) + return; + + __xe_execlist_port_start_next_active(port); +} + +static void xe_execlist_make_active(struct xe_execlist_exec_queue *exl) +{ + struct xe_execlist_port *port = exl->port; + enum xe_exec_queue_priority priority = 
exl->active_priority; + + XE_WARN_ON(priority == XE_EXEC_QUEUE_PRIORITY_UNSET); + XE_WARN_ON(priority < 0); + XE_WARN_ON(priority >= ARRAY_SIZE(exl->port->active)); + + spin_lock_irq(&port->lock); + + if (exl->active_priority != priority && + exl->active_priority != XE_EXEC_QUEUE_PRIORITY_UNSET) { + /* Priority changed, move it to the right list */ + list_del(&exl->active_link); + exl->active_priority = XE_EXEC_QUEUE_PRIORITY_UNSET; + } + + if (exl->active_priority == XE_EXEC_QUEUE_PRIORITY_UNSET) { + exl->active_priority = priority; + list_add_tail(&exl->active_link, &port->active[priority]); + } + + xe_execlist_port_wake_locked(exl->port, priority); + + spin_unlock_irq(&port->lock); +} + +static void xe_execlist_port_irq_fail_timer(struct timer_list *timer) +{ + struct xe_execlist_port *port = + container_of(timer, struct xe_execlist_port, irq_fail); + + spin_lock_irq(&port->lock); + xe_execlist_port_irq_handler_locked(port); + spin_unlock_irq(&port->lock); + + port->irq_fail.expires = jiffies + msecs_to_jiffies(1000); + add_timer(&port->irq_fail); +} + +struct xe_execlist_port *xe_execlist_port_create(struct xe_device *xe, + struct xe_hw_engine *hwe) +{ + struct drm_device *drm = &xe->drm; + struct xe_execlist_port *port; + int i; + + port = drmm_kzalloc(drm, sizeof(*port), GFP_KERNEL); + if (!port) + return ERR_PTR(-ENOMEM); + + port->hwe = hwe; + + spin_lock_init(&port->lock); + for (i = 0; i < ARRAY_SIZE(port->active); i++) + INIT_LIST_HEAD(&port->active[i]); + + port->last_ctx_id = 1; + port->running_exl = NULL; + + hwe->irq_handler = xe_execlist_port_irq_handler; + + /* TODO: Fix the interrupt code so it doesn't race like mad */ + timer_setup(&port->irq_fail, xe_execlist_port_irq_fail_timer, 0); + port->irq_fail.expires = jiffies + msecs_to_jiffies(1000); + add_timer(&port->irq_fail); + + return port; +} + +void xe_execlist_port_destroy(struct xe_execlist_port *port) +{ + del_timer(&port->irq_fail); + + /* Prevent an interrupt while we're destroying */ + spin_lock_irq(>_to_xe(port->hwe->gt)->irq.lock); + port->hwe->irq_handler = NULL; + spin_unlock_irq(>_to_xe(port->hwe->gt)->irq.lock); +} + +static struct dma_fence * +execlist_run_job(struct drm_sched_job *drm_job) +{ + struct xe_sched_job *job = to_xe_sched_job(drm_job); + struct xe_exec_queue *q = job->q; + struct xe_execlist_exec_queue *exl = job->q->execlist; + + q->ring_ops->emit_job(job); + xe_execlist_make_active(exl); + + return dma_fence_get(job->fence); +} + +static void execlist_job_free(struct drm_sched_job *drm_job) +{ + struct xe_sched_job *job = to_xe_sched_job(drm_job); + + xe_sched_job_put(job); +} + +static const struct drm_sched_backend_ops drm_sched_ops = { + .run_job = execlist_run_job, + .free_job = execlist_job_free, +}; + +static int execlist_exec_queue_init(struct xe_exec_queue *q) +{ + struct drm_gpu_scheduler *sched; + struct xe_execlist_exec_queue *exl; + struct xe_device *xe = gt_to_xe(q->gt); + int err; + + xe_assert(xe, !xe_device_uc_enabled(xe)); + + drm_info(&xe->drm, "Enabling execlist submission (GuC submission disabled)\n"); + + exl = kzalloc(sizeof(*exl), GFP_KERNEL); + if (!exl) + return -ENOMEM; + + exl->q = q; + + err = drm_sched_init(&exl->sched, &drm_sched_ops, NULL, 1, + q->lrc[0].ring.size / MAX_JOB_SIZE_BYTES, + XE_SCHED_HANG_LIMIT, XE_SCHED_JOB_TIMEOUT, + NULL, NULL, q->hwe->name, + gt_to_xe(q->gt)->drm.dev); + if (err) + goto err_free; + + sched = &exl->sched; + err = drm_sched_entity_init(&exl->entity, 0, &sched, 1, NULL); + if (err) + goto err_sched; + + exl->port = 
q->hwe->exl_port; + exl->has_run = false; + exl->active_priority = XE_EXEC_QUEUE_PRIORITY_UNSET; + q->execlist = exl; + q->entity = &exl->entity; + + xe_exec_queue_assign_name(q, ffs(q->logical_mask) - 1); + + return 0; + +err_sched: + drm_sched_fini(&exl->sched); +err_free: + kfree(exl); + return err; +} + +static void execlist_exec_queue_fini_async(struct work_struct *w) +{ + struct xe_execlist_exec_queue *ee = + container_of(w, struct xe_execlist_exec_queue, fini_async); + struct xe_exec_queue *q = ee->q; + struct xe_execlist_exec_queue *exl = q->execlist; + struct xe_device *xe = gt_to_xe(q->gt); + unsigned long flags; + + xe_assert(xe, !xe_device_uc_enabled(xe)); + + spin_lock_irqsave(&exl->port->lock, flags); + if (WARN_ON(exl->active_priority != XE_EXEC_QUEUE_PRIORITY_UNSET)) + list_del(&exl->active_link); + spin_unlock_irqrestore(&exl->port->lock, flags); + + if (q->flags & EXEC_QUEUE_FLAG_PERSISTENT) + xe_device_remove_persistent_exec_queues(xe, q); + drm_sched_entity_fini(&exl->entity); + drm_sched_fini(&exl->sched); + kfree(exl); + + xe_exec_queue_fini(q); +} + +static void execlist_exec_queue_kill(struct xe_exec_queue *q) +{ + /* NIY */ +} + +static void execlist_exec_queue_fini(struct xe_exec_queue *q) +{ + INIT_WORK(&q->execlist->fini_async, execlist_exec_queue_fini_async); + queue_work(system_unbound_wq, &q->execlist->fini_async); +} + +static int execlist_exec_queue_set_priority(struct xe_exec_queue *q, + enum xe_exec_queue_priority priority) +{ + /* NIY */ + return 0; +} + +static int execlist_exec_queue_set_timeslice(struct xe_exec_queue *q, u32 timeslice_us) +{ + /* NIY */ + return 0; +} + +static int execlist_exec_queue_set_preempt_timeout(struct xe_exec_queue *q, + u32 preempt_timeout_us) +{ + /* NIY */ + return 0; +} + +static int execlist_exec_queue_set_job_timeout(struct xe_exec_queue *q, + u32 job_timeout_ms) +{ + /* NIY */ + return 0; +} + +static int execlist_exec_queue_suspend(struct xe_exec_queue *q) +{ + /* NIY */ + return 0; +} + +static void execlist_exec_queue_suspend_wait(struct xe_exec_queue *q) + +{ + /* NIY */ +} + +static void execlist_exec_queue_resume(struct xe_exec_queue *q) +{ + /* NIY */ +} + +static bool execlist_exec_queue_reset_status(struct xe_exec_queue *q) +{ + /* NIY */ + return false; +} + +static const struct xe_exec_queue_ops execlist_exec_queue_ops = { + .init = execlist_exec_queue_init, + .kill = execlist_exec_queue_kill, + .fini = execlist_exec_queue_fini, + .set_priority = execlist_exec_queue_set_priority, + .set_timeslice = execlist_exec_queue_set_timeslice, + .set_preempt_timeout = execlist_exec_queue_set_preempt_timeout, + .set_job_timeout = execlist_exec_queue_set_job_timeout, + .suspend = execlist_exec_queue_suspend, + .suspend_wait = execlist_exec_queue_suspend_wait, + .resume = execlist_exec_queue_resume, + .reset_status = execlist_exec_queue_reset_status, +}; + +int xe_execlist_init(struct xe_gt *gt) +{ + /* GuC submission enabled, nothing to do */ + if (xe_device_uc_enabled(gt_to_xe(gt))) + return 0; + + gt->exec_queue_ops = &execlist_exec_queue_ops; + + return 0; +} diff --git a/drivers/gpu/drm/xe/xe_execlist.h b/drivers/gpu/drm/xe/xe_execlist.h new file mode 100644 index 000000000000..26f600ac8552 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_execlist.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef _XE_EXECLIST_H_ +#define _XE_EXECLIST_H_ + +#include "xe_execlist_types.h" + +struct xe_device; +struct xe_gt; + +#define xe_execlist_port_assert_held(port) 
lockdep_assert_held(&(port)->lock) + +int xe_execlist_init(struct xe_gt *gt); +struct xe_execlist_port *xe_execlist_port_create(struct xe_device *xe, + struct xe_hw_engine *hwe); +void xe_execlist_port_destroy(struct xe_execlist_port *port); + +#endif diff --git a/drivers/gpu/drm/xe/xe_execlist_types.h b/drivers/gpu/drm/xe/xe_execlist_types.h new file mode 100644 index 000000000000..f94bbf4c53e4 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_execlist_types.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_EXECLIST_TYPES_H_ +#define _XE_EXECLIST_TYPES_H_ + +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/workqueue.h> + +#include "xe_exec_queue_types.h" + +struct xe_hw_engine; +struct xe_execlist_exec_queue; + +struct xe_execlist_port { + struct xe_hw_engine *hwe; + + spinlock_t lock; + + struct list_head active[XE_EXEC_QUEUE_PRIORITY_COUNT]; + + u32 last_ctx_id; + + struct xe_execlist_exec_queue *running_exl; + + struct timer_list irq_fail; +}; + +struct xe_execlist_exec_queue { + struct xe_exec_queue *q; + + struct drm_gpu_scheduler sched; + + struct drm_sched_entity entity; + + struct xe_execlist_port *port; + + bool has_run; + + struct work_struct fini_async; + + enum xe_exec_queue_priority active_priority; + struct list_head active_link; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_force_wake.c b/drivers/gpu/drm/xe/xe_force_wake.c new file mode 100644 index 000000000000..9bbe8a5040da --- /dev/null +++ b/drivers/gpu/drm/xe/xe_force_wake.c @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_force_wake.h" + +#include <drm/drm_util.h> + +#include "regs/xe_gt_regs.h" +#include "regs/xe_reg_defs.h" +#include "xe_gt.h" +#include "xe_mmio.h" + +#define XE_FORCE_WAKE_ACK_TIMEOUT_MS 50 + +static struct xe_gt * +fw_to_gt(struct xe_force_wake *fw) +{ + return fw->gt; +} + +static struct xe_device * +fw_to_xe(struct xe_force_wake *fw) +{ + return gt_to_xe(fw_to_gt(fw)); +} + +static void domain_init(struct xe_force_wake_domain *domain, + enum xe_force_wake_domain_id id, + struct xe_reg reg, struct xe_reg ack, u32 val, u32 mask) +{ + domain->id = id; + domain->reg_ctl = reg; + domain->reg_ack = ack; + domain->val = val; + domain->mask = mask; +} + +void xe_force_wake_init_gt(struct xe_gt *gt, struct xe_force_wake *fw) +{ + struct xe_device *xe = gt_to_xe(gt); + + fw->gt = gt; + spin_lock_init(&fw->lock); + + /* Assuming gen11+ so assert this assumption is correct */ + xe_gt_assert(gt, GRAPHICS_VER(gt_to_xe(gt)) >= 11); + + if (xe->info.graphics_verx100 >= 1270) { + domain_init(&fw->domains[XE_FW_DOMAIN_ID_GT], + XE_FW_DOMAIN_ID_GT, + FORCEWAKE_GT, + FORCEWAKE_ACK_GT_MTL, + BIT(0), BIT(16)); + } else { + domain_init(&fw->domains[XE_FW_DOMAIN_ID_GT], + XE_FW_DOMAIN_ID_GT, + FORCEWAKE_GT, + FORCEWAKE_ACK_GT, + BIT(0), BIT(16)); + } +} + +void xe_force_wake_init_engines(struct xe_gt *gt, struct xe_force_wake *fw) +{ + int i, j; + + /* Assuming gen11+ so assert this assumption is correct */ + xe_gt_assert(gt, GRAPHICS_VER(gt_to_xe(gt)) >= 11); + + if (!xe_gt_is_media_type(gt)) + domain_init(&fw->domains[XE_FW_DOMAIN_ID_RENDER], + XE_FW_DOMAIN_ID_RENDER, + FORCEWAKE_RENDER, + FORCEWAKE_ACK_RENDER, + BIT(0), BIT(16)); + + for (i = XE_HW_ENGINE_VCS0, j = 0; i <= XE_HW_ENGINE_VCS7; ++i, ++j) { + if (!(gt->info.engine_mask & BIT(i))) + continue; + + domain_init(&fw->domains[XE_FW_DOMAIN_ID_MEDIA_VDBOX0 + j], + XE_FW_DOMAIN_ID_MEDIA_VDBOX0 + j, + 
FORCEWAKE_MEDIA_VDBOX(j), + FORCEWAKE_ACK_MEDIA_VDBOX(j), + BIT(0), BIT(16)); + } + + for (i = XE_HW_ENGINE_VECS0, j = 0; i <= XE_HW_ENGINE_VECS3; ++i, ++j) { + if (!(gt->info.engine_mask & BIT(i))) + continue; + + domain_init(&fw->domains[XE_FW_DOMAIN_ID_MEDIA_VEBOX0 + j], + XE_FW_DOMAIN_ID_MEDIA_VEBOX0 + j, + FORCEWAKE_MEDIA_VEBOX(j), + FORCEWAKE_ACK_MEDIA_VEBOX(j), + BIT(0), BIT(16)); + } + + if (gt->info.engine_mask & BIT(XE_HW_ENGINE_GSCCS0)) + domain_init(&fw->domains[XE_FW_DOMAIN_ID_GSC], + XE_FW_DOMAIN_ID_GSC, + FORCEWAKE_GSC, + FORCEWAKE_ACK_GSC, + BIT(0), BIT(16)); +} + +static void domain_wake(struct xe_gt *gt, struct xe_force_wake_domain *domain) +{ + xe_mmio_write32(gt, domain->reg_ctl, domain->mask | domain->val); +} + +static int domain_wake_wait(struct xe_gt *gt, + struct xe_force_wake_domain *domain) +{ + return xe_mmio_wait32(gt, domain->reg_ack, domain->val, domain->val, + XE_FORCE_WAKE_ACK_TIMEOUT_MS * USEC_PER_MSEC, + NULL, true); +} + +static void domain_sleep(struct xe_gt *gt, struct xe_force_wake_domain *domain) +{ + xe_mmio_write32(gt, domain->reg_ctl, domain->mask); +} + +static int domain_sleep_wait(struct xe_gt *gt, + struct xe_force_wake_domain *domain) +{ + return xe_mmio_wait32(gt, domain->reg_ack, domain->val, 0, + XE_FORCE_WAKE_ACK_TIMEOUT_MS * USEC_PER_MSEC, + NULL, true); +} + +#define for_each_fw_domain_masked(domain__, mask__, fw__, tmp__) \ + for (tmp__ = (mask__); tmp__; tmp__ &= ~BIT(ffs(tmp__) - 1)) \ + for_each_if((domain__ = ((fw__)->domains + \ + (ffs(tmp__) - 1))) && \ + domain__->reg_ctl.addr) + +int xe_force_wake_get(struct xe_force_wake *fw, + enum xe_force_wake_domains domains) +{ + struct xe_device *xe = fw_to_xe(fw); + struct xe_gt *gt = fw_to_gt(fw); + struct xe_force_wake_domain *domain; + enum xe_force_wake_domains tmp, woken = 0; + unsigned long flags; + int ret, ret2 = 0; + + spin_lock_irqsave(&fw->lock, flags); + for_each_fw_domain_masked(domain, domains, fw, tmp) { + if (!domain->ref++) { + woken |= BIT(domain->id); + domain_wake(gt, domain); + } + } + for_each_fw_domain_masked(domain, woken, fw, tmp) { + ret = domain_wake_wait(gt, domain); + ret2 |= ret; + if (ret) + drm_notice(&xe->drm, "Force wake domain (%d) failed to ack wake, ret=%d\n", + domain->id, ret); + } + fw->awake_domains |= woken; + spin_unlock_irqrestore(&fw->lock, flags); + + return ret2; +} + +int xe_force_wake_put(struct xe_force_wake *fw, + enum xe_force_wake_domains domains) +{ + struct xe_device *xe = fw_to_xe(fw); + struct xe_gt *gt = fw_to_gt(fw); + struct xe_force_wake_domain *domain; + enum xe_force_wake_domains tmp, sleep = 0; + unsigned long flags; + int ret, ret2 = 0; + + spin_lock_irqsave(&fw->lock, flags); + for_each_fw_domain_masked(domain, domains, fw, tmp) { + if (!--domain->ref) { + sleep |= BIT(domain->id); + domain_sleep(gt, domain); + } + } + for_each_fw_domain_masked(domain, sleep, fw, tmp) { + ret = domain_sleep_wait(gt, domain); + ret2 |= ret; + if (ret) + drm_notice(&xe->drm, "Force wake domain (%d) failed to ack sleep, ret=%d\n", + domain->id, ret); + } + fw->awake_domains &= ~sleep; + spin_unlock_irqrestore(&fw->lock, flags); + + return ret2; +} diff --git a/drivers/gpu/drm/xe/xe_force_wake.h b/drivers/gpu/drm/xe/xe_force_wake.h new file mode 100644 index 000000000000..83cb157da7cc --- /dev/null +++ b/drivers/gpu/drm/xe/xe_force_wake.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_FORCE_WAKE_H_ +#define _XE_FORCE_WAKE_H_ + +#include "xe_assert.h" +#include 
"xe_force_wake_types.h" + +struct xe_gt; + +void xe_force_wake_init_gt(struct xe_gt *gt, + struct xe_force_wake *fw); +void xe_force_wake_init_engines(struct xe_gt *gt, + struct xe_force_wake *fw); +int xe_force_wake_get(struct xe_force_wake *fw, + enum xe_force_wake_domains domains); +int xe_force_wake_put(struct xe_force_wake *fw, + enum xe_force_wake_domains domains); + +static inline int +xe_force_wake_ref(struct xe_force_wake *fw, + enum xe_force_wake_domains domain) +{ + xe_gt_assert(fw->gt, domain); + return fw->domains[ffs(domain) - 1].ref; +} + +static inline void +xe_force_wake_assert_held(struct xe_force_wake *fw, + enum xe_force_wake_domains domain) +{ + xe_gt_assert(fw->gt, fw->awake_domains & domain); +} + +#endif diff --git a/drivers/gpu/drm/xe/xe_force_wake_types.h b/drivers/gpu/drm/xe/xe_force_wake_types.h new file mode 100644 index 000000000000..ed0edc2cdf9f --- /dev/null +++ b/drivers/gpu/drm/xe/xe_force_wake_types.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_FORCE_WAKE_TYPES_H_ +#define _XE_FORCE_WAKE_TYPES_H_ + +#include <linux/mutex.h> +#include <linux/types.h> + +#include "regs/xe_reg_defs.h" + +enum xe_force_wake_domain_id { + XE_FW_DOMAIN_ID_GT = 0, + XE_FW_DOMAIN_ID_RENDER, + XE_FW_DOMAIN_ID_MEDIA, + XE_FW_DOMAIN_ID_MEDIA_VDBOX0, + XE_FW_DOMAIN_ID_MEDIA_VDBOX1, + XE_FW_DOMAIN_ID_MEDIA_VDBOX2, + XE_FW_DOMAIN_ID_MEDIA_VDBOX3, + XE_FW_DOMAIN_ID_MEDIA_VDBOX4, + XE_FW_DOMAIN_ID_MEDIA_VDBOX5, + XE_FW_DOMAIN_ID_MEDIA_VDBOX6, + XE_FW_DOMAIN_ID_MEDIA_VDBOX7, + XE_FW_DOMAIN_ID_MEDIA_VEBOX0, + XE_FW_DOMAIN_ID_MEDIA_VEBOX1, + XE_FW_DOMAIN_ID_MEDIA_VEBOX2, + XE_FW_DOMAIN_ID_MEDIA_VEBOX3, + XE_FW_DOMAIN_ID_GSC, + XE_FW_DOMAIN_ID_COUNT +}; + +enum xe_force_wake_domains { + XE_FW_GT = BIT(XE_FW_DOMAIN_ID_GT), + XE_FW_RENDER = BIT(XE_FW_DOMAIN_ID_RENDER), + XE_FW_MEDIA = BIT(XE_FW_DOMAIN_ID_MEDIA), + XE_FW_MEDIA_VDBOX0 = BIT(XE_FW_DOMAIN_ID_MEDIA_VDBOX0), + XE_FW_MEDIA_VDBOX1 = BIT(XE_FW_DOMAIN_ID_MEDIA_VDBOX1), + XE_FW_MEDIA_VDBOX2 = BIT(XE_FW_DOMAIN_ID_MEDIA_VDBOX2), + XE_FW_MEDIA_VDBOX3 = BIT(XE_FW_DOMAIN_ID_MEDIA_VDBOX3), + XE_FW_MEDIA_VDBOX4 = BIT(XE_FW_DOMAIN_ID_MEDIA_VDBOX4), + XE_FW_MEDIA_VDBOX5 = BIT(XE_FW_DOMAIN_ID_MEDIA_VDBOX5), + XE_FW_MEDIA_VDBOX6 = BIT(XE_FW_DOMAIN_ID_MEDIA_VDBOX6), + XE_FW_MEDIA_VDBOX7 = BIT(XE_FW_DOMAIN_ID_MEDIA_VDBOX7), + XE_FW_MEDIA_VEBOX0 = BIT(XE_FW_DOMAIN_ID_MEDIA_VEBOX0), + XE_FW_MEDIA_VEBOX1 = BIT(XE_FW_DOMAIN_ID_MEDIA_VEBOX1), + XE_FW_MEDIA_VEBOX2 = BIT(XE_FW_DOMAIN_ID_MEDIA_VEBOX2), + XE_FW_MEDIA_VEBOX3 = BIT(XE_FW_DOMAIN_ID_MEDIA_VEBOX3), + XE_FW_GSC = BIT(XE_FW_DOMAIN_ID_GSC), + XE_FORCEWAKE_ALL = BIT(XE_FW_DOMAIN_ID_COUNT) - 1 +}; + +/** + * struct xe_force_wake_domain - XE force wake domains + */ +struct xe_force_wake_domain { + /** @id: domain force wake id */ + enum xe_force_wake_domain_id id; + /** @reg_ctl: domain wake control register address */ + struct xe_reg reg_ctl; + /** @reg_ack: domain ack register address */ + struct xe_reg reg_ack; + /** @val: domain wake write value */ + u32 val; + /** @mask: domain mask */ + u32 mask; + /** @ref: domain reference */ + u32 ref; +}; + +/** + * struct xe_force_wake - XE force wake + */ +struct xe_force_wake { + /** @gt: back pointers to GT */ + struct xe_gt *gt; + /** @lock: protects everything force wake struct */ + spinlock_t lock; + /** @awake_domains: mask of all domains awake */ + enum xe_force_wake_domains awake_domains; + /** @domains: force wake domains */ + struct xe_force_wake_domain 
domains[XE_FW_DOMAIN_ID_COUNT]; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_gen_wa_oob.c b/drivers/gpu/drm/xe/xe_gen_wa_oob.c new file mode 100644 index 000000000000..106ee2b027f0 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gen_wa_oob.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#define _GNU_SOURCE +#include <ctype.h> +#include <errno.h> +#include <stdbool.h> +#include <stdio.h> +#include <string.h> + +#define HEADER \ + "// SPDX-License-Identifier: MIT\n" \ + "\n" \ + "/*\n" \ + " * DO NOT MODIFY.\n" \ + " *\n" \ + " * This file was generated from rules: %s\n" \ + " */\n" \ + "#ifndef _GENERATED_XE_WA_OOB_\n" \ + "#define _GENERATED_XE_WA_OOB_\n" \ + "\n" \ + "enum {\n" + +#define FOOTER \ + "};\n" \ + "\n" \ + "#endif\n" + +static void print_usage(FILE *f) +{ + fprintf(f, "usage: %s <input-rule-file> <generated-c-source-file> <generated-c-header-file>\n", + program_invocation_short_name); +} + +static void print_parse_error(const char *err_msg, const char *line, + unsigned int lineno) +{ + fprintf(stderr, "ERROR: %s\nERROR: %u: %.60s\n", + err_msg, lineno, line); +} + +static char *strip(char *line, size_t linelen) +{ + while (isspace(*(line + linelen))) + linelen--; + + line[linelen - 1] = '\0'; + + return line + strspn(line, " \f\n\r\t\v"); +} + +#define MAX_LINE_LEN 4096 +static int parse(FILE *input, FILE *csource, FILE *cheader) +{ + char line[MAX_LINE_LEN + 1]; + char *name, *prev_name = NULL, *rules; + unsigned int lineno = 0, idx = 0; + + while (fgets(line, sizeof(line), input)) { + size_t linelen; + bool is_continuation; + + if (line[0] == '\0' || line[0] == '#' || line[0] == '\n') { + lineno++; + continue; + } + + linelen = strlen(line); + if (linelen == MAX_LINE_LEN) { + print_parse_error("line too long", line, lineno); + return -EINVAL; + } + + is_continuation = isspace(line[0]); + name = strip(line, linelen); + + if (!is_continuation) { + name = strtok(name, " \t"); + rules = strtok(NULL, ""); + } else { + if (!prev_name) { + print_parse_error("invalid rule continuation", + line, lineno); + return -EINVAL; + } + + rules = name; + name = NULL; + } + + if (rules[0] == '\0') { + print_parse_error("invalid empty rule\n", line, lineno); + return -EINVAL; + } + + if (name) { + fprintf(cheader, "\tXE_WA_OOB_%s = %u,\n", name, idx); + fprintf(csource, "{ XE_RTP_NAME(\"%s\"), XE_RTP_RULES(%s) },\n", + name, rules); + } else { + fprintf(csource, "{ XE_RTP_NAME(NULL), XE_RTP_RULES(%s) },\n", + rules); + } + + idx++; + lineno++; + if (!is_continuation) + prev_name = name; + } + + fprintf(cheader, "\t_XE_WA_OOB_COUNT = %u\n", idx); + + return 0; +} + +int main(int argc, const char *argv[]) +{ + enum { + ARGS_INPUT, + ARGS_CSOURCE, + ARGS_CHEADER, + _ARGS_COUNT + }; + struct { + const char *fn; + const char *mode; + FILE *f; + } args[] = { + [ARGS_INPUT] = { .fn = argv[1], .mode = "r" }, + [ARGS_CSOURCE] = { .fn = argv[2], .mode = "w" }, + [ARGS_CHEADER] = { .fn = argv[3], .mode = "w" }, + }; + int ret = 1; + + if (argc < 3) { + fprintf(stderr, "ERROR: wrong arguments\n"); + print_usage(stderr); + return 1; + } + + for (int i = 0; i < _ARGS_COUNT; i++) { + args[i].f = fopen(args[i].fn, args[i].mode); + if (!args[i].f) { + fprintf(stderr, "ERROR: Can't open %s: %m\n", + args[i].fn); + goto err; + } + } + + fprintf(args[ARGS_CHEADER].f, HEADER, args[ARGS_INPUT].fn); + ret = parse(args[ARGS_INPUT].f, args[ARGS_CSOURCE].f, + args[ARGS_CHEADER].f); + if (!ret) + fprintf(args[ARGS_CHEADER].f, FOOTER); + +err: + for (int i = 0; i < 
_ARGS_COUNT; i++) { + if (args[i].f) + fclose(args[i].f); + } + + return ret; +} diff --git a/drivers/gpu/drm/xe/xe_ggtt.c b/drivers/gpu/drm/xe/xe_ggtt.c new file mode 100644 index 000000000000..3efd2d066bf7 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_ggtt.c @@ -0,0 +1,428 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include "xe_ggtt.h" + +#include <linux/sizes.h> + +#include <drm/drm_managed.h> +#include <drm/i915_drm.h> + +#include "regs/xe_gt_regs.h" +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_gt_tlb_invalidation.h" +#include "xe_map.h" +#include "xe_mmio.h" +#include "xe_wopcm.h" + +#define XELPG_GGTT_PTE_PAT0 BIT_ULL(52) +#define XELPG_GGTT_PTE_PAT1 BIT_ULL(53) + +/* GuC addresses above GUC_GGTT_TOP also don't map through the GTT */ +#define GUC_GGTT_TOP 0xFEE00000 + +static u64 xelp_ggtt_pte_encode_bo(struct xe_bo *bo, u64 bo_offset, + u16 pat_index) +{ + u64 pte; + + pte = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE); + pte |= XE_PAGE_PRESENT; + + if (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo)) + pte |= XE_GGTT_PTE_DM; + + return pte; +} + +static u64 xelpg_ggtt_pte_encode_bo(struct xe_bo *bo, u64 bo_offset, + u16 pat_index) +{ + struct xe_device *xe = xe_bo_device(bo); + u64 pte; + + pte = xelp_ggtt_pte_encode_bo(bo, bo_offset, pat_index); + + xe_assert(xe, pat_index <= 3); + + if (pat_index & BIT(0)) + pte |= XELPG_GGTT_PTE_PAT0; + + if (pat_index & BIT(1)) + pte |= XELPG_GGTT_PTE_PAT1; + + return pte; +} + +static unsigned int probe_gsm_size(struct pci_dev *pdev) +{ + u16 gmch_ctl, ggms; + + pci_read_config_word(pdev, SNB_GMCH_CTRL, &gmch_ctl); + ggms = (gmch_ctl >> BDW_GMCH_GGMS_SHIFT) & BDW_GMCH_GGMS_MASK; + return ggms ? SZ_1M << ggms : 0; +} + +void xe_ggtt_set_pte(struct xe_ggtt *ggtt, u64 addr, u64 pte) +{ + xe_tile_assert(ggtt->tile, !(addr & XE_PTE_MASK)); + xe_tile_assert(ggtt->tile, addr < ggtt->size); + + writeq(pte, &ggtt->gsm[addr >> XE_PTE_SHIFT]); +} + +static void xe_ggtt_clear(struct xe_ggtt *ggtt, u64 start, u64 size) +{ + u16 pat_index = tile_to_xe(ggtt->tile)->pat.idx[XE_CACHE_WB]; + u64 end = start + size - 1; + u64 scratch_pte; + + xe_tile_assert(ggtt->tile, start < end); + + if (ggtt->scratch) + scratch_pte = ggtt->pt_ops->pte_encode_bo(ggtt->scratch, 0, + pat_index); + else + scratch_pte = 0; + + while (start < end) { + xe_ggtt_set_pte(ggtt, start, scratch_pte); + start += XE_PAGE_SIZE; + } +} + +static void ggtt_fini_early(struct drm_device *drm, void *arg) +{ + struct xe_ggtt *ggtt = arg; + + mutex_destroy(&ggtt->lock); + drm_mm_takedown(&ggtt->mm); +} + +static void ggtt_fini(struct drm_device *drm, void *arg) +{ + struct xe_ggtt *ggtt = arg; + + ggtt->scratch = NULL; +} + +static void primelockdep(struct xe_ggtt *ggtt) +{ + if (!IS_ENABLED(CONFIG_LOCKDEP)) + return; + + fs_reclaim_acquire(GFP_KERNEL); + might_lock(&ggtt->lock); + fs_reclaim_release(GFP_KERNEL); +} + +static const struct xe_ggtt_pt_ops xelp_pt_ops = { + .pte_encode_bo = xelp_ggtt_pte_encode_bo, +}; + +static const struct xe_ggtt_pt_ops xelpg_pt_ops = { + .pte_encode_bo = xelpg_ggtt_pte_encode_bo, +}; + +/* + * Early GGTT initialization, which allows to create new mappings usable by the + * GuC. + * Mappings are not usable by the HW engines, as it doesn't have scratch / + * initial clear done to it yet. That will happen in the regular, non-early + * GGTT init. 
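+ * (that is, xe_ggtt_init(), which allocates the scratch BO and clears the
+ * unused GGTT ranges).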
+ */ +int xe_ggtt_init_early(struct xe_ggtt *ggtt) +{ + struct xe_device *xe = tile_to_xe(ggtt->tile); + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + unsigned int gsm_size; + + gsm_size = probe_gsm_size(pdev); + if (gsm_size == 0) { + drm_err(&xe->drm, "Hardware reported no preallocated GSM\n"); + return -ENOMEM; + } + + ggtt->gsm = ggtt->tile->mmio.regs + SZ_8M; + ggtt->size = (gsm_size / 8) * (u64) XE_PAGE_SIZE; + + if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K) + ggtt->flags |= XE_GGTT_FLAGS_64K; + + /* + * 8B per entry, each points to a 4KB page. + * + * The GuC address space is limited on both ends of the GGTT, because + * the GuC shim HW redirects accesses to those addresses to other HW + * areas instead of going through the GGTT. On the bottom end, the GuC + * can't access offsets below the WOPCM size, while on the top side the + * limit is fixed at GUC_GGTT_TOP. To keep things simple, instead of + * checking each object to see if it is accessed by GuC or not, we + * just exclude those areas from the allocator. Additionally, to + * simplify the driver load, we use the maximum WOPCM size in this logic + * instead of the programmed one, so we don't need to wait until the + * actual size to be programmed is determined (which requires FW fetch) + * before initializing the GGTT. These simplifications might waste space + * in the GGTT (about 20-25 MBs depending on the platform) but we can + * live with this. + * + * Another benefit of this is the GuC bootrom can't access anything + * below the WOPCM max size so anything the bootrom needs to access (e.g. + * an RSA key) needs to be placed in the GGTT above the WOPCM max size. + * Starting the GGTT allocations above the WOPCM max gives us the correct + * placement for free. + */ + if (ggtt->size > GUC_GGTT_TOP) + ggtt->size = GUC_GGTT_TOP; + + if (GRAPHICS_VERx100(xe) >= 1270) + ggtt->pt_ops = &xelpg_pt_ops; + else + ggtt->pt_ops = &xelp_pt_ops; + + drm_mm_init(&ggtt->mm, xe_wopcm_size(xe), + ggtt->size - xe_wopcm_size(xe)); + mutex_init(&ggtt->lock); + primelockdep(ggtt); + + return drmm_add_action_or_reset(&xe->drm, ggtt_fini_early, ggtt); +} + +static void xe_ggtt_initial_clear(struct xe_ggtt *ggtt) +{ + struct drm_mm_node *hole; + u64 start, end; + + /* Display may have allocated inside ggtt, so be careful with clearing here */ + xe_device_mem_access_get(tile_to_xe(ggtt->tile)); + mutex_lock(&ggtt->lock); + drm_mm_for_each_hole(hole, &ggtt->mm, start, end) + xe_ggtt_clear(ggtt, start, end - start); + + xe_ggtt_invalidate(ggtt); + mutex_unlock(&ggtt->lock); + xe_device_mem_access_put(tile_to_xe(ggtt->tile)); +} + +int xe_ggtt_init(struct xe_ggtt *ggtt) +{ + struct xe_device *xe = tile_to_xe(ggtt->tile); + unsigned int flags; + int err; + + /* + * So we don't need to worry about 64K GGTT layout when dealing with + * scratch entries, rather keep the scratch page in system memory on + * platforms where 64K pages are needed for VRAM.
+ */ + flags = XE_BO_CREATE_PINNED_BIT; + if (ggtt->flags & XE_GGTT_FLAGS_64K) + flags |= XE_BO_CREATE_SYSTEM_BIT; + else + flags |= XE_BO_CREATE_VRAM_IF_DGFX(ggtt->tile); + + ggtt->scratch = xe_managed_bo_create_pin_map(xe, ggtt->tile, XE_PAGE_SIZE, flags); + if (IS_ERR(ggtt->scratch)) { + err = PTR_ERR(ggtt->scratch); + goto err; + } + + xe_map_memset(xe, &ggtt->scratch->vmap, 0, 0, ggtt->scratch->size); + + xe_ggtt_initial_clear(ggtt); + + return drmm_add_action_or_reset(&xe->drm, ggtt_fini, ggtt); +err: + ggtt->scratch = NULL; + return err; +} + +#define GUC_TLB_INV_CR XE_REG(0xcee8) +#define GUC_TLB_INV_CR_INVALIDATE REG_BIT(0) +#define PVC_GUC_TLB_INV_DESC0 XE_REG(0xcf7c) +#define PVC_GUC_TLB_INV_DESC0_VALID REG_BIT(0) +#define PVC_GUC_TLB_INV_DESC1 XE_REG(0xcf80) +#define PVC_GUC_TLB_INV_DESC1_INVALIDATE REG_BIT(6) + +static void ggtt_invalidate_gt_tlb(struct xe_gt *gt) +{ + if (!gt) + return; + + /* + * Invalidation can happen when there's no in-flight work keeping the + * GT awake. We need to explicitly grab forcewake to ensure the GT + * and GuC are accessible. + */ + xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + + /* TODO: vfunc for GuC vs. non-GuC */ + + if (gt->uc.guc.submission_state.enabled) { + int seqno; + + seqno = xe_gt_tlb_invalidation_guc(gt); + xe_gt_assert(gt, seqno > 0); + if (seqno > 0) + xe_gt_tlb_invalidation_wait(gt, seqno); + } else if (xe_device_uc_enabled(gt_to_xe(gt))) { + struct xe_device *xe = gt_to_xe(gt); + + if (xe->info.platform == XE_PVC || GRAPHICS_VER(xe) >= 20) { + xe_mmio_write32(gt, PVC_GUC_TLB_INV_DESC1, + PVC_GUC_TLB_INV_DESC1_INVALIDATE); + xe_mmio_write32(gt, PVC_GUC_TLB_INV_DESC0, + PVC_GUC_TLB_INV_DESC0_VALID); + } else + xe_mmio_write32(gt, GUC_TLB_INV_CR, + GUC_TLB_INV_CR_INVALIDATE); + } + + xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); +} + +void xe_ggtt_invalidate(struct xe_ggtt *ggtt) +{ + /* Each GT in a tile has its own TLB to cache GGTT lookups */ + ggtt_invalidate_gt_tlb(ggtt->tile->primary_gt); + ggtt_invalidate_gt_tlb(ggtt->tile->media_gt); +} + +void xe_ggtt_printk(struct xe_ggtt *ggtt, const char *prefix) +{ + u16 pat_index = tile_to_xe(ggtt->tile)->pat.idx[XE_CACHE_WB]; + u64 addr, scratch_pte; + + scratch_pte = ggtt->pt_ops->pte_encode_bo(ggtt->scratch, 0, pat_index); + + printk("%sGlobal GTT:", prefix); + for (addr = 0; addr < ggtt->size; addr += XE_PAGE_SIZE) { + unsigned int i = addr / XE_PAGE_SIZE; + + xe_tile_assert(ggtt->tile, addr <= U32_MAX); + if (ggtt->gsm[i] == scratch_pte) + continue; + + printk("%s ggtt[0x%08x] = 0x%016llx", + prefix, (u32)addr, ggtt->gsm[i]); + } +} + +int xe_ggtt_insert_special_node_locked(struct xe_ggtt *ggtt, struct drm_mm_node *node, + u32 size, u32 align, u32 mm_flags) +{ + return drm_mm_insert_node_generic(&ggtt->mm, node, size, align, 0, + mm_flags); +} + +int xe_ggtt_insert_special_node(struct xe_ggtt *ggtt, struct drm_mm_node *node, + u32 size, u32 align) +{ + int ret; + + mutex_lock(&ggtt->lock); + ret = xe_ggtt_insert_special_node_locked(ggtt, node, size, + align, DRM_MM_INSERT_HIGH); + mutex_unlock(&ggtt->lock); + + return ret; +} + +void xe_ggtt_map_bo(struct xe_ggtt *ggtt, struct xe_bo *bo) +{ + u16 pat_index = tile_to_xe(ggtt->tile)->pat.idx[XE_CACHE_WB]; + u64 start = bo->ggtt_node.start; + u64 offset, pte; + + for (offset = 0; offset < bo->size; offset += XE_PAGE_SIZE) { + pte = ggtt->pt_ops->pte_encode_bo(bo, offset, pat_index); + xe_ggtt_set_pte(ggtt, start + offset, pte); + } + + xe_ggtt_invalidate(ggtt); +} + +static int __xe_ggtt_insert_bo_at(struct xe_ggtt *ggtt, struct xe_bo 
*bo, + u64 start, u64 end) +{ + int err; + u64 alignment = XE_PAGE_SIZE; + + if (xe_bo_is_vram(bo) && ggtt->flags & XE_GGTT_FLAGS_64K) + alignment = SZ_64K; + + if (XE_WARN_ON(bo->ggtt_node.size)) { + /* Someone's already inserted this BO in the GGTT */ + xe_tile_assert(ggtt->tile, bo->ggtt_node.size == bo->size); + return 0; + } + + err = xe_bo_validate(bo, NULL, false); + if (err) + return err; + + xe_device_mem_access_get(tile_to_xe(ggtt->tile)); + mutex_lock(&ggtt->lock); + err = drm_mm_insert_node_in_range(&ggtt->mm, &bo->ggtt_node, bo->size, + alignment, 0, start, end, 0); + if (!err) + xe_ggtt_map_bo(ggtt, bo); + mutex_unlock(&ggtt->lock); + xe_device_mem_access_put(tile_to_xe(ggtt->tile)); + + return err; +} + +int xe_ggtt_insert_bo_at(struct xe_ggtt *ggtt, struct xe_bo *bo, + u64 start, u64 end) +{ + return __xe_ggtt_insert_bo_at(ggtt, bo, start, end); +} + +int xe_ggtt_insert_bo(struct xe_ggtt *ggtt, struct xe_bo *bo) +{ + return __xe_ggtt_insert_bo_at(ggtt, bo, 0, U64_MAX); +} + +void xe_ggtt_remove_node(struct xe_ggtt *ggtt, struct drm_mm_node *node) +{ + xe_device_mem_access_get(tile_to_xe(ggtt->tile)); + mutex_lock(&ggtt->lock); + + xe_ggtt_clear(ggtt, node->start, node->size); + drm_mm_remove_node(node); + node->size = 0; + + xe_ggtt_invalidate(ggtt); + + mutex_unlock(&ggtt->lock); + xe_device_mem_access_put(tile_to_xe(ggtt->tile)); +} + +void xe_ggtt_remove_bo(struct xe_ggtt *ggtt, struct xe_bo *bo) +{ + if (XE_WARN_ON(!bo->ggtt_node.size)) + return; + + /* This BO is not currently in the GGTT */ + xe_tile_assert(ggtt->tile, bo->ggtt_node.size == bo->size); + + xe_ggtt_remove_node(ggtt, &bo->ggtt_node); +} + +int xe_ggtt_dump(struct xe_ggtt *ggtt, struct drm_printer *p) +{ + int err; + + err = mutex_lock_interruptible(&ggtt->lock); + if (err) + return err; + + drm_mm_print(&ggtt->mm, p); + mutex_unlock(&ggtt->lock); + return err; +} diff --git a/drivers/gpu/drm/xe/xe_ggtt.h b/drivers/gpu/drm/xe/xe_ggtt.h new file mode 100644 index 000000000000..a09c166dff70 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_ggtt.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef _XE_GGTT_H_ +#define _XE_GGTT_H_ + +#include "xe_ggtt_types.h" + +struct drm_printer; + +void xe_ggtt_set_pte(struct xe_ggtt *ggtt, u64 addr, u64 pte); +void xe_ggtt_invalidate(struct xe_ggtt *ggtt); +int xe_ggtt_init_early(struct xe_ggtt *ggtt); +int xe_ggtt_init(struct xe_ggtt *ggtt); +void xe_ggtt_printk(struct xe_ggtt *ggtt, const char *prefix); + +int xe_ggtt_insert_special_node(struct xe_ggtt *ggtt, struct drm_mm_node *node, + u32 size, u32 align); +int xe_ggtt_insert_special_node_locked(struct xe_ggtt *ggtt, + struct drm_mm_node *node, + u32 size, u32 align, u32 mm_flags); +void xe_ggtt_remove_node(struct xe_ggtt *ggtt, struct drm_mm_node *node); +void xe_ggtt_map_bo(struct xe_ggtt *ggtt, struct xe_bo *bo); +int xe_ggtt_insert_bo(struct xe_ggtt *ggtt, struct xe_bo *bo); +int xe_ggtt_insert_bo_at(struct xe_ggtt *ggtt, struct xe_bo *bo, + u64 start, u64 end); +void xe_ggtt_remove_bo(struct xe_ggtt *ggtt, struct xe_bo *bo); + +int xe_ggtt_dump(struct xe_ggtt *ggtt, struct drm_printer *p); + +#endif diff --git a/drivers/gpu/drm/xe/xe_ggtt_types.h b/drivers/gpu/drm/xe/xe_ggtt_types.h new file mode 100644 index 000000000000..d8c584d9a8c3 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_ggtt_types.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GGTT_TYPES_H_ +#define _XE_GGTT_TYPES_H_ + +#include 
<drm/drm_mm.h> + +#include "xe_pt_types.h" + +struct xe_bo; +struct xe_gt; + +struct xe_ggtt_pt_ops { + u64 (*pte_encode_bo)(struct xe_bo *bo, u64 bo_offset, u16 pat_index); +}; + +struct xe_ggtt { + struct xe_tile *tile; + + u64 size; + +#define XE_GGTT_FLAGS_64K BIT(0) + unsigned int flags; + + struct xe_bo *scratch; + + struct mutex lock; + + u64 __iomem *gsm; + + const struct xe_ggtt_pt_ops *pt_ops; + + struct drm_mm mm; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.c b/drivers/gpu/drm/xe/xe_gpu_scheduler.c new file mode 100644 index 000000000000..e4ad1d6ce1d5 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include "xe_gpu_scheduler.h" + +static void xe_sched_process_msg_queue(struct xe_gpu_scheduler *sched) +{ + if (!READ_ONCE(sched->base.pause_submit)) + queue_work(sched->base.submit_wq, &sched->work_process_msg); +} + +static void xe_sched_process_msg_queue_if_ready(struct xe_gpu_scheduler *sched) +{ + struct xe_sched_msg *msg; + + spin_lock(&sched->base.job_list_lock); + msg = list_first_entry_or_null(&sched->msgs, struct xe_sched_msg, link); + if (msg) + xe_sched_process_msg_queue(sched); + spin_unlock(&sched->base.job_list_lock); +} + +static struct xe_sched_msg * +xe_sched_get_msg(struct xe_gpu_scheduler *sched) +{ + struct xe_sched_msg *msg; + + spin_lock(&sched->base.job_list_lock); + msg = list_first_entry_or_null(&sched->msgs, + struct xe_sched_msg, link); + if (msg) + list_del(&msg->link); + spin_unlock(&sched->base.job_list_lock); + + return msg; +} + +static void xe_sched_process_msg_work(struct work_struct *w) +{ + struct xe_gpu_scheduler *sched = + container_of(w, struct xe_gpu_scheduler, work_process_msg); + struct xe_sched_msg *msg; + + if (READ_ONCE(sched->base.pause_submit)) + return; + + msg = xe_sched_get_msg(sched); + if (msg) { + sched->ops->process_msg(msg); + + xe_sched_process_msg_queue_if_ready(sched); + } +} + +int xe_sched_init(struct xe_gpu_scheduler *sched, + const struct drm_sched_backend_ops *ops, + const struct xe_sched_backend_ops *xe_ops, + struct workqueue_struct *submit_wq, + uint32_t hw_submission, unsigned hang_limit, + long timeout, struct workqueue_struct *timeout_wq, + atomic_t *score, const char *name, + struct device *dev) +{ + sched->ops = xe_ops; + INIT_LIST_HEAD(&sched->msgs); + INIT_WORK(&sched->work_process_msg, xe_sched_process_msg_work); + + return drm_sched_init(&sched->base, ops, submit_wq, 1, hw_submission, + hang_limit, timeout, timeout_wq, score, name, + dev); +} + +void xe_sched_fini(struct xe_gpu_scheduler *sched) +{ + xe_sched_submission_stop(sched); + drm_sched_fini(&sched->base); +} + +void xe_sched_submission_start(struct xe_gpu_scheduler *sched) +{ + drm_sched_wqueue_start(&sched->base); + queue_work(sched->base.submit_wq, &sched->work_process_msg); +} + +void xe_sched_submission_stop(struct xe_gpu_scheduler *sched) +{ + drm_sched_wqueue_stop(&sched->base); + cancel_work_sync(&sched->work_process_msg); +} + +void xe_sched_add_msg(struct xe_gpu_scheduler *sched, + struct xe_sched_msg *msg) +{ + spin_lock(&sched->base.job_list_lock); + list_add_tail(&msg->link, &sched->msgs); + spin_unlock(&sched->base.job_list_lock); + + xe_sched_process_msg_queue(sched); +} diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.h b/drivers/gpu/drm/xe/xe_gpu_scheduler.h new file mode 100644 index 000000000000..10c6bb9c9386 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.h @@ -0,0 +1,73 @@ +/* 
SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GPU_SCHEDULER_H_ +#define _XE_GPU_SCHEDULER_H_ + +#include "xe_gpu_scheduler_types.h" +#include "xe_sched_job_types.h" + +int xe_sched_init(struct xe_gpu_scheduler *sched, + const struct drm_sched_backend_ops *ops, + const struct xe_sched_backend_ops *xe_ops, + struct workqueue_struct *submit_wq, + uint32_t hw_submission, unsigned hang_limit, + long timeout, struct workqueue_struct *timeout_wq, + atomic_t *score, const char *name, + struct device *dev); +void xe_sched_fini(struct xe_gpu_scheduler *sched); + +void xe_sched_submission_start(struct xe_gpu_scheduler *sched); +void xe_sched_submission_stop(struct xe_gpu_scheduler *sched); + +void xe_sched_add_msg(struct xe_gpu_scheduler *sched, + struct xe_sched_msg *msg); + +static inline void xe_sched_stop(struct xe_gpu_scheduler *sched) +{ + drm_sched_stop(&sched->base, NULL); +} + +static inline void xe_sched_tdr_queue_imm(struct xe_gpu_scheduler *sched) +{ + drm_sched_tdr_queue_imm(&sched->base); +} + +static inline void xe_sched_resubmit_jobs(struct xe_gpu_scheduler *sched) +{ + drm_sched_resubmit_jobs(&sched->base); +} + +static inline bool +xe_sched_invalidate_job(struct xe_sched_job *job, int threshold) +{ + return drm_sched_invalidate_job(&job->drm, threshold); +} + +static inline void xe_sched_add_pending_job(struct xe_gpu_scheduler *sched, + struct xe_sched_job *job) +{ + list_add(&job->drm.list, &sched->base.pending_list); +} + +static inline +struct xe_sched_job *xe_sched_first_pending_job(struct xe_gpu_scheduler *sched) +{ + return list_first_entry_or_null(&sched->base.pending_list, + struct xe_sched_job, drm.list); +} + +static inline int +xe_sched_entity_init(struct xe_sched_entity *entity, + struct xe_gpu_scheduler *sched) +{ + return drm_sched_entity_init(entity, 0, + (struct drm_gpu_scheduler **)&sched, + 1, NULL); +} + +#define xe_sched_entity_fini drm_sched_entity_fini + +#endif diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler_types.h b/drivers/gpu/drm/xe/xe_gpu_scheduler_types.h new file mode 100644 index 000000000000..6731b13da8bb --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gpu_scheduler_types.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GPU_SCHEDULER_TYPES_H_ +#define _XE_GPU_SCHEDULER_TYPES_H_ + +#include <drm/gpu_scheduler.h> + +/** + * struct xe_sched_msg - an in-band (relative to GPU scheduler run queue) + * message + * + * Generic enough for backend defined messages, backend can expand if needed. + */ +struct xe_sched_msg { + /** @link: list link into the gpu scheduler list of messages */ + struct list_head link; + /** + * @private_data: opaque pointer to message private data (backend defined) + */ + void *private_data; + /** @opcode: opcode of message (backend defined) */ + unsigned int opcode; +}; + +/** + * struct xe_sched_backend_ops - Define the backend operations called by the + * scheduler + */ +struct xe_sched_backend_ops { + /** + * @process_msg: Process a message. Allowed to block, it is this + * function's responsibility to free message if dynamically allocated. 
+ */ + void (*process_msg)(struct xe_sched_msg *msg); +}; + +/** + * struct xe_gpu_scheduler - Xe GPU scheduler + */ +struct xe_gpu_scheduler { + /** @base: DRM GPU scheduler */ + struct drm_gpu_scheduler base; + /** @ops: Xe scheduler ops */ + const struct xe_sched_backend_ops *ops; + /** @msgs: list of messages to be processed in @work_process_msg */ + struct list_head msgs; + /** @work_process_msg: processes messages */ + struct work_struct work_process_msg; +}; + +#define xe_sched_entity drm_sched_entity +#define xe_sched_policy drm_sched_policy + +#endif diff --git a/drivers/gpu/drm/xe/xe_gsc.c b/drivers/gpu/drm/xe/xe_gsc.c new file mode 100644 index 000000000000..a8a895cf4b44 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gsc.c @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include "xe_gsc.h" + +#include <drm/drm_managed.h> + +#include "abi/gsc_mkhi_commands_abi.h" +#include "generated/xe_wa_oob.h" +#include "xe_bb.h" +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_exec_queue.h" +#include "xe_gsc_submit.h" +#include "xe_gt.h" +#include "xe_gt_printk.h" +#include "xe_huc.h" +#include "xe_map.h" +#include "xe_mmio.h" +#include "xe_sched_job.h" +#include "xe_uc_fw.h" +#include "xe_wa.h" +#include "instructions/xe_gsc_commands.h" +#include "regs/xe_gsc_regs.h" + +static struct xe_gt * +gsc_to_gt(struct xe_gsc *gsc) +{ + return container_of(gsc, struct xe_gt, uc.gsc); +} + +static int memcpy_fw(struct xe_gsc *gsc) +{ + struct xe_gt *gt = gsc_to_gt(gsc); + struct xe_device *xe = gt_to_xe(gt); + u32 fw_size = gsc->fw.size; + void *storage; + + /* + * FIXME: xe_migrate_copy does not work with stolen mem yet, so we use + * a memcpy for now. + */ + storage = kmalloc(fw_size, GFP_KERNEL); + if (!storage) + return -ENOMEM; + + xe_map_memcpy_from(xe, storage, &gsc->fw.bo->vmap, 0, fw_size); + xe_map_memcpy_to(xe, &gsc->private->vmap, 0, storage, fw_size); + xe_map_memset(xe, &gsc->private->vmap, fw_size, 0, gsc->private->size - fw_size); + + kfree(storage); + + return 0; +} + +static int emit_gsc_upload(struct xe_gsc *gsc) +{ + struct xe_gt *gt = gsc_to_gt(gsc); + u64 offset = xe_bo_ggtt_addr(gsc->private); + struct xe_bb *bb; + struct xe_sched_job *job; + struct dma_fence *fence; + long timeout; + + bb = xe_bb_new(gt, 4, false); + if (IS_ERR(bb)) + return PTR_ERR(bb); + + bb->cs[bb->len++] = GSC_FW_LOAD; + bb->cs[bb->len++] = lower_32_bits(offset); + bb->cs[bb->len++] = upper_32_bits(offset); + bb->cs[bb->len++] = (gsc->private->size / SZ_4K) | GSC_FW_LOAD_LIMIT_VALID; + + job = xe_bb_create_job(gsc->q, bb); + if (IS_ERR(job)) { + xe_bb_free(bb, NULL); + return PTR_ERR(job); + } + + xe_sched_job_arm(job); + fence = dma_fence_get(&job->drm.s_fence->finished); + xe_sched_job_push(job); + + timeout = dma_fence_wait_timeout(fence, false, HZ); + dma_fence_put(fence); + xe_bb_free(bb, NULL); + if (timeout < 0) + return timeout; + else if (!timeout) + return -ETIME; + + return 0; +} + +#define version_query_wr(xe_, map_, offset_, field_, val_) \ + xe_map_wr_field(xe_, map_, offset_, struct gsc_get_compatibility_version_in, field_, val_) +#define version_query_rd(xe_, map_, offset_, field_) \ + xe_map_rd_field(xe_, map_, offset_, struct gsc_get_compatibility_version_out, field_) + +static u32 emit_version_query_msg(struct xe_device *xe, struct iosys_map *map, u32 wr_offset) +{ + xe_map_memset(xe, map, wr_offset, 0, sizeof(struct gsc_get_compatibility_version_in)); + + version_query_wr(xe, map, wr_offset, header.group_id, 
MKHI_GROUP_ID_GFX_SRV); + version_query_wr(xe, map, wr_offset, header.command, + MKHI_GFX_SRV_GET_HOST_COMPATIBILITY_VERSION); + + return wr_offset + sizeof(struct gsc_get_compatibility_version_in); +} + +#define GSC_VER_PKT_SZ SZ_4K /* 4K each for input and output */ +static int query_compatibility_version(struct xe_gsc *gsc) +{ + struct xe_uc_fw_version *compat = &gsc->fw.versions.found[XE_UC_FW_VER_COMPATIBILITY]; + struct xe_gt *gt = gsc_to_gt(gsc); + struct xe_tile *tile = gt_to_tile(gt); + struct xe_device *xe = gt_to_xe(gt); + struct xe_bo *bo; + u32 wr_offset; + u32 rd_offset; + u64 ggtt_offset; + int err; + + bo = xe_bo_create_pin_map(xe, tile, NULL, GSC_VER_PKT_SZ * 2, + ttm_bo_type_kernel, + XE_BO_CREATE_SYSTEM_BIT | + XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(bo)) { + xe_gt_err(gt, "failed to allocate bo for GSC version query\n"); + return PTR_ERR(bo); + } + + ggtt_offset = xe_bo_ggtt_addr(bo); + + wr_offset = xe_gsc_emit_header(xe, &bo->vmap, 0, HECI_MEADDRESS_MKHI, 0, + sizeof(struct gsc_get_compatibility_version_in)); + wr_offset = emit_version_query_msg(xe, &bo->vmap, wr_offset); + + err = xe_gsc_pkt_submit_kernel(gsc, ggtt_offset, wr_offset, + ggtt_offset + GSC_VER_PKT_SZ, + GSC_VER_PKT_SZ); + if (err) { + xe_gt_err(gt, + "failed to submit GSC request for compatibility version: %d\n", + err); + goto out_bo; + } + + err = xe_gsc_read_out_header(xe, &bo->vmap, GSC_VER_PKT_SZ, + sizeof(struct gsc_get_compatibility_version_out), + &rd_offset); + if (err) { + xe_gt_err(gt, "HuC: invalid GSC reply for version query (err=%d)\n", err); + return err; + } + + compat->major = version_query_rd(xe, &bo->vmap, rd_offset, compat_major); + compat->minor = version_query_rd(xe, &bo->vmap, rd_offset, compat_minor); + + xe_gt_info(gt, "found GSC cv%u.%u\n", compat->major, compat->minor); + +out_bo: + xe_bo_unpin_map_no_vm(bo); + return err; +} + +static int gsc_fw_is_loaded(struct xe_gt *gt) +{ + return xe_mmio_read32(gt, HECI_FWSTS1(MTL_GSC_HECI1_BASE)) & + HECI1_FWSTS1_INIT_COMPLETE; +} + +static int gsc_fw_wait(struct xe_gt *gt) +{ + /* + * GSC load can take up to 250ms from the moment the instruction is + * executed by the GSCCS. To account for possible submission delays or + * other issues, we use a 500ms timeout in the wait here. + */ + return xe_mmio_wait32(gt, HECI_FWSTS1(MTL_GSC_HECI1_BASE), + HECI1_FWSTS1_INIT_COMPLETE, + HECI1_FWSTS1_INIT_COMPLETE, + 500 * USEC_PER_MSEC, NULL, false); +} + +static int gsc_upload(struct xe_gsc *gsc) +{ + struct xe_gt *gt = gsc_to_gt(gsc); + struct xe_device *xe = gt_to_xe(gt); + int err; + + /* we should only be here if the init step were successful */ + xe_assert(xe, xe_uc_fw_is_loadable(&gsc->fw) && gsc->q); + + if (gsc_fw_is_loaded(gt)) { + xe_gt_err(gt, "GSC already loaded at upload time\n"); + return -EEXIST; + } + + err = memcpy_fw(gsc); + if (err) { + xe_gt_err(gt, "Failed to memcpy GSC FW\n"); + return err; + } + + /* + * GSC is only killed by an FLR, so we need to trigger one on unload to + * make sure we stop it. This is because we assign a chunk of memory to + * the GSC as part of the FW load, so we need to make sure it stops + * using it when we release it to the system on driver unload. Note that + * this is not a problem of the unload per-se, because the GSC will not + * touch that memory unless there are requests for it coming from the + * driver; therefore, no accesses will happen while Xe is not loaded, + * but if we re-load the driver then the GSC might wake up and try to + * access that old memory location again. 
+ * Given that an FLR is a very disruptive action (see the FLR function + * for details), we want to do it as the last action before releasing + * the access to the MMIO bar, which means we need to do it as part of + * mmio cleanup. + */ + xe->needs_flr_on_fini = true; + + err = emit_gsc_upload(gsc); + if (err) { + xe_gt_err(gt, "Failed to emit GSC FW upload (%pe)\n", ERR_PTR(err)); + return err; + } + + err = gsc_fw_wait(gt); + if (err) { + xe_gt_err(gt, "Failed to wait for GSC load (%pe)\n", ERR_PTR(err)); + return err; + } + + err = query_compatibility_version(gsc); + if (err) + return err; + + err = xe_uc_fw_check_version_requirements(&gsc->fw); + if (err) + return err; + + xe_gt_dbg(gt, "GSC FW async load completed\n"); + + return 0; +} + +static void gsc_work(struct work_struct *work) +{ + struct xe_gsc *gsc = container_of(work, typeof(*gsc), work); + struct xe_gt *gt = gsc_to_gt(gsc); + struct xe_device *xe = gt_to_xe(gt); + int ret; + + xe_device_mem_access_get(xe); + xe_force_wake_get(gt_to_fw(gt), XE_FW_GSC); + + ret = gsc_upload(gsc); + if (ret && ret != -EEXIST) { + xe_uc_fw_change_status(&gsc->fw, XE_UC_FIRMWARE_LOAD_FAIL); + goto out; + } + + xe_uc_fw_change_status(&gsc->fw, XE_UC_FIRMWARE_TRANSFERRED); + + /* HuC auth failure is not fatal */ + if (xe_huc_is_authenticated(>->uc.huc, XE_HUC_AUTH_VIA_GUC)) + xe_huc_auth(>->uc.huc, XE_HUC_AUTH_VIA_GSC); + +out: + xe_force_wake_put(gt_to_fw(gt), XE_FW_GSC); + xe_device_mem_access_put(xe); +} + +int xe_gsc_init(struct xe_gsc *gsc) +{ + struct xe_gt *gt = gsc_to_gt(gsc); + struct xe_tile *tile = gt_to_tile(gt); + int ret; + + gsc->fw.type = XE_UC_FW_TYPE_GSC; + INIT_WORK(&gsc->work, gsc_work); + + /* The GSC uC is only available on the media GT */ + if (tile->media_gt && (gt != tile->media_gt)) { + xe_uc_fw_change_status(&gsc->fw, XE_UC_FIRMWARE_NOT_SUPPORTED); + return 0; + } + + /* + * Some platforms can have GuC but not GSC. That would cause + * xe_uc_fw_init(gsc) to return a "not supported" failure code and abort + * all firmware loading. So check for GSC being enabled before + * propagating the failure back up. That way the higher level will keep + * going and load GuC as appropriate. 
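+ * Note that a real initialization failure on a platform where the GSC
+ * firmware is enabled is still treated as fatal below.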
+ */ + ret = xe_uc_fw_init(&gsc->fw); + if (!xe_uc_fw_is_enabled(&gsc->fw)) + return 0; + else if (ret) + goto out; + + return 0; + +out: + xe_gt_err(gt, "GSC init failed with %d", ret); + return ret; +} + +static void free_resources(struct drm_device *drm, void *arg) +{ + struct xe_gsc *gsc = arg; + + if (gsc->wq) { + destroy_workqueue(gsc->wq); + gsc->wq = NULL; + } + + if (gsc->q) { + xe_exec_queue_put(gsc->q); + gsc->q = NULL; + } + + if (gsc->private) { + xe_bo_unpin_map_no_vm(gsc->private); + gsc->private = NULL; + } +} + +int xe_gsc_init_post_hwconfig(struct xe_gsc *gsc) +{ + struct xe_gt *gt = gsc_to_gt(gsc); + struct xe_tile *tile = gt_to_tile(gt); + struct xe_device *xe = gt_to_xe(gt); + struct xe_hw_engine *hwe = xe_gt_hw_engine(gt, XE_ENGINE_CLASS_OTHER, 0, true); + struct xe_exec_queue *q; + struct workqueue_struct *wq; + struct xe_bo *bo; + int err; + + if (!xe_uc_fw_is_available(&gsc->fw)) + return 0; + + if (!hwe) + return -ENODEV; + + bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4M, + ttm_bo_type_kernel, + XE_BO_CREATE_STOLEN_BIT | + XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(bo)) + return PTR_ERR(bo); + + q = xe_exec_queue_create(xe, NULL, + BIT(hwe->logical_instance), 1, hwe, + EXEC_QUEUE_FLAG_KERNEL | + EXEC_QUEUE_FLAG_PERMANENT); + if (IS_ERR(q)) { + xe_gt_err(gt, "Failed to create queue for GSC submission\n"); + err = PTR_ERR(q); + goto out_bo; + } + + wq = alloc_ordered_workqueue("gsc-ordered-wq", 0); + if (!wq) { + err = -ENOMEM; + goto out_q; + } + + gsc->private = bo; + gsc->q = q; + gsc->wq = wq; + + err = drmm_add_action_or_reset(&xe->drm, free_resources, gsc); + if (err) + return err; + + xe_uc_fw_change_status(&gsc->fw, XE_UC_FIRMWARE_LOADABLE); + + return 0; + +out_q: + xe_exec_queue_put(q); +out_bo: + xe_bo_unpin_map_no_vm(bo); + return err; +} + +void xe_gsc_load_start(struct xe_gsc *gsc) +{ + struct xe_gt *gt = gsc_to_gt(gsc); + + if (!xe_uc_fw_is_loadable(&gsc->fw) || !gsc->q) + return; + + /* GSC FW survives GT reset and D3Hot */ + if (gsc_fw_is_loaded(gt)) { + xe_uc_fw_change_status(&gsc->fw, XE_UC_FIRMWARE_TRANSFERRED); + return; + } + + queue_work(gsc->wq, &gsc->work); +} + +void xe_gsc_wait_for_worker_completion(struct xe_gsc *gsc) +{ + if (xe_uc_fw_is_loadable(&gsc->fw) && gsc->wq) + flush_work(&gsc->work); +} + +/* + * wa_14015076503: if the GSC FW is loaded, we need to alert it before doing a + * GSC engine reset by writing a notification bit in the GS1 register and then + * triggering an interrupt to GSC; from the interrupt it will take up to 200ms + * for the FW to get prepare for the reset, so we need to wait for that amount + * of time. + * After the reset is complete we need to then clear the GS1 register. + */ +void xe_gsc_wa_14015076503(struct xe_gt *gt, bool prep) +{ + u32 gs1_set = prep ? HECI_H_GS1_ER_PREP : 0; + u32 gs1_clr = prep ? 
0 : HECI_H_GS1_ER_PREP; + + /* WA only applies if the GSC is loaded */ + if (!XE_WA(gt, 14015076503) || !gsc_fw_is_loaded(gt)) + return; + + xe_mmio_rmw32(gt, HECI_H_GS1(MTL_GSC_HECI2_BASE), gs1_clr, gs1_set); + + if (prep) { + /* make sure the reset bit is clear when writing the CSR reg */ + xe_mmio_rmw32(gt, HECI_H_CSR(MTL_GSC_HECI2_BASE), + HECI_H_CSR_RST, HECI_H_CSR_IG); + msleep(200); + } +} diff --git a/drivers/gpu/drm/xe/xe_gsc.h b/drivers/gpu/drm/xe/xe_gsc.h new file mode 100644 index 000000000000..bc1ef7f31ea2 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gsc.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GSC_H_ +#define _XE_GSC_H_ + +#include "xe_gsc_types.h" + +struct xe_gt; + +int xe_gsc_init(struct xe_gsc *gsc); +int xe_gsc_init_post_hwconfig(struct xe_gsc *gsc); +void xe_gsc_wait_for_worker_completion(struct xe_gsc *gsc); +void xe_gsc_load_start(struct xe_gsc *gsc); + +void xe_gsc_wa_14015076503(struct xe_gt *gt, bool prep); + +#endif diff --git a/drivers/gpu/drm/xe/xe_gsc_submit.c b/drivers/gpu/drm/xe/xe_gsc_submit.c new file mode 100644 index 000000000000..8c5381e5913f --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gsc_submit.c @@ -0,0 +1,184 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include "xe_gsc_submit.h" + +#include "abi/gsc_command_header_abi.h" +#include "xe_bb.h" +#include "xe_exec_queue.h" +#include "xe_gt_printk.h" +#include "xe_gt_types.h" +#include "xe_map.h" +#include "xe_sched_job.h" +#include "instructions/xe_gsc_commands.h" +#include "regs/xe_gsc_regs.h" + +#define GSC_HDR_SIZE (sizeof(struct intel_gsc_mtl_header)) /* shorthand define */ + +#define mtl_gsc_header_wr(xe_, map_, offset_, field_, val_) \ + xe_map_wr_field(xe_, map_, offset_, struct intel_gsc_mtl_header, field_, val_) + +#define mtl_gsc_header_rd(xe_, map_, offset_, field_) \ + xe_map_rd_field(xe_, map_, offset_, struct intel_gsc_mtl_header, field_) + +/* + * GSC FW allows us to define the host_session_handle as we see fit, as long + * as we use unique identifier for each user, with handle 0 being reserved for + * kernel usage. + * To be able to differentiate which client subsystem owns the given session, we + * include the client id in the top 8 bits of the handle. 
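+ * Bits 63:56 of the handle therefore carry the client id, while the
+ * remaining bits hold the caller-provided session id.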
+ */ +#define HOST_SESSION_CLIENT_MASK GENMASK_ULL(63, 56) + +static struct xe_gt * +gsc_to_gt(struct xe_gsc *gsc) +{ + return container_of(gsc, struct xe_gt, uc.gsc); +} + +/** + * xe_gsc_emit_header - write the MTL GSC header in memory + * @xe: the Xe device + * @map: the iosys map to write to + * @offset: offset from the start of the map at which to write the header + * @heci_client_id: client id identifying the type of command (see abi for values) + * @host_session_id: host session ID of the caller + * @payload_size: size of the payload that follows the header + * + * Returns: offset memory location following the header + */ +u32 xe_gsc_emit_header(struct xe_device *xe, struct iosys_map *map, u32 offset, + u8 heci_client_id, u64 host_session_id, u32 payload_size) +{ + xe_assert(xe, !(host_session_id & HOST_SESSION_CLIENT_MASK)); + + if (host_session_id) + host_session_id |= FIELD_PREP(HOST_SESSION_CLIENT_MASK, heci_client_id); + + xe_map_memset(xe, map, offset, 0, GSC_HDR_SIZE); + + mtl_gsc_header_wr(xe, map, offset, validity_marker, GSC_HECI_VALIDITY_MARKER); + mtl_gsc_header_wr(xe, map, offset, heci_client_id, heci_client_id); + mtl_gsc_header_wr(xe, map, offset, host_session_handle, host_session_id); + mtl_gsc_header_wr(xe, map, offset, header_version, MTL_GSC_HEADER_VERSION); + mtl_gsc_header_wr(xe, map, offset, message_size, payload_size + GSC_HDR_SIZE); + + return offset + GSC_HDR_SIZE; +}; + +/** + * xe_gsc_check_and_update_pending - check the pending bit and update the input + * header with the retry handle from the output header + * @xe: the Xe device + * @in: the iosys map containing the input buffer + * @offset_in: offset within the iosys at which the input buffer is located + * @out: the iosys map containing the output buffer + * @offset_out: offset within the iosys at which the output buffer is located + * + * Returns: true if the pending bit was set, false otherwise + */ +bool xe_gsc_check_and_update_pending(struct xe_device *xe, + struct iosys_map *in, u32 offset_in, + struct iosys_map *out, u32 offset_out) +{ + if (mtl_gsc_header_rd(xe, out, offset_out, flags) & GSC_OUTFLAG_MSG_PENDING) { + u64 handle = mtl_gsc_header_rd(xe, out, offset_out, gsc_message_handle); + + mtl_gsc_header_wr(xe, in, offset_in, gsc_message_handle, handle); + + return true; + } + + return false; +} + +/** + * xe_gsc_read_out_header - reads and validates the output header and returns + * the offset of the reply following the header + * @xe: the Xe device + * @map: the iosys map containing the output buffer + * @offset: offset within the iosys at which the output buffer is located + * @min_payload_size: minimum size of the message excluding the gsc header + * @payload_offset: optional pointer to be set to the payload offset + * + * Returns: -errno value on failure, 0 otherwise + */ +int xe_gsc_read_out_header(struct xe_device *xe, + struct iosys_map *map, u32 offset, + u32 min_payload_size, + u32 *payload_offset) +{ + u32 marker = mtl_gsc_header_rd(xe, map, offset, validity_marker); + u32 size = mtl_gsc_header_rd(xe, map, offset, message_size); + u32 payload_size = size - GSC_HDR_SIZE; + + if (marker != GSC_HECI_VALIDITY_MARKER) + return -EPROTO; + + if (size < GSC_HDR_SIZE || payload_size < min_payload_size) + return -ENODATA; + + if (payload_offset) + *payload_offset = offset + GSC_HDR_SIZE; + + return 0; +} + +/** + * xe_gsc_pkt_submit_kernel - submit a kernel heci pkt to the GSC + * @gsc: the GSC uC + * @addr_in: GGTT address of the message to send to the GSC + * @size_in: size of the message 
to send to the GSC + * @addr_out: GGTT address for the GSC to write the reply to + * @size_out: size of the memory reserved for the reply + */ +int xe_gsc_pkt_submit_kernel(struct xe_gsc *gsc, u64 addr_in, u32 size_in, + u64 addr_out, u32 size_out) +{ + struct xe_gt *gt = gsc_to_gt(gsc); + struct xe_bb *bb; + struct xe_sched_job *job; + struct dma_fence *fence; + long timeout; + + if (size_in < GSC_HDR_SIZE) + return -ENODATA; + + if (size_out < GSC_HDR_SIZE) + return -ENOMEM; + + bb = xe_bb_new(gt, 8, false); + if (IS_ERR(bb)) + return PTR_ERR(bb); + + bb->cs[bb->len++] = GSC_HECI_CMD_PKT; + bb->cs[bb->len++] = lower_32_bits(addr_in); + bb->cs[bb->len++] = upper_32_bits(addr_in); + bb->cs[bb->len++] = size_in; + bb->cs[bb->len++] = lower_32_bits(addr_out); + bb->cs[bb->len++] = upper_32_bits(addr_out); + bb->cs[bb->len++] = size_out; + bb->cs[bb->len++] = 0; + + job = xe_bb_create_job(gsc->q, bb); + if (IS_ERR(job)) { + xe_bb_free(bb, NULL); + return PTR_ERR(job); + } + + xe_sched_job_arm(job); + fence = dma_fence_get(&job->drm.s_fence->finished); + xe_sched_job_push(job); + + timeout = dma_fence_wait_timeout(fence, false, HZ); + dma_fence_put(fence); + xe_bb_free(bb, NULL); + if (timeout < 0) + return timeout; + else if (!timeout) + return -ETIME; + + return 0; +} diff --git a/drivers/gpu/drm/xe/xe_gsc_submit.h b/drivers/gpu/drm/xe/xe_gsc_submit.h new file mode 100644 index 000000000000..0801da5d446a --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gsc_submit.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GSC_SUBMIT_H_ +#define _XE_GSC_SUBMIT_H_ + +#include <linux/types.h> + +struct iosys_map; +struct xe_device; +struct xe_gsc; + +u32 xe_gsc_emit_header(struct xe_device *xe, struct iosys_map *map, u32 offset, + u8 heci_client_id, u64 host_session_id, u32 payload_size); + +bool xe_gsc_check_and_update_pending(struct xe_device *xe, + struct iosys_map *in, u32 offset_in, + struct iosys_map *out, u32 offset_out); + +int xe_gsc_read_out_header(struct xe_device *xe, + struct iosys_map *map, u32 offset, + u32 min_payload_size, + u32 *payload_offset); + +int xe_gsc_pkt_submit_kernel(struct xe_gsc *gsc, u64 addr_in, u32 size_in, + u64 addr_out, u32 size_out); + +#endif diff --git a/drivers/gpu/drm/xe/xe_gsc_types.h b/drivers/gpu/drm/xe/xe_gsc_types.h new file mode 100644 index 000000000000..57fefd66a7ea --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gsc_types.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GSC_TYPES_H_ +#define _XE_GSC_TYPES_H_ + +#include <linux/workqueue.h> + +#include "xe_uc_fw_types.h" + +struct xe_bo; +struct xe_exec_queue; + +/** + * struct xe_gsc - GSC + */ +struct xe_gsc { + /** @fw: Generic uC firmware management */ + struct xe_uc_fw fw; + + /** @security_version: SVN found in the fetched blob */ + u32 security_version; + + /** @private: Private data for use by the GSC FW */ + struct xe_bo *private; + + /** @q: Default queue used for submissions to GSC FW */ + struct xe_exec_queue *q; + + /** @wq: workqueue to handle jobs for delayed load and proxy handling */ + struct workqueue_struct *wq; + + /** @work: delayed load and proxy handling work */ + struct work_struct work; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c new file mode 100644 index 000000000000..3af2adec1295 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -0,0 +1,778 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation 
+ */ + +#include "xe_gt.h" + +#include <linux/minmax.h> + +#include <drm/drm_managed.h> +#include <drm/xe_drm.h> + +#include "instructions/xe_gfxpipe_commands.h" +#include "instructions/xe_mi_commands.h" +#include "regs/xe_gt_regs.h" +#include "xe_assert.h" +#include "xe_bb.h" +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_exec_queue.h" +#include "xe_execlist.h" +#include "xe_force_wake.h" +#include "xe_ggtt.h" +#include "xe_gsc.h" +#include "xe_gt_ccs_mode.h" +#include "xe_gt_clock.h" +#include "xe_gt_freq.h" +#include "xe_gt_idle.h" +#include "xe_gt_mcr.h" +#include "xe_gt_pagefault.h" +#include "xe_gt_printk.h" +#include "xe_gt_sysfs.h" +#include "xe_gt_tlb_invalidation.h" +#include "xe_gt_topology.h" +#include "xe_guc_exec_queue_types.h" +#include "xe_guc_pc.h" +#include "xe_hw_fence.h" +#include "xe_hw_engine_class_sysfs.h" +#include "xe_irq.h" +#include "xe_lmtt.h" +#include "xe_lrc.h" +#include "xe_map.h" +#include "xe_migrate.h" +#include "xe_mmio.h" +#include "xe_pat.h" +#include "xe_mocs.h" +#include "xe_reg_sr.h" +#include "xe_ring_ops.h" +#include "xe_sa.h" +#include "xe_sched_job.h" +#include "xe_sriov.h" +#include "xe_tuning.h" +#include "xe_uc.h" +#include "xe_vm.h" +#include "xe_wa.h" +#include "xe_wopcm.h" + +struct xe_gt *xe_gt_alloc(struct xe_tile *tile) +{ + struct xe_gt *gt; + + gt = drmm_kzalloc(&tile_to_xe(tile)->drm, sizeof(*gt), GFP_KERNEL); + if (!gt) + return ERR_PTR(-ENOMEM); + + gt->tile = tile; + gt->ordered_wq = alloc_ordered_workqueue("gt-ordered-wq", 0); + + return gt; +} + +void xe_gt_sanitize(struct xe_gt *gt) +{ + /* + * FIXME: if xe_uc_sanitize is called here, on TGL driver will not + * reload + */ + gt->uc.guc.submission_state.enabled = false; +} + +static void gt_fini(struct drm_device *drm, void *arg) +{ + struct xe_gt *gt = arg; + int i; + + destroy_workqueue(gt->ordered_wq); + + for (i = 0; i < XE_ENGINE_CLASS_MAX; ++i) + xe_hw_fence_irq_finish(>->fence_irq[i]); +} + +static void gt_reset_worker(struct work_struct *w); + +static int emit_nop_job(struct xe_gt *gt, struct xe_exec_queue *q) +{ + struct xe_sched_job *job; + struct xe_bb *bb; + struct dma_fence *fence; + long timeout; + + bb = xe_bb_new(gt, 4, false); + if (IS_ERR(bb)) + return PTR_ERR(bb); + + job = xe_bb_create_job(q, bb); + if (IS_ERR(job)) { + xe_bb_free(bb, NULL); + return PTR_ERR(job); + } + + xe_sched_job_arm(job); + fence = dma_fence_get(&job->drm.s_fence->finished); + xe_sched_job_push(job); + + timeout = dma_fence_wait_timeout(fence, false, HZ); + dma_fence_put(fence); + xe_bb_free(bb, NULL); + if (timeout < 0) + return timeout; + else if (!timeout) + return -ETIME; + + return 0; +} + +/* + * Convert back from encoded value to type-safe, only to be used when reg.mcr + * is true + */ +static struct xe_reg_mcr to_xe_reg_mcr(const struct xe_reg reg) +{ + return (const struct xe_reg_mcr){.__reg.raw = reg.raw }; +} + +static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q) +{ + struct xe_reg_sr *sr = &q->hwe->reg_lrc; + struct xe_reg_sr_entry *entry; + unsigned long idx; + struct xe_sched_job *job; + struct xe_bb *bb; + struct dma_fence *fence; + long timeout; + int count = 0; + + if (q->hwe->class == XE_ENGINE_CLASS_RENDER) + /* Big enough to emit all of the context's 3DSTATE */ + bb = xe_bb_new(gt, xe_lrc_size(gt_to_xe(gt), q->hwe->class), false); + else + /* Just pick a large BB size */ + bb = xe_bb_new(gt, SZ_4K, false); + + if (IS_ERR(bb)) + return PTR_ERR(bb); + + xa_for_each(&sr->xa, idx, entry) + ++count; + + if (count) { + xe_gt_dbg(gt, "LRC WA %s 
save-restore batch\n", sr->name); + + bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count); + + xa_for_each(&sr->xa, idx, entry) { + struct xe_reg reg = entry->reg; + struct xe_reg_mcr reg_mcr = to_xe_reg_mcr(reg); + u32 val; + + /* + * Skip reading the register if it's not really needed + */ + if (reg.masked) + val = entry->clr_bits << 16; + else if (entry->clr_bits + 1) + val = (reg.mcr ? + xe_gt_mcr_unicast_read_any(gt, reg_mcr) : + xe_mmio_read32(gt, reg)) & (~entry->clr_bits); + else + val = 0; + + val |= entry->set_bits; + + bb->cs[bb->len++] = reg.addr; + bb->cs[bb->len++] = val; + xe_gt_dbg(gt, "REG[0x%x] = 0x%08x", reg.addr, val); + } + } + + xe_lrc_emit_hwe_state_instructions(q, bb); + + job = xe_bb_create_job(q, bb); + if (IS_ERR(job)) { + xe_bb_free(bb, NULL); + return PTR_ERR(job); + } + + xe_sched_job_arm(job); + fence = dma_fence_get(&job->drm.s_fence->finished); + xe_sched_job_push(job); + + timeout = dma_fence_wait_timeout(fence, false, HZ); + dma_fence_put(fence); + xe_bb_free(bb, NULL); + if (timeout < 0) + return timeout; + else if (!timeout) + return -ETIME; + + return 0; +} + +int xe_gt_record_default_lrcs(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + int err = 0; + + for_each_hw_engine(hwe, gt, id) { + struct xe_exec_queue *q, *nop_q; + void *default_lrc; + + if (gt->default_lrc[hwe->class]) + continue; + + xe_reg_sr_init(&hwe->reg_lrc, hwe->name, xe); + xe_wa_process_lrc(hwe); + xe_hw_engine_setup_default_lrc_state(hwe); + xe_tuning_process_lrc(hwe); + + default_lrc = drmm_kzalloc(&xe->drm, + xe_lrc_size(xe, hwe->class), + GFP_KERNEL); + if (!default_lrc) + return -ENOMEM; + + q = xe_exec_queue_create(xe, NULL, BIT(hwe->logical_instance), 1, + hwe, EXEC_QUEUE_FLAG_KERNEL); + if (IS_ERR(q)) { + err = PTR_ERR(q); + xe_gt_err(gt, "hwe %s: xe_exec_queue_create failed (%pe)\n", + hwe->name, q); + return err; + } + + /* Prime golden LRC with known good state */ + err = emit_wa_job(gt, q); + if (err) { + xe_gt_err(gt, "hwe %s: emit_wa_job failed (%pe) guc_id=%u\n", + hwe->name, ERR_PTR(err), q->guc->id); + goto put_exec_queue; + } + + nop_q = xe_exec_queue_create(xe, NULL, BIT(hwe->logical_instance), + 1, hwe, EXEC_QUEUE_FLAG_KERNEL); + if (IS_ERR(nop_q)) { + err = PTR_ERR(nop_q); + xe_gt_err(gt, "hwe %s: nop xe_exec_queue_create failed (%pe)\n", + hwe->name, nop_q); + goto put_exec_queue; + } + + /* Switch to different LRC */ + err = emit_nop_job(gt, nop_q); + if (err) { + xe_gt_err(gt, "hwe %s: nop emit_nop_job failed (%pe) guc_id=%u\n", + hwe->name, ERR_PTR(err), nop_q->guc->id); + goto put_nop_q; + } + + /* Reload golden LRC to record the effect of any indirect W/A */ + err = emit_nop_job(gt, q); + if (err) { + xe_gt_err(gt, "hwe %s: emit_nop_job failed (%pe) guc_id=%u\n", + hwe->name, ERR_PTR(err), q->guc->id); + goto put_nop_q; + } + + xe_map_memcpy_from(xe, default_lrc, + &q->lrc[0].bo->vmap, + xe_lrc_pphwsp_offset(&q->lrc[0]), + xe_lrc_size(xe, hwe->class)); + + gt->default_lrc[hwe->class] = default_lrc; +put_nop_q: + xe_exec_queue_put(nop_q); +put_exec_queue: + xe_exec_queue_put(q); + if (err) + break; + } + + return err; +} + +int xe_gt_init_early(struct xe_gt *gt) +{ + int err; + + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + if (err) + return err; + + xe_gt_topology_init(gt); + xe_gt_mcr_init(gt); + + err = xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); + if (err) + return err; + + xe_reg_sr_init(>->reg_sr, "GT", gt_to_xe(gt)); + + err = xe_wa_init(gt); + if (err) + return 
err; + + xe_wa_process_gt(gt); + xe_wa_process_oob(gt); + xe_tuning_process_gt(gt); + + return 0; +} + +static void dump_pat_on_error(struct xe_gt *gt) +{ + struct drm_printer p; + char prefix[32]; + + snprintf(prefix, sizeof(prefix), "[GT%u Error]", gt->info.id); + p = drm_debug_printer(prefix); + + xe_pat_dump(gt, &p); +} + +static int gt_fw_domain_init(struct xe_gt *gt) +{ + int err, i; + + xe_device_mem_access_get(gt_to_xe(gt)); + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + if (err) + goto err_hw_fence_irq; + + xe_pat_init(gt); + + if (!xe_gt_is_media_type(gt)) { + err = xe_ggtt_init(gt_to_tile(gt)->mem.ggtt); + if (err) + goto err_force_wake; + if (IS_SRIOV_PF(gt_to_xe(gt))) + xe_lmtt_init(>_to_tile(gt)->sriov.pf.lmtt); + } + + err = xe_uc_init(>->uc); + if (err) + goto err_force_wake; + + /* Raise GT freq to speed up HuC/GuC load */ + xe_guc_pc_init_early(>->uc.guc.pc); + + err = xe_uc_init_hwconfig(>->uc); + if (err) + goto err_force_wake; + + xe_gt_idle_sysfs_init(>->gtidle); + + /* XXX: Fake that we pull the engine mask from hwconfig blob */ + gt->info.engine_mask = gt->info.__engine_mask; + + /* Enable per hw engine IRQs */ + xe_irq_enable_hwe(gt); + + /* Rerun MCR init as we now have hw engine list */ + xe_gt_mcr_init(gt); + + err = xe_hw_engines_init_early(gt); + if (err) + goto err_force_wake; + + err = xe_hw_engine_class_sysfs_init(gt); + if (err) + drm_warn(>_to_xe(gt)->drm, + "failed to register engines sysfs directory, err: %d\n", + err); + + /* Initialize CCS mode sysfs after early initialization of HW engines */ + xe_gt_ccs_mode_sysfs_init(gt); + + err = xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); + XE_WARN_ON(err); + xe_device_mem_access_put(gt_to_xe(gt)); + + return 0; + +err_force_wake: + dump_pat_on_error(gt); + xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); +err_hw_fence_irq: + for (i = 0; i < XE_ENGINE_CLASS_MAX; ++i) + xe_hw_fence_irq_finish(>->fence_irq[i]); + xe_device_mem_access_put(gt_to_xe(gt)); + + return err; +} + +static int all_fw_domain_init(struct xe_gt *gt) +{ + int err, i; + + xe_device_mem_access_get(gt_to_xe(gt)); + err = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL); + if (err) + goto err_hw_fence_irq; + + xe_gt_mcr_set_implicit_defaults(gt); + xe_reg_sr_apply_mmio(>->reg_sr, gt); + + err = xe_gt_clock_init(gt); + if (err) + goto err_force_wake; + + xe_mocs_init(gt); + err = xe_execlist_init(gt); + if (err) + goto err_force_wake; + + err = xe_hw_engines_init(gt); + if (err) + goto err_force_wake; + + err = xe_uc_init_post_hwconfig(>->uc); + if (err) + goto err_force_wake; + + if (!xe_gt_is_media_type(gt)) { + /* + * USM has its only SA pool to non-block behind user operations + */ + if (gt_to_xe(gt)->info.has_usm) { + gt->usm.bb_pool = xe_sa_bo_manager_init(gt_to_tile(gt), SZ_1M, 16); + if (IS_ERR(gt->usm.bb_pool)) { + err = PTR_ERR(gt->usm.bb_pool); + goto err_force_wake; + } + } + } + + if (!xe_gt_is_media_type(gt)) { + struct xe_tile *tile = gt_to_tile(gt); + + tile->migrate = xe_migrate_init(tile); + if (IS_ERR(tile->migrate)) { + err = PTR_ERR(tile->migrate); + goto err_force_wake; + } + } + + err = xe_uc_init_hw(>->uc); + if (err) + goto err_force_wake; + + /* Configure default CCS mode of 1 engine with all resources */ + if (xe_gt_ccs_mode_enabled(gt)) { + gt->ccs_mode = 1; + xe_gt_apply_ccs_mode(gt); + } + + if (IS_SRIOV_PF(gt_to_xe(gt)) && !xe_gt_is_media_type(gt)) + xe_lmtt_init_hw(>_to_tile(gt)->sriov.pf.lmtt); + + err = xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL); + XE_WARN_ON(err); + xe_device_mem_access_put(gt_to_xe(gt)); 
+ + return 0; + +err_force_wake: + xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL); +err_hw_fence_irq: + for (i = 0; i < XE_ENGINE_CLASS_MAX; ++i) + xe_hw_fence_irq_finish(>->fence_irq[i]); + xe_device_mem_access_put(gt_to_xe(gt)); + + return err; +} + +int xe_gt_init(struct xe_gt *gt) +{ + int err; + int i; + + INIT_WORK(>->reset.worker, gt_reset_worker); + + for (i = 0; i < XE_ENGINE_CLASS_MAX; ++i) { + gt->ring_ops[i] = xe_ring_ops_get(gt, i); + xe_hw_fence_irq_init(>->fence_irq[i]); + } + + err = xe_gt_tlb_invalidation_init(gt); + if (err) + return err; + + err = xe_gt_pagefault_init(gt); + if (err) + return err; + + xe_mocs_init_early(gt); + + xe_gt_sysfs_init(gt); + + err = gt_fw_domain_init(gt); + if (err) + return err; + + xe_gt_freq_init(gt); + + xe_force_wake_init_engines(gt, gt_to_fw(gt)); + + err = all_fw_domain_init(gt); + if (err) + return err; + + err = drmm_add_action_or_reset(>_to_xe(gt)->drm, gt_fini, gt); + if (err) + return err; + + return 0; +} + +static int do_gt_reset(struct xe_gt *gt) +{ + int err; + + xe_gsc_wa_14015076503(gt, true); + + xe_mmio_write32(gt, GDRST, GRDOM_FULL); + err = xe_mmio_wait32(gt, GDRST, GRDOM_FULL, 0, 5000, NULL, false); + if (err) + xe_gt_err(gt, "failed to clear GRDOM_FULL (%pe)\n", + ERR_PTR(err)); + + xe_gsc_wa_14015076503(gt, false); + + return err; +} + +static int do_gt_restart(struct xe_gt *gt) +{ + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + int err; + + xe_pat_init(gt); + + xe_gt_mcr_set_implicit_defaults(gt); + xe_reg_sr_apply_mmio(>->reg_sr, gt); + + err = xe_wopcm_init(>->uc.wopcm); + if (err) + return err; + + for_each_hw_engine(hwe, gt, id) + xe_hw_engine_enable_ring(hwe); + + err = xe_uc_sanitize_reset(>->uc); + if (err) + return err; + + err = xe_uc_init_hw(>->uc); + if (err) + return err; + + if (IS_SRIOV_PF(gt_to_xe(gt)) && !xe_gt_is_media_type(gt)) + xe_lmtt_init_hw(>_to_tile(gt)->sriov.pf.lmtt); + + xe_mocs_init(gt); + err = xe_uc_start(>->uc); + if (err) + return err; + + for_each_hw_engine(hwe, gt, id) { + xe_reg_sr_apply_mmio(&hwe->reg_sr, gt); + xe_reg_sr_apply_whitelist(hwe); + } + + /* Get CCS mode in sync between sw/hw */ + xe_gt_apply_ccs_mode(gt); + + return 0; +} + +static int gt_reset(struct xe_gt *gt) +{ + int err; + + /* We only support GT resets with GuC submission */ + if (!xe_device_uc_enabled(gt_to_xe(gt))) + return -ENODEV; + + xe_gt_info(gt, "reset started\n"); + + if (xe_fault_inject_gt_reset()) { + err = -ECANCELED; + goto err_fail; + } + + xe_gt_sanitize(gt); + + xe_device_mem_access_get(gt_to_xe(gt)); + err = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL); + if (err) + goto err_msg; + + xe_uc_gucrc_disable(>->uc); + xe_uc_stop_prepare(>->uc); + xe_gt_pagefault_reset(gt); + + err = xe_uc_stop(>->uc); + if (err) + goto err_out; + + err = do_gt_reset(gt); + if (err) + goto err_out; + + xe_gt_tlb_invalidation_reset(gt); + + err = do_gt_restart(gt); + if (err) + goto err_out; + + err = xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL); + xe_device_mem_access_put(gt_to_xe(gt)); + XE_WARN_ON(err); + + xe_gt_info(gt, "reset done\n"); + + return 0; + +err_out: + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); +err_msg: + XE_WARN_ON(xe_uc_start(>->uc)); + xe_device_mem_access_put(gt_to_xe(gt)); +err_fail: + xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err)); + + gt_to_xe(gt)->needs_flr_on_fini = true; + + return err; +} + +static void gt_reset_worker(struct work_struct *w) +{ + struct xe_gt *gt = container_of(w, typeof(*gt), reset.worker); + + gt_reset(gt); +} + +void 
xe_gt_reset_async(struct xe_gt *gt) +{ + xe_gt_info(gt, "trying reset\n"); + + /* Don't do a reset while one is already in flight */ + if (!xe_fault_inject_gt_reset() && xe_uc_reset_prepare(>->uc)) + return; + + xe_gt_info(gt, "reset queued\n"); + queue_work(gt->ordered_wq, >->reset.worker); +} + +void xe_gt_suspend_prepare(struct xe_gt *gt) +{ + xe_device_mem_access_get(gt_to_xe(gt)); + XE_WARN_ON(xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + + xe_uc_stop_prepare(>->uc); + + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + xe_device_mem_access_put(gt_to_xe(gt)); +} + +int xe_gt_suspend(struct xe_gt *gt) +{ + int err; + + xe_gt_sanitize(gt); + + xe_device_mem_access_get(gt_to_xe(gt)); + err = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL); + if (err) + goto err_msg; + + err = xe_uc_suspend(>->uc); + if (err) + goto err_force_wake; + + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + xe_device_mem_access_put(gt_to_xe(gt)); + xe_gt_info(gt, "suspended\n"); + + return 0; + +err_force_wake: + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); +err_msg: + xe_device_mem_access_put(gt_to_xe(gt)); + xe_gt_err(gt, "suspend failed (%pe)\n", ERR_PTR(err)); + + return err; +} + +int xe_gt_resume(struct xe_gt *gt) +{ + int err; + + xe_device_mem_access_get(gt_to_xe(gt)); + err = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL); + if (err) + goto err_msg; + + err = do_gt_restart(gt); + if (err) + goto err_force_wake; + + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + xe_device_mem_access_put(gt_to_xe(gt)); + xe_gt_info(gt, "resumed\n"); + + return 0; + +err_force_wake: + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); +err_msg: + xe_device_mem_access_put(gt_to_xe(gt)); + xe_gt_err(gt, "resume failed (%pe)\n", ERR_PTR(err)); + + return err; +} + +struct xe_hw_engine *xe_gt_hw_engine(struct xe_gt *gt, + enum xe_engine_class class, + u16 instance, bool logical) +{ + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + + for_each_hw_engine(hwe, gt, id) + if (hwe->class == class && + ((!logical && hwe->instance == instance) || + (logical && hwe->logical_instance == instance))) + return hwe; + + return NULL; +} + +struct xe_hw_engine *xe_gt_any_hw_engine_by_reset_domain(struct xe_gt *gt, + enum xe_engine_class class) +{ + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + + for_each_hw_engine(hwe, gt, id) { + switch (class) { + case XE_ENGINE_CLASS_RENDER: + case XE_ENGINE_CLASS_COMPUTE: + if (hwe->class == XE_ENGINE_CLASS_RENDER || + hwe->class == XE_ENGINE_CLASS_COMPUTE) + return hwe; + break; + default: + if (hwe->class == class) + return hwe; + } + } + + return NULL; +} diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h new file mode 100644 index 000000000000..4486e083f5ef --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GT_H_ +#define _XE_GT_H_ + +#include <drm/drm_util.h> + +#include "xe_device_types.h" +#include "xe_hw_engine.h" + +#define for_each_hw_engine(hwe__, gt__, id__) \ + for ((id__) = 0; (id__) < ARRAY_SIZE((gt__)->hw_engines); (id__)++) \ + for_each_if(((hwe__) = (gt__)->hw_engines + (id__)) && \ + xe_hw_engine_is_valid((hwe__))) + +#define CCS_MASK(gt) (((gt)->info.engine_mask & XE_HW_ENGINE_CCS_MASK) >> XE_HW_ENGINE_CCS0) + +#ifdef CONFIG_FAULT_INJECTION +#include <linux/fault-inject.h> /* XXX: fault-inject.h is broken */ +extern struct fault_attr gt_reset_failure; 
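+
+/*
+ * With fault injection enabled, gt_reset_failure can be armed through the
+ * standard fault-injection framework so that xe_fault_inject_gt_reset()
+ * returns true and the next GT reset attempt fails early with -ECANCELED.
+ */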
+static inline bool xe_fault_inject_gt_reset(void) +{ + return should_fail(&gt_reset_failure, 1); +} +#else +static inline bool xe_fault_inject_gt_reset(void) +{ + return false; +} +#endif + +struct xe_gt *xe_gt_alloc(struct xe_tile *tile); +int xe_gt_init_early(struct xe_gt *gt); +int xe_gt_init(struct xe_gt *gt); +int xe_gt_record_default_lrcs(struct xe_gt *gt); +void xe_gt_suspend_prepare(struct xe_gt *gt); +int xe_gt_suspend(struct xe_gt *gt); +int xe_gt_resume(struct xe_gt *gt); +void xe_gt_reset_async(struct xe_gt *gt); +void xe_gt_sanitize(struct xe_gt *gt); + +/** + * xe_gt_any_hw_engine_by_reset_domain - scan the list of engines and return the + * first that matches the same reset domain as @class + * @gt: GT structure + * @class: hw engine class to lookup + */ +struct xe_hw_engine * +xe_gt_any_hw_engine_by_reset_domain(struct xe_gt *gt, enum xe_engine_class class); + +struct xe_hw_engine *xe_gt_hw_engine(struct xe_gt *gt, + enum xe_engine_class class, + u16 instance, + bool logical); + +static inline bool xe_gt_is_media_type(struct xe_gt *gt) +{ + return gt->info.type == XE_GT_TYPE_MEDIA; +} + +static inline bool xe_gt_is_usm_hwe(struct xe_gt *gt, struct xe_hw_engine *hwe) +{ + struct xe_device *xe = gt_to_xe(gt); + + return xe->info.has_usm && hwe->class == XE_ENGINE_CLASS_COPY && + hwe->instance == gt->usm.reserved_bcs_instance; +} + +#endif diff --git a/drivers/gpu/drm/xe/xe_gt_ccs_mode.c b/drivers/gpu/drm/xe/xe_gt_ccs_mode.c new file mode 100644 index 000000000000..529fc286cd06 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_ccs_mode.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include <drm/drm_managed.h> + +#include "regs/xe_gt_regs.h" +#include "xe_assert.h" +#include "xe_gt.h" +#include "xe_gt_ccs_mode.h" +#include "xe_gt_sysfs.h" +#include "xe_mmio.h" + +static void __xe_gt_apply_ccs_mode(struct xe_gt *gt, u32 num_engines) +{ + u32 mode = CCS_MODE_CSLICE_0_3_MASK; /* disable all by default */ + int num_slices = hweight32(CCS_MASK(gt)); + struct xe_device *xe = gt_to_xe(gt); + int width, cslice = 0; + u32 config = 0; + + xe_assert(xe, xe_gt_ccs_mode_enabled(gt)); + + xe_assert(xe, num_engines && num_engines <= num_slices); + xe_assert(xe, !(num_slices % num_engines)); + + /* + * Loop over all available slices and assign each a user engine.
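+ * Slices that are fused off are skipped and remain disabled in the CCS_MODE
+ * register.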
+ * For example, if there are four compute slices available, the + * assignment of compute slices to compute engines would be, + * + * With 1 engine (ccs0): + * slice 0, 1, 2, 3: ccs0 + * + * With 2 engines (ccs0, ccs1): + * slice 0, 2: ccs0 + * slice 1, 3: ccs1 + * + * With 4 engines (ccs0, ccs1, ccs2, ccs3): + * slice 0: ccs0 + * slice 1: ccs1 + * slice 2: ccs2 + * slice 3: ccs3 + */ + for (width = num_slices / num_engines; width; width--) { + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + + for_each_hw_engine(hwe, gt, id) { + if (hwe->class != XE_ENGINE_CLASS_COMPUTE) + continue; + + if (hwe->logical_instance >= num_engines) + break; + + config |= BIT(hwe->instance) << XE_HW_ENGINE_CCS0; + + /* If a slice is fused off, leave disabled */ + while ((CCS_MASK(gt) & BIT(cslice)) == 0) + cslice++; + + mode &= ~CCS_MODE_CSLICE(cslice, CCS_MODE_CSLICE_MASK); + mode |= CCS_MODE_CSLICE(cslice, hwe->instance); + cslice++; + } + } + + xe_mmio_write32(gt, CCS_MODE, mode); + + xe_gt_info(gt, "CCS_MODE=%x config:%08x, num_engines:%d, num_slices:%d\n", + mode, config, num_engines, num_slices); +} + +void xe_gt_apply_ccs_mode(struct xe_gt *gt) +{ + if (!gt->ccs_mode) + return; + + __xe_gt_apply_ccs_mode(gt, gt->ccs_mode); +} + +static ssize_t +num_cslices_show(struct device *kdev, + struct device_attribute *attr, char *buf) +{ + struct xe_gt *gt = kobj_to_gt(&kdev->kobj); + + return sysfs_emit(buf, "%u\n", hweight32(CCS_MASK(gt))); +} + +static DEVICE_ATTR_RO(num_cslices); + +static ssize_t +ccs_mode_show(struct device *kdev, + struct device_attribute *attr, char *buf) +{ + struct xe_gt *gt = kobj_to_gt(&kdev->kobj); + + return sysfs_emit(buf, "%u\n", gt->ccs_mode); +} + +static ssize_t +ccs_mode_store(struct device *kdev, struct device_attribute *attr, + const char *buff, size_t count) +{ + struct xe_gt *gt = kobj_to_gt(&kdev->kobj); + struct xe_device *xe = gt_to_xe(gt); + u32 num_engines, num_slices; + int ret; + + ret = kstrtou32(buff, 0, &num_engines); + if (ret) + return ret; + + /* + * Ensure number of engines specified is valid and there is an + * exact multiple of engines for slices. + */ + num_slices = hweight32(CCS_MASK(gt)); + if (!num_engines || num_engines > num_slices || num_slices % num_engines) { + xe_gt_dbg(gt, "Invalid compute config, %d engines %d slices\n", + num_engines, num_slices); + return -EINVAL; + } + + /* CCS mode can only be updated when there are no drm clients */ + spin_lock(&xe->clients.lock); + if (xe->clients.count) { + spin_unlock(&xe->clients.lock); + return -EBUSY; + } + + if (gt->ccs_mode != num_engines) { + xe_gt_info(gt, "Setting compute mode to %d\n", num_engines); + gt->ccs_mode = num_engines; + xe_gt_reset_async(gt); + } + + spin_unlock(&xe->clients.lock); + + return count; +} + +static DEVICE_ATTR_RW(ccs_mode); + +static const struct attribute *gt_ccs_mode_attrs[] = { + &dev_attr_ccs_mode.attr, + &dev_attr_num_cslices.attr, + NULL, +}; + +static void xe_gt_ccs_mode_sysfs_fini(struct drm_device *drm, void *arg) +{ + struct xe_gt *gt = arg; + + sysfs_remove_files(gt->sysfs, gt_ccs_mode_attrs); +} + +/** + * xe_gt_ccs_mode_sysfs_init - Initialize CCS mode sysfs interfaces + * @gt: GT structure + * + * Through a per-gt 'ccs_mode' sysfs interface, the user can enable a fixed + * number of compute hardware engines to which the available compute slices + * are to be allocated. This user configuration change triggers a gt reset + * and it is expected that there are no open drm clients while doing so. 
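[Editor's illustration, not part of the diff: the round-robin distribution performed by __xe_gt_apply_ccs_mode() above can be reproduced in isolation. This is a minimal user-space sketch with invented names; it only demonstrates how the non-fused slices are dealt out to the requested number of engines, matching the 1/2/4-engine example in the comment.]

        #include <stdio.h>
        #include <stdint.h>

        /* Print which enabled compute slice is assigned to which engine, using the
         * same width = num_slices / num_engines round-robin as the driver loop. */
        static void assign_cslices(uint32_t ccs_mask, int num_engines)
        {
                int num_slices = __builtin_popcount(ccs_mask);
                int cslice = 0;

                for (int width = num_slices / num_engines; width; width--) {
                        for (int engine = 0; engine < num_engines; engine++) {
                                /* fused-off slices are skipped and stay disabled */
                                while (!(ccs_mask & (1u << cslice)))
                                        cslice++;
                                printf("cslice %d -> ccs%d\n", cslice, engine);
                                cslice++;
                        }
                }
        }

        int main(void)
        {
                assign_cslices(0xf, 2); /* 4 slices, 2 engines: 0,2 -> ccs0; 1,3 -> ccs1 */
                return 0;
        }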
+ * The number of available compute slices is exposed to user through a per-gt + * 'num_cslices' sysfs interface. + */ +void xe_gt_ccs_mode_sysfs_init(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + int err; + + if (!xe_gt_ccs_mode_enabled(gt)) + return; + + err = sysfs_create_files(gt->sysfs, gt_ccs_mode_attrs); + if (err) { + drm_warn(&xe->drm, "Sysfs creation for ccs_mode failed err: %d\n", err); + return; + } + + err = drmm_add_action_or_reset(&xe->drm, xe_gt_ccs_mode_sysfs_fini, gt); + if (err) { + sysfs_remove_files(gt->sysfs, gt_ccs_mode_attrs); + drm_warn(&xe->drm, "%s: drmm_add_action_or_reset failed, err: %d\n", + __func__, err); + } +} diff --git a/drivers/gpu/drm/xe/xe_gt_ccs_mode.h b/drivers/gpu/drm/xe/xe_gt_ccs_mode.h new file mode 100644 index 000000000000..f39975aaaab0 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_ccs_mode.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GT_CCS_MODE_H_ +#define _XE_GT_CCS_MODE_H_ + +#include "xe_device_types.h" +#include "xe_gt.h" +#include "xe_gt_types.h" +#include "xe_platform_types.h" + +void xe_gt_apply_ccs_mode(struct xe_gt *gt); +void xe_gt_ccs_mode_sysfs_init(struct xe_gt *gt); + +static inline bool xe_gt_ccs_mode_enabled(const struct xe_gt *gt) +{ + /* Check if there are more than one compute engines available */ + return hweight32(CCS_MASK(gt)) > 1; +} + +#endif + diff --git a/drivers/gpu/drm/xe/xe_gt_clock.c b/drivers/gpu/drm/xe/xe_gt_clock.c new file mode 100644 index 000000000000..937054e31d72 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_clock.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_gt_clock.h" + +#include "regs/xe_gt_regs.h" +#include "regs/xe_regs.h" +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_macros.h" +#include "xe_mmio.h" + +static u32 read_reference_ts_freq(struct xe_gt *gt) +{ + u32 ts_override = xe_mmio_read32(gt, TIMESTAMP_OVERRIDE); + u32 base_freq, frac_freq; + + base_freq = REG_FIELD_GET(TIMESTAMP_OVERRIDE_US_COUNTER_DIVIDER_MASK, + ts_override) + 1; + base_freq *= 1000000; + + frac_freq = REG_FIELD_GET(TIMESTAMP_OVERRIDE_US_COUNTER_DENOMINATOR_MASK, + ts_override); + frac_freq = 1000000 / (frac_freq + 1); + + return base_freq + frac_freq; +} + +static u32 get_crystal_clock_freq(u32 rpm_config_reg) +{ + const u32 f19_2_mhz = 19200000; + const u32 f24_mhz = 24000000; + const u32 f25_mhz = 25000000; + const u32 f38_4_mhz = 38400000; + u32 crystal_clock = REG_FIELD_GET(RPM_CONFIG0_CRYSTAL_CLOCK_FREQ_MASK, + rpm_config_reg); + + switch (crystal_clock) { + case RPM_CONFIG0_CRYSTAL_CLOCK_FREQ_24_MHZ: + return f24_mhz; + case RPM_CONFIG0_CRYSTAL_CLOCK_FREQ_19_2_MHZ: + return f19_2_mhz; + case RPM_CONFIG0_CRYSTAL_CLOCK_FREQ_38_4_MHZ: + return f38_4_mhz; + case RPM_CONFIG0_CRYSTAL_CLOCK_FREQ_25_MHZ: + return f25_mhz; + default: + XE_WARN_ON("NOT_POSSIBLE"); + return 0; + } +} + +int xe_gt_clock_init(struct xe_gt *gt) +{ + u32 ctc_reg = xe_mmio_read32(gt, CTC_MODE); + u32 freq = 0; + + /* Assuming gen11+ so assert this assumption is correct */ + xe_gt_assert(gt, GRAPHICS_VER(gt_to_xe(gt)) >= 11); + + if (ctc_reg & CTC_SOURCE_DIVIDE_LOGIC) { + freq = read_reference_ts_freq(gt); + } else { + u32 c0 = xe_mmio_read32(gt, RPM_CONFIG0); + + freq = get_crystal_clock_freq(c0); + + /* + * Now figure out how the command stream's timestamp + * register increments from this frequency (it might + * increment only every few clock cycle). 
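[Editor's illustration, not part of the diff: ccs_mode and num_cslices are ordinary per-gt sysfs attributes, so they can be exercised from user space on a part with more than one compute slice. The path below is an assumption (card/tile/gt indices vary per system); per ccs_mode_store() above, the write fails with EBUSY while DRM clients are open.]

        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        /* Assumed location; card0/tile0/gt0 is not guaranteed on every system. */
        #define GT_DIR "/sys/class/drm/card0/device/tile0/gt0"

        int main(void)
        {
                char buf[16] = {};
                int fd;

                fd = open(GT_DIR "/num_cslices", O_RDONLY);
                if (fd < 0 || read(fd, buf, sizeof(buf) - 1) < 0)
                        return 1;
                close(fd);
                printf("compute slices available: %s", buf);

                /* Request two compute engines; triggers a GT reset on success. */
                fd = open(GT_DIR "/ccs_mode", O_WRONLY);
                if (fd < 0)
                        return 1;
                if (write(fd, "2", 1) != 1)
                        perror("ccs_mode");
                close(fd);
                return 0;
        }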
+ */ + freq >>= 3 - REG_FIELD_GET(RPM_CONFIG0_CTC_SHIFT_PARAMETER_MASK, c0); + } + + gt->info.reference_clock = freq; + return 0; +} + +u64 xe_gt_clock_cycles_to_ns(const struct xe_gt *gt, u64 count) +{ + return DIV_ROUND_CLOSEST_ULL(count * NSEC_PER_SEC, gt->info.reference_clock); +} diff --git a/drivers/gpu/drm/xe/xe_gt_clock.h b/drivers/gpu/drm/xe/xe_gt_clock.h new file mode 100644 index 000000000000..aa162722f859 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_clock.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GT_CLOCK_H_ +#define _XE_GT_CLOCK_H_ + +#include <linux/types.h> + +struct xe_gt; + +int xe_gt_clock_init(struct xe_gt *gt); +u64 xe_gt_clock_cycles_to_ns(const struct xe_gt *gt, u64 count); +#endif diff --git a/drivers/gpu/drm/xe/xe_gt_debugfs.c b/drivers/gpu/drm/xe/xe_gt_debugfs.c new file mode 100644 index 000000000000..c4b67cf09f8f --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_debugfs.c @@ -0,0 +1,249 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_gt_debugfs.h" + +#include <drm/drm_debugfs.h> +#include <drm/drm_managed.h> + +#include "xe_device.h" +#include "xe_force_wake.h" +#include "xe_ggtt.h" +#include "xe_gt.h" +#include "xe_gt_mcr.h" +#include "xe_gt_topology.h" +#include "xe_hw_engine.h" +#include "xe_lrc.h" +#include "xe_macros.h" +#include "xe_pat.h" +#include "xe_reg_sr.h" +#include "xe_reg_whitelist.h" +#include "xe_uc_debugfs.h" +#include "xe_wa.h" + +static struct xe_gt *node_to_gt(struct drm_info_node *node) +{ + return node->info_ent->data; +} + +static int hw_engines(struct seq_file *m, void *data) +{ + struct xe_gt *gt = node_to_gt(m->private); + struct xe_device *xe = gt_to_xe(gt); + struct drm_printer p = drm_seq_file_printer(m); + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + int err; + + xe_device_mem_access_get(xe); + err = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL); + if (err) { + xe_device_mem_access_put(xe); + return err; + } + + for_each_hw_engine(hwe, gt, id) + xe_hw_engine_print(hwe, &p); + + err = xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL); + xe_device_mem_access_put(xe); + if (err) + return err; + + return 0; +} + +static int force_reset(struct seq_file *m, void *data) +{ + struct xe_gt *gt = node_to_gt(m->private); + + xe_gt_reset_async(gt); + + return 0; +} + +static int sa_info(struct seq_file *m, void *data) +{ + struct xe_tile *tile = gt_to_tile(node_to_gt(m->private)); + struct drm_printer p = drm_seq_file_printer(m); + + drm_suballoc_dump_debug_info(&tile->mem.kernel_bb_pool->base, &p, + tile->mem.kernel_bb_pool->gpu_addr); + + return 0; +} + +static int topology(struct seq_file *m, void *data) +{ + struct xe_gt *gt = node_to_gt(m->private); + struct drm_printer p = drm_seq_file_printer(m); + + xe_gt_topology_dump(gt, &p); + + return 0; +} + +static int steering(struct seq_file *m, void *data) +{ + struct xe_gt *gt = node_to_gt(m->private); + struct drm_printer p = drm_seq_file_printer(m); + + xe_gt_mcr_steering_dump(gt, &p); + + return 0; +} + +static int ggtt(struct seq_file *m, void *data) +{ + struct xe_gt *gt = node_to_gt(m->private); + struct drm_printer p = drm_seq_file_printer(m); + + return xe_ggtt_dump(gt_to_tile(gt)->mem.ggtt, &p); +} + +static int register_save_restore(struct seq_file *m, void *data) +{ + struct xe_gt *gt = node_to_gt(m->private); + struct drm_printer p = drm_seq_file_printer(m); + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + + xe_reg_sr_dump(>->reg_sr, &p); 
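[Editor's illustration, not part of the diff: xe_gt_clock_cycles_to_ns() in xe_gt_clock.c above is a round-to-nearest division by the reference clock stored in Hz. A tiny standalone sketch of the same arithmetic, with invented names:]

        #include <stdint.h>
        #include <stdio.h>

        #define NSEC_PER_SEC 1000000000ULL

        /* Same rounding as DIV_ROUND_CLOSEST_ULL in xe_gt_clock_cycles_to_ns(). */
        static uint64_t cycles_to_ns(uint64_t cycles, uint32_t reference_clock_hz)
        {
                return (cycles * NSEC_PER_SEC + reference_clock_hz / 2) / reference_clock_hz;
        }

        int main(void)
        {
                /* With a 19.2 MHz reference clock one tick is ~52 ns */
                printf("%llu\n", (unsigned long long)cycles_to_ns(1, 19200000));
                /* 19.2 million ticks are exactly one second */
                printf("%llu\n", (unsigned long long)cycles_to_ns(19200000, 19200000));
                return 0;
        }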
+ drm_printf(&p, "\n"); + + drm_printf(&p, "Engine\n"); + for_each_hw_engine(hwe, gt, id) + xe_reg_sr_dump(&hwe->reg_sr, &p); + drm_printf(&p, "\n"); + + drm_printf(&p, "LRC\n"); + for_each_hw_engine(hwe, gt, id) + xe_reg_sr_dump(&hwe->reg_lrc, &p); + drm_printf(&p, "\n"); + + drm_printf(&p, "Whitelist\n"); + for_each_hw_engine(hwe, gt, id) + xe_reg_whitelist_dump(&hwe->reg_whitelist, &p); + + return 0; +} + +static int workarounds(struct seq_file *m, void *data) +{ + struct xe_gt *gt = node_to_gt(m->private); + struct drm_printer p = drm_seq_file_printer(m); + + xe_wa_dump(gt, &p); + + return 0; +} + +static int pat(struct seq_file *m, void *data) +{ + struct xe_gt *gt = node_to_gt(m->private); + struct drm_printer p = drm_seq_file_printer(m); + + xe_pat_dump(gt, &p); + + return 0; +} + +static int rcs_default_lrc(struct seq_file *m, void *data) +{ + struct drm_printer p = drm_seq_file_printer(m); + + xe_lrc_dump_default(&p, node_to_gt(m->private), XE_ENGINE_CLASS_RENDER); + return 0; +} + +static int ccs_default_lrc(struct seq_file *m, void *data) +{ + struct drm_printer p = drm_seq_file_printer(m); + + xe_lrc_dump_default(&p, node_to_gt(m->private), XE_ENGINE_CLASS_COMPUTE); + return 0; +} + +static int bcs_default_lrc(struct seq_file *m, void *data) +{ + struct drm_printer p = drm_seq_file_printer(m); + + xe_lrc_dump_default(&p, node_to_gt(m->private), XE_ENGINE_CLASS_COPY); + return 0; +} + +static int vcs_default_lrc(struct seq_file *m, void *data) +{ + struct drm_printer p = drm_seq_file_printer(m); + + xe_lrc_dump_default(&p, node_to_gt(m->private), XE_ENGINE_CLASS_VIDEO_DECODE); + return 0; +} + +static int vecs_default_lrc(struct seq_file *m, void *data) +{ + struct drm_printer p = drm_seq_file_printer(m); + + xe_lrc_dump_default(&p, node_to_gt(m->private), XE_ENGINE_CLASS_VIDEO_ENHANCE); + return 0; +} + +static const struct drm_info_list debugfs_list[] = { + {"hw_engines", hw_engines, 0}, + {"force_reset", force_reset, 0}, + {"sa_info", sa_info, 0}, + {"topology", topology, 0}, + {"steering", steering, 0}, + {"ggtt", ggtt, 0}, + {"register-save-restore", register_save_restore, 0}, + {"workarounds", workarounds, 0}, + {"pat", pat, 0}, + {"default_lrc_rcs", rcs_default_lrc}, + {"default_lrc_ccs", ccs_default_lrc}, + {"default_lrc_bcs", bcs_default_lrc}, + {"default_lrc_vcs", vcs_default_lrc}, + {"default_lrc_vecs", vecs_default_lrc}, +}; + +void xe_gt_debugfs_register(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + struct drm_minor *minor = gt_to_xe(gt)->drm.primary; + struct dentry *root; + struct drm_info_list *local; + char name[8]; + int i; + + xe_gt_assert(gt, minor->debugfs_root); + + sprintf(name, "gt%d", gt->info.id); + root = debugfs_create_dir(name, minor->debugfs_root); + if (IS_ERR(root)) { + drm_warn(&xe->drm, "Create GT directory failed"); + return; + } + + /* + * Allocate local copy as we need to pass in the GT to the debugfs + * entry and drm_debugfs_create_files just references the drm_info_list + * passed in (e.g. can't define this on the stack). 
+ */ +#define DEBUGFS_SIZE (ARRAY_SIZE(debugfs_list) * sizeof(struct drm_info_list)) + local = drmm_kmalloc(&xe->drm, DEBUGFS_SIZE, GFP_KERNEL); + if (!local) + return; + + memcpy(local, debugfs_list, DEBUGFS_SIZE); +#undef DEBUGFS_SIZE + + for (i = 0; i < ARRAY_SIZE(debugfs_list); ++i) + local[i].data = gt; + + drm_debugfs_create_files(local, + ARRAY_SIZE(debugfs_list), + root, minor); + + xe_uc_debugfs_register(>->uc, root); +} diff --git a/drivers/gpu/drm/xe/xe_gt_debugfs.h b/drivers/gpu/drm/xe/xe_gt_debugfs.h new file mode 100644 index 000000000000..5a329f118a57 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_debugfs.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GT_DEBUGFS_H_ +#define _XE_GT_DEBUGFS_H_ + +struct xe_gt; + +void xe_gt_debugfs_register(struct xe_gt *gt); + +#endif diff --git a/drivers/gpu/drm/xe/xe_gt_freq.c b/drivers/gpu/drm/xe/xe_gt_freq.c new file mode 100644 index 000000000000..3adfa6686e7c --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_freq.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include "xe_gt_freq.h" + +#include <linux/kobject.h> +#include <linux/sysfs.h> + +#include <drm/drm_managed.h> +#include <drm/drm_print.h> + +#include "xe_device_types.h" +#include "xe_gt_sysfs.h" +#include "xe_gt_throttle_sysfs.h" +#include "xe_guc_pc.h" + +/** + * DOC: Xe GT Frequency Management + * + * This component is responsible for the raw GT frequency management, including + * the sysfs API. + * + * Underneath, Xe enables GuC SLPC automated frequency management. GuC is then + * allowed to request PCODE any frequency between the Minimum and the Maximum + * selected by this component. Furthermore, it is important to highlight that + * PCODE is the ultimate decision maker of the actual running frequency, based + * on thermal and other running conditions. + * + * Xe's Freq provides a sysfs API for frequency management: + * + * device/tile#/gt#/freq0/<item>_freq *read-only* files: + * - act_freq: The actual resolved frequency decided by PCODE. + * - cur_freq: The current one requested by GuC PC to the PCODE. + * - rpn_freq: The Render Performance (RP) N level, which is the minimal one. + * - rpe_freq: The Render Performance (RP) E level, which is the efficient one. + * - rp0_freq: The Render Performance (RP) 0 level, which is the maximum one. + * + * device/tile#/gt#/freq0/<item>_freq *read-write* files: + * - min_freq: Min frequency request. + * - max_freq: Max frequency request. + * If max <= min, then freq_min becomes a fixed frequency request. 
+ */ + +static struct xe_guc_pc * +dev_to_pc(struct device *dev) +{ + return &kobj_to_gt(dev->kobj.parent)->uc.guc.pc; +} + +static ssize_t act_freq_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct xe_guc_pc *pc = dev_to_pc(dev); + + return sysfs_emit(buf, "%d\n", xe_guc_pc_get_act_freq(pc)); +} +static DEVICE_ATTR_RO(act_freq); + +static ssize_t cur_freq_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct xe_guc_pc *pc = dev_to_pc(dev); + u32 freq; + ssize_t ret; + + ret = xe_guc_pc_get_cur_freq(pc, &freq); + if (ret) + return ret; + + return sysfs_emit(buf, "%d\n", freq); +} +static DEVICE_ATTR_RO(cur_freq); + +static ssize_t rp0_freq_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct xe_guc_pc *pc = dev_to_pc(dev); + + return sysfs_emit(buf, "%d\n", xe_guc_pc_get_rp0_freq(pc)); +} +static DEVICE_ATTR_RO(rp0_freq); + +static ssize_t rpe_freq_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct xe_guc_pc *pc = dev_to_pc(dev); + + return sysfs_emit(buf, "%d\n", xe_guc_pc_get_rpe_freq(pc)); +} +static DEVICE_ATTR_RO(rpe_freq); + +static ssize_t rpn_freq_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct xe_guc_pc *pc = dev_to_pc(dev); + + return sysfs_emit(buf, "%d\n", xe_guc_pc_get_rpn_freq(pc)); +} +static DEVICE_ATTR_RO(rpn_freq); + +static ssize_t min_freq_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct xe_guc_pc *pc = dev_to_pc(dev); + u32 freq; + ssize_t ret; + + ret = xe_guc_pc_get_min_freq(pc, &freq); + if (ret) + return ret; + + return sysfs_emit(buf, "%d\n", freq); +} + +static ssize_t min_freq_store(struct device *dev, struct device_attribute *attr, + const char *buff, size_t count) +{ + struct xe_guc_pc *pc = dev_to_pc(dev); + u32 freq; + ssize_t ret; + + ret = kstrtou32(buff, 0, &freq); + if (ret) + return ret; + + ret = xe_guc_pc_set_min_freq(pc, freq); + if (ret) + return ret; + + return count; +} +static DEVICE_ATTR_RW(min_freq); + +static ssize_t max_freq_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct xe_guc_pc *pc = dev_to_pc(dev); + u32 freq; + ssize_t ret; + + ret = xe_guc_pc_get_max_freq(pc, &freq); + if (ret) + return ret; + + return sysfs_emit(buf, "%d\n", freq); +} + +static ssize_t max_freq_store(struct device *dev, struct device_attribute *attr, + const char *buff, size_t count) +{ + struct xe_guc_pc *pc = dev_to_pc(dev); + u32 freq; + ssize_t ret; + + ret = kstrtou32(buff, 0, &freq); + if (ret) + return ret; + + ret = xe_guc_pc_set_max_freq(pc, freq); + if (ret) + return ret; + + return count; +} +static DEVICE_ATTR_RW(max_freq); + +static const struct attribute *freq_attrs[] = { + &dev_attr_act_freq.attr, + &dev_attr_cur_freq.attr, + &dev_attr_rp0_freq.attr, + &dev_attr_rpe_freq.attr, + &dev_attr_rpn_freq.attr, + &dev_attr_min_freq.attr, + &dev_attr_max_freq.attr, + NULL +}; + +static void freq_fini(struct drm_device *drm, void *arg) +{ + struct kobject *kobj = arg; + + sysfs_remove_files(kobj, freq_attrs); + kobject_put(kobj); +} + +/** + * xe_gt_freq_init - Initialize Xe Freq component + * @gt: Xe GT object + * + * It needs to be initialized after GT Sysfs and GuC PC components are ready. 
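[Editor's illustration, not part of the diff: the frequency files listed in the DOC block above are plain sysfs attributes and can be read or written from user space. The directory below is an assumption (card/tile/gt indices differ per system); values are whatever GuC PC reports, no unit conversion is attempted here.]

        #include <stdio.h>

        /* Assumed path; card0/tile0/gt0 is system dependent. */
        #define FREQ_DIR "/sys/class/drm/card0/device/tile0/gt0/freq0/"

        static void show(const char *name)
        {
                char path[256], buf[32];
                FILE *f;

                snprintf(path, sizeof(path), FREQ_DIR "%s", name);
                f = fopen(path, "r");
                if (f && fgets(buf, sizeof(buf), f))
                        printf("%-10s %s", name, buf);
                if (f)
                        fclose(f);
        }

        int main(void)
        {
                const char *files[] = { "act_freq", "cur_freq", "rpn_freq",
                                        "rpe_freq", "rp0_freq", "min_freq", "max_freq" };

                for (unsigned int i = 0; i < sizeof(files) / sizeof(files[0]); i++)
                        show(files[i]);

                /* Per the DOC above, writing max_freq <= min_freq turns min_freq
                 * into a fixed frequency request. */
                return 0;
        }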
+ */ +void xe_gt_freq_init(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + int err; + + gt->freq = kobject_create_and_add("freq0", gt->sysfs); + if (!gt->freq) { + drm_warn(&xe->drm, "failed to add freq0 directory to %s\n", + kobject_name(gt->sysfs)); + return; + } + + err = drmm_add_action_or_reset(&xe->drm, freq_fini, gt->freq); + if (err) { + drm_warn(&xe->drm, "%s: drmm_add_action_or_reset failed, err: %d\n", + __func__, err); + return; + } + + err = sysfs_create_files(gt->freq, freq_attrs); + if (err) + drm_warn(&xe->drm, "failed to add freq attrs to %s, err: %d\n", + kobject_name(gt->freq), err); + + xe_gt_throttle_sysfs_init(gt); +} diff --git a/drivers/gpu/drm/xe/xe_gt_freq.h b/drivers/gpu/drm/xe/xe_gt_freq.h new file mode 100644 index 000000000000..f3fe3c90491a --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_freq.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GT_FREQ_H_ +#define _XE_GT_FREQ_H_ + +struct xe_gt; + +void xe_gt_freq_init(struct xe_gt *gt); + +#endif diff --git a/drivers/gpu/drm/xe/xe_gt_idle.c b/drivers/gpu/drm/xe/xe_gt_idle.c new file mode 100644 index 000000000000..9358f7336889 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_idle.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include <drm/drm_managed.h> + +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_gt_idle.h" +#include "xe_gt_sysfs.h" +#include "xe_guc_pc.h" +#include "regs/xe_gt_regs.h" +#include "xe_mmio.h" + +/** + * DOC: Xe GT Idle + * + * Contains functions that init GT idle features like C6 + * + * device/gt#/gtidle/name - name of the state + * device/gt#/gtidle/idle_residency_ms - Provides residency of the idle state in ms + * device/gt#/gtidle/idle_status - Provides current idle state + */ + +static struct xe_gt_idle *dev_to_gtidle(struct device *dev) +{ + struct kobject *kobj = &dev->kobj; + + return &kobj_to_gt(kobj->parent)->gtidle; +} + +static struct xe_gt *gtidle_to_gt(struct xe_gt_idle *gtidle) +{ + return container_of(gtidle, struct xe_gt, gtidle); +} + +static struct xe_guc_pc *gtidle_to_pc(struct xe_gt_idle *gtidle) +{ + return >idle_to_gt(gtidle)->uc.guc.pc; +} + +static const char *gt_idle_state_to_string(enum xe_gt_idle_state state) +{ + switch (state) { + case GT_IDLE_C0: + return "gt-c0"; + case GT_IDLE_C6: + return "gt-c6"; + default: + return "unknown"; + } +} + +static u64 get_residency_ms(struct xe_gt_idle *gtidle, u64 cur_residency) +{ + u64 delta, overflow_residency, prev_residency; + + overflow_residency = BIT_ULL(32); + + /* + * Counter wrap handling + * Store previous hw counter values for counter wrap-around handling + * Relying on sufficient frequency of queries otherwise counters can still wrap. 
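[Editor's illustration, not part of the diff: the wrap handling described in the comment above, and implemented in get_residency_ms() continued just below, reduces to a small amount of arithmetic. A standalone sketch with invented names follows; the driver uses mul_u64_u32_div() for the final scaling to avoid 64-bit overflow, which this sketch skips.]

        #include <stdint.h>
        #include <stdio.h>

        /* Extend a wrapping 32-bit residency counter into a 64-bit running total. */
        static uint64_t extend(uint64_t *prev, uint64_t *total, uint32_t hw_now)
        {
                uint64_t delta;

                if (hw_now >= *prev)
                        delta = hw_now - *prev;
                else    /* counter wrapped since the last read */
                        delta = hw_now + ((1ULL << 32) - *prev);

                *prev = hw_now;
                *total += delta;
                return *total;
        }

        int main(void)
        {
                uint64_t prev = 0xFFFFFF00, total = 0;

                /* Reading 0x100 after 0xFFFFFF00 is a wrap: 0x200 ticks elapsed. */
                uint64_t ticks = extend(&prev, &total, 0x100);

                /* 1.28 us per tick (multiplier of 1280 ns), reported in ms. */
                printf("%llu ticks, %llu ms\n", (unsigned long long)ticks,
                       (unsigned long long)(ticks * 1280 / 1000000));
                return 0;
        }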
+ */ + prev_residency = gtidle->prev_residency; + gtidle->prev_residency = cur_residency; + + /* delta */ + if (cur_residency >= prev_residency) + delta = cur_residency - prev_residency; + else + delta = cur_residency + (overflow_residency - prev_residency); + + /* Add delta to extended raw driver copy of idle residency */ + cur_residency = gtidle->cur_residency + delta; + gtidle->cur_residency = cur_residency; + + /* residency multiplier in ns, convert to ms */ + cur_residency = mul_u64_u32_div(cur_residency, gtidle->residency_multiplier, 1e6); + + return cur_residency; +} + +static ssize_t name_show(struct device *dev, + struct device_attribute *attr, char *buff) +{ + struct xe_gt_idle *gtidle = dev_to_gtidle(dev); + + return sysfs_emit(buff, "%s\n", gtidle->name); +} +static DEVICE_ATTR_RO(name); + +static ssize_t idle_status_show(struct device *dev, + struct device_attribute *attr, char *buff) +{ + struct xe_gt_idle *gtidle = dev_to_gtidle(dev); + struct xe_guc_pc *pc = gtidle_to_pc(gtidle); + enum xe_gt_idle_state state; + + state = gtidle->idle_status(pc); + + return sysfs_emit(buff, "%s\n", gt_idle_state_to_string(state)); +} +static DEVICE_ATTR_RO(idle_status); + +static ssize_t idle_residency_ms_show(struct device *dev, + struct device_attribute *attr, char *buff) +{ + struct xe_gt_idle *gtidle = dev_to_gtidle(dev); + struct xe_guc_pc *pc = gtidle_to_pc(gtidle); + u64 residency; + + residency = gtidle->idle_residency(pc); + return sysfs_emit(buff, "%llu\n", get_residency_ms(gtidle, residency)); +} +static DEVICE_ATTR_RO(idle_residency_ms); + +static const struct attribute *gt_idle_attrs[] = { + &dev_attr_name.attr, + &dev_attr_idle_status.attr, + &dev_attr_idle_residency_ms.attr, + NULL, +}; + +static void gt_idle_sysfs_fini(struct drm_device *drm, void *arg) +{ + struct kobject *kobj = arg; + + sysfs_remove_files(kobj, gt_idle_attrs); + kobject_put(kobj); +} + +void xe_gt_idle_sysfs_init(struct xe_gt_idle *gtidle) +{ + struct xe_gt *gt = gtidle_to_gt(gtidle); + struct xe_device *xe = gt_to_xe(gt); + struct kobject *kobj; + int err; + + kobj = kobject_create_and_add("gtidle", gt->sysfs); + if (!kobj) { + drm_warn(&xe->drm, "%s failed, err: %d\n", __func__, -ENOMEM); + return; + } + + if (xe_gt_is_media_type(gt)) { + sprintf(gtidle->name, "gt%d-mc\n", gt->info.id); + gtidle->idle_residency = xe_guc_pc_mc6_residency; + } else { + sprintf(gtidle->name, "gt%d-rc\n", gt->info.id); + gtidle->idle_residency = xe_guc_pc_rc6_residency; + } + + /* Multiplier for Residency counter in units of 1.28us */ + gtidle->residency_multiplier = 1280; + gtidle->idle_status = xe_guc_pc_c_status; + + err = sysfs_create_files(kobj, gt_idle_attrs); + if (err) { + kobject_put(kobj); + drm_warn(&xe->drm, "failed to register gtidle sysfs, err: %d\n", err); + return; + } + + err = drmm_add_action_or_reset(&xe->drm, gt_idle_sysfs_fini, kobj); + if (err) + drm_warn(&xe->drm, "%s: drmm_add_action_or_reset failed, err: %d\n", + __func__, err); +} + +void xe_gt_idle_enable_c6(struct xe_gt *gt) +{ + xe_device_assert_mem_access(gt_to_xe(gt)); + xe_force_wake_assert_held(gt_to_fw(gt), XE_FW_GT); + + /* Units of 1280 ns for a total of 5s */ + xe_mmio_write32(gt, RC_IDLE_HYSTERSIS, 0x3B9ACA); + /* Enable RC6 */ + xe_mmio_write32(gt, RC_CONTROL, + RC_CTL_HW_ENABLE | RC_CTL_TO_MODE | RC_CTL_RC6_ENABLE); +} + +void xe_gt_idle_disable_c6(struct xe_gt *gt) +{ + xe_device_assert_mem_access(gt_to_xe(gt)); + xe_force_wake_assert_held(gt_to_fw(gt), XE_FORCEWAKE_ALL); + + xe_mmio_write32(gt, PG_ENABLE, 0); + xe_mmio_write32(gt, 
RC_CONTROL, 0); + xe_mmio_write32(gt, RC_STATE, 0); +} diff --git a/drivers/gpu/drm/xe/xe_gt_idle.h b/drivers/gpu/drm/xe/xe_gt_idle.h new file mode 100644 index 000000000000..69280fd16b03 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_idle.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GT_IDLE_H_ +#define _XE_GT_IDLE_H_ + +#include "xe_gt_idle_types.h" + +struct xe_gt; + +void xe_gt_idle_sysfs_init(struct xe_gt_idle *gtidle); +void xe_gt_idle_enable_c6(struct xe_gt *gt); +void xe_gt_idle_disable_c6(struct xe_gt *gt); + +#endif /* _XE_GT_IDLE_H_ */ diff --git a/drivers/gpu/drm/xe/xe_gt_idle_types.h b/drivers/gpu/drm/xe/xe_gt_idle_types.h new file mode 100644 index 000000000000..f99b447534f3 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_idle_types.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GT_IDLE_SYSFS_TYPES_H_ +#define _XE_GT_IDLE_SYSFS_TYPES_H_ + +#include <linux/types.h> + +struct xe_guc_pc; + +/* States of GT Idle */ +enum xe_gt_idle_state { + GT_IDLE_C0, + GT_IDLE_C6, + GT_IDLE_UNKNOWN, +}; + +/** + * struct xe_gt_idle - A struct that contains idle properties based of gt + */ +struct xe_gt_idle { + /** @name: name */ + char name[16]; + /** @residency_multiplier: residency multiplier in ns */ + u32 residency_multiplier; + /** @cur_residency: raw driver copy of idle residency */ + u64 cur_residency; + /** @prev_residency: previous residency counter */ + u64 prev_residency; + /** @idle_status: get the current idle state */ + enum xe_gt_idle_state (*idle_status)(struct xe_guc_pc *pc); + /** @idle_residency: get idle residency counter */ + u64 (*idle_residency)(struct xe_guc_pc *pc); +}; + +#endif /* _XE_GT_IDLE_SYSFS_TYPES_H_ */ diff --git a/drivers/gpu/drm/xe/xe_gt_mcr.c b/drivers/gpu/drm/xe/xe_gt_mcr.c new file mode 100644 index 000000000000..77925b35cf8d --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_mcr.c @@ -0,0 +1,685 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_gt_mcr.h" + +#include "regs/xe_gt_regs.h" +#include "xe_gt.h" +#include "xe_gt_topology.h" +#include "xe_gt_types.h" +#include "xe_mmio.h" + +/** + * DOC: GT Multicast/Replicated (MCR) Register Support + * + * Some GT registers are designed as "multicast" or "replicated" registers: + * multiple instances of the same register share a single MMIO offset. MCR + * registers are generally used when the hardware needs to potentially track + * independent values of a register per hardware unit (e.g., per-subslice, + * per-L3bank, etc.). The specific types of replication that exist vary + * per-platform. + * + * MMIO accesses to MCR registers are controlled according to the settings + * programmed in the platform's MCR_SELECTOR register(s). MMIO writes to MCR + * registers can be done in either multicast (a single write updates all + * instances of the register to the same value) or unicast (a write updates only + * one specific instance) form. Reads of MCR registers always operate in a + * unicast manner regardless of how the multicast/unicast bit is set in + * MCR_SELECTOR. Selection of a specific MCR instance for unicast operations is + * referred to as "steering." + * + * If MCR register operations are steered toward a hardware unit that is + * fused off or currently powered down due to power gating, the MMIO operation + * is "terminated" by the hardware. 
Terminated read operations will return a + * value of zero and terminated unicast write operations will be silently + * ignored. During device initialization, the goal of the various + * ``init_steering_*()`` functions is to apply the platform-specific rules for + * each MCR register type to identify a steering target that will select a + * non-terminated instance. + */ + +#define STEER_SEMAPHORE XE_REG(0xFD0) + +static inline struct xe_reg to_xe_reg(struct xe_reg_mcr reg_mcr) +{ + return reg_mcr.__reg; +} + +enum { + MCR_OP_READ, + MCR_OP_WRITE +}; + +static const struct xe_mmio_range xelp_l3bank_steering_table[] = { + { 0x00B100, 0x00B3FF }, + {}, +}; + +static const struct xe_mmio_range xehp_l3bank_steering_table[] = { + { 0x008C80, 0x008CFF }, + { 0x00B100, 0x00B3FF }, + {}, +}; + +/* + * Although the bspec lists more "MSLICE" ranges than shown here, some of those + * are of a "GAM" subclass that has special rules and doesn't need to be + * included here. + */ +static const struct xe_mmio_range xehp_mslice_steering_table[] = { + { 0x00DD00, 0x00DDFF }, + { 0x00E900, 0x00FFFF }, /* 0xEA00 - OxEFFF is unused */ + {}, +}; + +static const struct xe_mmio_range xehp_lncf_steering_table[] = { + { 0x00B000, 0x00B0FF }, + { 0x00D880, 0x00D8FF }, + {}, +}; + +/* + * We have several types of MCR registers where steering to (0,0) will always + * provide us with a non-terminated value. We'll stick them all in the same + * table for simplicity. + */ +static const struct xe_mmio_range xehpc_instance0_steering_table[] = { + { 0x004000, 0x004AFF }, /* HALF-BSLICE */ + { 0x008800, 0x00887F }, /* CC */ + { 0x008A80, 0x008AFF }, /* TILEPSMI */ + { 0x00B000, 0x00B0FF }, /* HALF-BSLICE */ + { 0x00B100, 0x00B3FF }, /* L3BANK */ + { 0x00C800, 0x00CFFF }, /* HALF-BSLICE */ + { 0x00D800, 0x00D8FF }, /* HALF-BSLICE */ + { 0x00DD00, 0x00DDFF }, /* BSLICE */ + { 0x00E900, 0x00E9FF }, /* HALF-BSLICE */ + { 0x00EC00, 0x00EEFF }, /* HALF-BSLICE */ + { 0x00F000, 0x00FFFF }, /* HALF-BSLICE */ + { 0x024180, 0x0241FF }, /* HALF-BSLICE */ + {}, +}; + +static const struct xe_mmio_range xelpg_instance0_steering_table[] = { + { 0x000B00, 0x000BFF }, /* SQIDI */ + { 0x001000, 0x001FFF }, /* SQIDI */ + { 0x004000, 0x0048FF }, /* GAM */ + { 0x008700, 0x0087FF }, /* SQIDI */ + { 0x00B000, 0x00B0FF }, /* NODE */ + { 0x00C800, 0x00CFFF }, /* GAM */ + { 0x00D880, 0x00D8FF }, /* NODE */ + { 0x00DD00, 0x00DDFF }, /* OAAL2 */ + {}, +}; + +static const struct xe_mmio_range xelpg_l3bank_steering_table[] = { + { 0x00B100, 0x00B3FF }, + {}, +}; + +static const struct xe_mmio_range xelp_dss_steering_table[] = { + { 0x008150, 0x00815F }, + { 0x009520, 0x00955F }, + { 0x00DE80, 0x00E8FF }, + { 0x024A00, 0x024A7F }, + {}, +}; + +/* DSS steering is used for GSLICE ranges as well */ +static const struct xe_mmio_range xehp_dss_steering_table[] = { + { 0x005200, 0x0052FF }, /* GSLICE */ + { 0x005400, 0x007FFF }, /* GSLICE */ + { 0x008140, 0x00815F }, /* GSLICE (0x8140-0x814F), DSS (0x8150-0x815F) */ + { 0x008D00, 0x008DFF }, /* DSS */ + { 0x0094D0, 0x00955F }, /* GSLICE (0x94D0-0x951F), DSS (0x9520-0x955F) */ + { 0x009680, 0x0096FF }, /* DSS */ + { 0x00D800, 0x00D87F }, /* GSLICE */ + { 0x00DC00, 0x00DCFF }, /* GSLICE */ + { 0x00DE80, 0x00E8FF }, /* DSS (0xE000-0xE0FF reserved ) */ + { 0x017000, 0x017FFF }, /* GSLICE */ + { 0x024A00, 0x024A7F }, /* DSS */ + {}, +}; + +/* DSS steering is used for COMPUTE ranges as well */ +static const struct xe_mmio_range xehpc_dss_steering_table[] = { + { 0x008140, 0x00817F }, /* COMPUTE (0x8140-0x814F & 
0x8160-0x817F), DSS (0x8150-0x815F) */ + { 0x0094D0, 0x00955F }, /* COMPUTE (0x94D0-0x951F), DSS (0x9520-0x955F) */ + { 0x009680, 0x0096FF }, /* DSS */ + { 0x00DC00, 0x00DCFF }, /* COMPUTE */ + { 0x00DE80, 0x00E7FF }, /* DSS (0xDF00-0xE1FF reserved ) */ + {}, +}; + +/* DSS steering is used for SLICE ranges as well */ +static const struct xe_mmio_range xelpg_dss_steering_table[] = { + { 0x005200, 0x0052FF }, /* SLICE */ + { 0x005500, 0x007FFF }, /* SLICE */ + { 0x008140, 0x00815F }, /* SLICE (0x8140-0x814F), DSS (0x8150-0x815F) */ + { 0x0094D0, 0x00955F }, /* SLICE (0x94D0-0x951F), DSS (0x9520-0x955F) */ + { 0x009680, 0x0096FF }, /* DSS */ + { 0x00D800, 0x00D87F }, /* SLICE */ + { 0x00DC00, 0x00DCFF }, /* SLICE */ + { 0x00DE80, 0x00E8FF }, /* DSS (0xE000-0xE0FF reserved) */ + {}, +}; + +static const struct xe_mmio_range xelpmp_oaddrm_steering_table[] = { + { 0x393200, 0x39323F }, + { 0x393400, 0x3934FF }, + {}, +}; + +static const struct xe_mmio_range dg2_implicit_steering_table[] = { + { 0x000B00, 0x000BFF }, /* SF (SQIDI replication) */ + { 0x001000, 0x001FFF }, /* SF (SQIDI replication) */ + { 0x004000, 0x004AFF }, /* GAM (MSLICE replication) */ + { 0x008700, 0x0087FF }, /* MCFG (SQIDI replication) */ + { 0x00C800, 0x00CFFF }, /* GAM (MSLICE replication) */ + { 0x00F000, 0x00FFFF }, /* GAM (MSLICE replication) */ + {}, +}; + +static const struct xe_mmio_range xe2lpg_dss_steering_table[] = { + { 0x005200, 0x0052FF }, /* SLICE */ + { 0x005500, 0x007FFF }, /* SLICE */ + { 0x008140, 0x00815F }, /* SLICE (0x8140-0x814F), DSS (0x8150-0x815F) */ + { 0x0094D0, 0x00955F }, /* SLICE (0x94D0-0x951F), DSS (0x9520-0x955F) */ + { 0x009680, 0x0096FF }, /* DSS */ + { 0x00D800, 0x00D87F }, /* SLICE */ + { 0x00DC00, 0x00DCFF }, /* SLICE */ + { 0x00DE80, 0x00E8FF }, /* DSS (0xE000-0xE0FF reserved) */ + { 0x00E980, 0x00E9FF }, /* SLICE */ + { 0x013000, 0x0133FF }, /* DSS (0x13000-0x131FF), SLICE (0x13200-0x133FF) */ + {}, +}; + +static const struct xe_mmio_range xe2lpg_sqidi_psmi_steering_table[] = { + { 0x000B00, 0x000BFF }, + { 0x001000, 0x001FFF }, + {}, +}; + +static const struct xe_mmio_range xe2lpg_instance0_steering_table[] = { + { 0x004000, 0x004AFF }, /* GAM, rsvd, GAMWKR */ + { 0x008700, 0x00887F }, /* SQIDI, MEMPIPE */ + { 0x00B000, 0x00B3FF }, /* NODE, L3BANK */ + { 0x00C800, 0x00CFFF }, /* GAM */ + { 0x00D880, 0x00D8FF }, /* NODE */ + { 0x00DD00, 0x00DDFF }, /* MEMPIPE */ + { 0x00E900, 0x00E97F }, /* MEMPIPE */ + { 0x00F000, 0x00FFFF }, /* GAM, GAMWKR */ + { 0x013400, 0x0135FF }, /* MEMPIPE */ + {}, +}; + +static const struct xe_mmio_range xe2lpm_gpmxmt_steering_table[] = { + { 0x388160, 0x38817F }, + { 0x389480, 0x3894CF }, + {}, +}; + +static const struct xe_mmio_range xe2lpm_instance0_steering_table[] = { + { 0x384000, 0x3847DF }, /* GAM, rsvd, GAM */ + { 0x384900, 0x384AFF }, /* GAM */ + { 0x389560, 0x3895FF }, /* MEDIAINF */ + { 0x38B600, 0x38B8FF }, /* L3BANK */ + { 0x38C800, 0x38D07F }, /* GAM, MEDIAINF */ + { 0x38F000, 0x38F0FF }, /* GAM */ + { 0x393C00, 0x393C7F }, /* MEDIAINF */ + {}, +}; + +static void init_steering_l3bank(struct xe_gt *gt) +{ + if (GRAPHICS_VERx100(gt_to_xe(gt)) >= 1270) { + u32 mslice_mask = REG_FIELD_GET(MEML3_EN_MASK, + xe_mmio_read32(gt, MIRROR_FUSE3)); + u32 bank_mask = REG_FIELD_GET(GT_L3_EXC_MASK, + xe_mmio_read32(gt, XEHP_FUSE4)); + + /* + * Group selects mslice, instance selects bank within mslice. + * Bank 0 is always valid _except_ when the bank mask is 010b. 
+ */ + gt->steering[L3BANK].group_target = __ffs(mslice_mask); + gt->steering[L3BANK].instance_target = + bank_mask & BIT(0) ? 0 : 2; + } else if (gt_to_xe(gt)->info.platform == XE_DG2) { + u32 mslice_mask = REG_FIELD_GET(MEML3_EN_MASK, + xe_mmio_read32(gt, MIRROR_FUSE3)); + u32 bank = __ffs(mslice_mask) * 8; + + /* + * Like mslice registers, look for a valid mslice and steer to + * the first L3BANK of that quad. Access to the Nth L3 bank is + * split between the first bits of group and instance + */ + gt->steering[L3BANK].group_target = (bank >> 2) & 0x7; + gt->steering[L3BANK].instance_target = bank & 0x3; + } else { + u32 fuse = REG_FIELD_GET(L3BANK_MASK, + ~xe_mmio_read32(gt, MIRROR_FUSE3)); + + gt->steering[L3BANK].group_target = 0; /* unused */ + gt->steering[L3BANK].instance_target = __ffs(fuse); + } +} + +static void init_steering_mslice(struct xe_gt *gt) +{ + u32 mask = REG_FIELD_GET(MEML3_EN_MASK, + xe_mmio_read32(gt, MIRROR_FUSE3)); + + /* + * mslice registers are valid (not terminated) if either the meml3 + * associated with the mslice is present, or at least one DSS associated + * with the mslice is present. There will always be at least one meml3 + * so we can just use that to find a non-terminated mslice and ignore + * the DSS fusing. + */ + gt->steering[MSLICE].group_target = __ffs(mask); + gt->steering[MSLICE].instance_target = 0; /* unused */ + + /* + * LNCF termination is also based on mslice presence, so we'll set + * it up here. Either LNCF within a non-terminated mslice will work, + * so we just always pick LNCF 0 here. + */ + gt->steering[LNCF].group_target = __ffs(mask) << 1; + gt->steering[LNCF].instance_target = 0; /* unused */ +} + +static void init_steering_dss(struct xe_gt *gt) +{ + unsigned int dss = min(xe_dss_mask_group_ffs(gt->fuse_topo.g_dss_mask, 0, 0), + xe_dss_mask_group_ffs(gt->fuse_topo.c_dss_mask, 0, 0)); + unsigned int dss_per_grp = gt_to_xe(gt)->info.platform == XE_PVC ? 8 : 4; + + gt->steering[DSS].group_target = dss / dss_per_grp; + gt->steering[DSS].instance_target = dss % dss_per_grp; +} + +static void init_steering_oaddrm(struct xe_gt *gt) +{ + /* + * First instance is only terminated if the entire first media slice + * is absent (i.e., no VCS0 or VECS0). 
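[Editor's illustration, not part of the diff: init_steering_dss() above turns a flat DSS index into the (group, instance) steering pair, with 8 DSS per group on PVC and 4 elsewhere. A trivial sketch of that split, with invented names:]

        #include <stdio.h>

        struct steer { int group, instance; };

        /* First present DSS index -> steering target, as in init_steering_dss(). */
        static struct steer dss_steer(unsigned int first_dss, unsigned int dss_per_grp)
        {
                struct steer s = {
                        .group = first_dss / dss_per_grp,
                        .instance = first_dss % dss_per_grp,
                };
                return s;
        }

        int main(void)
        {
                struct steer s = dss_steer(9, 4); /* e.g. DSS 0-8 fused off except 9 */

                printf("group %d instance %d\n", s.group, s.instance); /* 2, 1 */
                return 0;
        }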
+ */ + if (gt->info.engine_mask & (XE_HW_ENGINE_VCS0 | XE_HW_ENGINE_VECS0)) + gt->steering[OADDRM].group_target = 0; + else + gt->steering[OADDRM].group_target = 1; + + gt->steering[DSS].instance_target = 0; /* unused */ +} + +static void init_steering_sqidi_psmi(struct xe_gt *gt) +{ + u32 mask = REG_FIELD_GET(XE2_NODE_ENABLE_MASK, + xe_mmio_read32(gt, MIRROR_FUSE3)); + u32 select = __ffs(mask); + + gt->steering[SQIDI_PSMI].group_target = select >> 1; + gt->steering[SQIDI_PSMI].instance_target = select & 0x1; +} + +static void init_steering_inst0(struct xe_gt *gt) +{ + gt->steering[DSS].group_target = 0; /* unused */ + gt->steering[DSS].instance_target = 0; /* unused */ +} + +static const struct { + const char *name; + void (*init)(struct xe_gt *gt); +} xe_steering_types[] = { + [L3BANK] = { "L3BANK", init_steering_l3bank }, + [MSLICE] = { "MSLICE", init_steering_mslice }, + [LNCF] = { "LNCF", NULL }, /* initialized by mslice init */ + [DSS] = { "DSS", init_steering_dss }, + [OADDRM] = { "OADDRM / GPMXMT", init_steering_oaddrm }, + [SQIDI_PSMI] = { "SQIDI_PSMI", init_steering_sqidi_psmi }, + [INSTANCE0] = { "INSTANCE 0", init_steering_inst0 }, + [IMPLICIT_STEERING] = { "IMPLICIT", NULL }, +}; + +void xe_gt_mcr_init(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + + BUILD_BUG_ON(IMPLICIT_STEERING + 1 != NUM_STEERING_TYPES); + BUILD_BUG_ON(ARRAY_SIZE(xe_steering_types) != NUM_STEERING_TYPES); + + spin_lock_init(>->mcr_lock); + + if (gt->info.type == XE_GT_TYPE_MEDIA) { + drm_WARN_ON(&xe->drm, MEDIA_VER(xe) < 13); + + if (MEDIA_VER(xe) >= 20) { + gt->steering[OADDRM].ranges = xe2lpm_gpmxmt_steering_table; + gt->steering[INSTANCE0].ranges = xe2lpm_instance0_steering_table; + } else { + gt->steering[OADDRM].ranges = xelpmp_oaddrm_steering_table; + } + } else { + if (GRAPHICS_VER(xe) >= 20) { + gt->steering[DSS].ranges = xe2lpg_dss_steering_table; + gt->steering[SQIDI_PSMI].ranges = xe2lpg_sqidi_psmi_steering_table; + gt->steering[INSTANCE0].ranges = xe2lpg_instance0_steering_table; + } else if (GRAPHICS_VERx100(xe) >= 1270) { + gt->steering[INSTANCE0].ranges = xelpg_instance0_steering_table; + gt->steering[L3BANK].ranges = xelpg_l3bank_steering_table; + gt->steering[DSS].ranges = xelpg_dss_steering_table; + } else if (xe->info.platform == XE_PVC) { + gt->steering[INSTANCE0].ranges = xehpc_instance0_steering_table; + gt->steering[DSS].ranges = xehpc_dss_steering_table; + } else if (xe->info.platform == XE_DG2) { + gt->steering[L3BANK].ranges = xehp_l3bank_steering_table; + gt->steering[MSLICE].ranges = xehp_mslice_steering_table; + gt->steering[LNCF].ranges = xehp_lncf_steering_table; + gt->steering[DSS].ranges = xehp_dss_steering_table; + gt->steering[IMPLICIT_STEERING].ranges = dg2_implicit_steering_table; + } else { + gt->steering[L3BANK].ranges = xelp_l3bank_steering_table; + gt->steering[DSS].ranges = xelp_dss_steering_table; + } + } + + /* Select non-terminated steering target for each type */ + for (int i = 0; i < NUM_STEERING_TYPES; i++) + if (gt->steering[i].ranges && xe_steering_types[i].init) + xe_steering_types[i].init(gt); +} + +/** + * xe_gt_mcr_set_implicit_defaults - Initialize steer control registers + * @gt: GT structure + * + * Some register ranges don't need to have their steering control registers + * changed on each access - it's sufficient to set them once on initialization. 
+ * This function sets those registers for each platform * + */ +void xe_gt_mcr_set_implicit_defaults(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + + if (xe->info.platform == XE_DG2) { + u32 steer_val = REG_FIELD_PREP(MCR_SLICE_MASK, 0) | + REG_FIELD_PREP(MCR_SUBSLICE_MASK, 2); + + xe_mmio_write32(gt, MCFG_MCR_SELECTOR, steer_val); + xe_mmio_write32(gt, SF_MCR_SELECTOR, steer_val); + /* + * For GAM registers, all reads should be directed to instance 1 + * (unicast reads against other instances are not allowed), + * and instance 1 is already the hardware's default steering + * target, which we never change + */ + } +} + +/* + * xe_gt_mcr_get_nonterminated_steering - find group/instance values that + * will steer a register to a non-terminated instance + * @gt: GT structure + * @reg: register for which the steering is required + * @group: return variable for group steering + * @instance: return variable for instance steering + * + * This function returns a group/instance pair that is guaranteed to work for + * read steering of the given register. Note that a value will be returned even + * if the register is not replicated and therefore does not actually require + * steering. + * + * Returns true if the caller should steer to the @group/@instance values + * returned. Returns false if the caller need not perform any steering + */ +static bool xe_gt_mcr_get_nonterminated_steering(struct xe_gt *gt, + struct xe_reg_mcr reg_mcr, + u8 *group, u8 *instance) +{ + const struct xe_reg reg = to_xe_reg(reg_mcr); + const struct xe_mmio_range *implicit_ranges; + + for (int type = 0; type < IMPLICIT_STEERING; type++) { + if (!gt->steering[type].ranges) + continue; + + for (int i = 0; gt->steering[type].ranges[i].end > 0; i++) { + if (xe_mmio_in_range(gt, >->steering[type].ranges[i], reg)) { + *group = gt->steering[type].group_target; + *instance = gt->steering[type].instance_target; + return true; + } + } + } + + implicit_ranges = gt->steering[IMPLICIT_STEERING].ranges; + if (implicit_ranges) + for (int i = 0; implicit_ranges[i].end > 0; i++) + if (xe_mmio_in_range(gt, &implicit_ranges[i], reg)) + return false; + + /* + * Not found in a steering table and not a register with implicit + * steering. Just steer to 0/0 as a guess and raise a warning. + */ + drm_WARN(>_to_xe(gt)->drm, true, + "Did not find MCR register %#x in any MCR steering table\n", + reg.addr); + *group = 0; + *instance = 0; + + return true; +} + +/* + * Obtain exclusive access to MCR steering. On MTL and beyond we also need + * to synchronize with external clients (e.g., firmware), so a semaphore + * register will also need to be taken. + */ +static void mcr_lock(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + int ret = 0; + + spin_lock(>->mcr_lock); + + /* + * Starting with MTL we also need to grab a semaphore register + * to synchronize with external agents (e.g., firmware) that now + * shares the same steering control register. The semaphore is obtained + * when a read to the relevant register returns 1. 
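[Editor's illustration, not part of the diff: the steering tables earlier in this file are arrays of {start, end} ranges terminated by an all-zero entry, and xe_gt_mcr_get_nonterminated_steering() above simply scans them for the register offset. The lookup in isolation, as a user-space sketch with invented names:]

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        struct mmio_range { uint32_t start, end; };  /* end == 0 terminates a table */

        static bool in_ranges(const struct mmio_range *r, uint32_t addr)
        {
                for (int i = 0; r[i].end; i++)
                        if (addr >= r[i].start && addr <= r[i].end)
                                return true;
                return false;
        }

        int main(void)
        {
                /* Subset of the XeLP L3BANK table above, purely for illustration. */
                static const struct mmio_range l3bank[] = {
                        { 0x00B100, 0x00B3FF },
                        {},
                };

                printf("%d\n", in_ranges(l3bank, 0xB1F0)); /* 1: needs L3BANK steering */
                printf("%d\n", in_ranges(l3bank, 0x8000)); /* 0: try the next table */
                return 0;
        }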
+ */ + if (GRAPHICS_VERx100(xe) >= 1270) + ret = xe_mmio_wait32(gt, STEER_SEMAPHORE, 0x1, 0x1, 10, NULL, + true); + + drm_WARN_ON_ONCE(&xe->drm, ret == -ETIMEDOUT); +} + +static void mcr_unlock(struct xe_gt *gt) +{ + /* Release hardware semaphore - this is done by writing 1 to the register */ + if (GRAPHICS_VERx100(gt_to_xe(gt)) >= 1270) + xe_mmio_write32(gt, STEER_SEMAPHORE, 0x1); + + spin_unlock(>->mcr_lock); +} + +/* + * Access a register with specific MCR steering + * + * Caller needs to make sure the relevant forcewake wells are up. + */ +static u32 rw_with_mcr_steering(struct xe_gt *gt, struct xe_reg_mcr reg_mcr, + u8 rw_flag, int group, int instance, u32 value) +{ + const struct xe_reg reg = to_xe_reg(reg_mcr); + struct xe_reg steer_reg; + u32 steer_val, val = 0; + + lockdep_assert_held(>->mcr_lock); + + if (GRAPHICS_VERx100(gt_to_xe(gt)) >= 1270) { + steer_reg = MTL_MCR_SELECTOR; + steer_val = REG_FIELD_PREP(MTL_MCR_GROUPID, group) | + REG_FIELD_PREP(MTL_MCR_INSTANCEID, instance); + } else { + steer_reg = MCR_SELECTOR; + steer_val = REG_FIELD_PREP(MCR_SLICE_MASK, group) | + REG_FIELD_PREP(MCR_SUBSLICE_MASK, instance); + } + + /* + * Always leave the hardware in multicast mode when doing reads and only + * change it to unicast mode when doing writes of a specific instance. + * + * The setting of the multicast/unicast bit usually wouldn't matter for + * read operations (which always return the value from a single register + * instance regardless of how that bit is set), but some platforms may + * have workarounds requiring us to remain in multicast mode for reads, + * e.g. Wa_22013088509 on PVC. There's no real downside to this, so + * we'll just go ahead and do so on all platforms; we'll only clear the + * multicast bit from the mask when explicitly doing a write operation. + * + * No need to save old steering reg value. + */ + if (rw_flag == MCR_OP_READ) + steer_val |= MCR_MULTICAST; + + xe_mmio_write32(gt, steer_reg, steer_val); + + if (rw_flag == MCR_OP_READ) + val = xe_mmio_read32(gt, reg); + else + xe_mmio_write32(gt, reg, value); + + /* + * If we turned off the multicast bit (during a write) we're required + * to turn it back on before finishing. The group and instance values + * don't matter since they'll be re-programmed on the next MCR + * operation. + */ + if (rw_flag == MCR_OP_WRITE) + xe_mmio_write32(gt, steer_reg, MCR_MULTICAST); + + return val; +} + +/** + * xe_gt_mcr_unicast_read_any - reads a non-terminated instance of an MCR register + * @gt: GT structure + * @reg_mcr: register to read + * + * Reads a GT MCR register. The read will be steered to a non-terminated + * instance (i.e., one that isn't fused off or powered down by power gating). + * This function assumes the caller is already holding any necessary forcewake + * domains. + * + * Returns the value from a non-terminated instance of @reg. 
+ */ +u32 xe_gt_mcr_unicast_read_any(struct xe_gt *gt, struct xe_reg_mcr reg_mcr) +{ + const struct xe_reg reg = to_xe_reg(reg_mcr); + u8 group, instance; + u32 val; + bool steer; + + steer = xe_gt_mcr_get_nonterminated_steering(gt, reg_mcr, + &group, &instance); + + if (steer) { + mcr_lock(gt); + val = rw_with_mcr_steering(gt, reg_mcr, MCR_OP_READ, + group, instance, 0); + mcr_unlock(gt); + } else { + val = xe_mmio_read32(gt, reg); + } + + return val; +} + +/** + * xe_gt_mcr_unicast_read - read a specific instance of an MCR register + * @gt: GT structure + * @reg_mcr: the MCR register to read + * @group: the MCR group + * @instance: the MCR instance + * + * Returns the value read from an MCR register after steering toward a specific + * group/instance. + */ +u32 xe_gt_mcr_unicast_read(struct xe_gt *gt, + struct xe_reg_mcr reg_mcr, + int group, int instance) +{ + u32 val; + + mcr_lock(gt); + val = rw_with_mcr_steering(gt, reg_mcr, MCR_OP_READ, group, instance, 0); + mcr_unlock(gt); + + return val; +} + +/** + * xe_gt_mcr_unicast_write - write a specific instance of an MCR register + * @gt: GT structure + * @reg_mcr: the MCR register to write + * @value: value to write + * @group: the MCR group + * @instance: the MCR instance + * + * Write an MCR register in unicast mode after steering toward a specific + * group/instance. + */ +void xe_gt_mcr_unicast_write(struct xe_gt *gt, struct xe_reg_mcr reg_mcr, + u32 value, int group, int instance) +{ + mcr_lock(gt); + rw_with_mcr_steering(gt, reg_mcr, MCR_OP_WRITE, group, instance, value); + mcr_unlock(gt); +} + +/** + * xe_gt_mcr_multicast_write - write a value to all instances of an MCR register + * @gt: GT structure + * @reg_mcr: the MCR register to write + * @value: value to write + * + * Write an MCR register in multicast mode to update all instances. + */ +void xe_gt_mcr_multicast_write(struct xe_gt *gt, struct xe_reg_mcr reg_mcr, + u32 value) +{ + struct xe_reg reg = to_xe_reg(reg_mcr); + + /* + * Synchronize with any unicast operations. Once we have exclusive + * access, the MULTICAST bit should already be set, so there's no need + * to touch the steering register. 
+ */ + mcr_lock(gt); + xe_mmio_write32(gt, reg, value); + mcr_unlock(gt); +} + +void xe_gt_mcr_steering_dump(struct xe_gt *gt, struct drm_printer *p) +{ + for (int i = 0; i < NUM_STEERING_TYPES; i++) { + if (gt->steering[i].ranges) { + drm_printf(p, "%s steering: group=%#x, instance=%#x\n", + xe_steering_types[i].name, + gt->steering[i].group_target, + gt->steering[i].instance_target); + for (int j = 0; gt->steering[i].ranges[j].end; j++) + drm_printf(p, "\t0x%06x - 0x%06x\n", + gt->steering[i].ranges[j].start, + gt->steering[i].ranges[j].end); + } + } +} diff --git a/drivers/gpu/drm/xe/xe_gt_mcr.h b/drivers/gpu/drm/xe/xe_gt_mcr.h new file mode 100644 index 000000000000..27ca1bc880a0 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_mcr.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GT_MCR_H_ +#define _XE_GT_MCR_H_ + +#include "regs/xe_reg_defs.h" + +struct drm_printer; +struct xe_gt; + +void xe_gt_mcr_init(struct xe_gt *gt); + +void xe_gt_mcr_set_implicit_defaults(struct xe_gt *gt); + +u32 xe_gt_mcr_unicast_read(struct xe_gt *gt, struct xe_reg_mcr mcr_reg, + int group, int instance); +u32 xe_gt_mcr_unicast_read_any(struct xe_gt *gt, struct xe_reg_mcr mcr_reg); + +void xe_gt_mcr_unicast_write(struct xe_gt *gt, struct xe_reg_mcr mcr_reg, + u32 value, int group, int instance); +void xe_gt_mcr_multicast_write(struct xe_gt *gt, struct xe_reg_mcr mcr_reg, + u32 value); + +void xe_gt_mcr_steering_dump(struct xe_gt *gt, struct drm_printer *p); + +#endif /* _XE_GT_MCR_H_ */ diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c new file mode 100644 index 000000000000..59a70d2e0a7a --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c @@ -0,0 +1,646 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_gt_pagefault.h" + +#include <linux/bitfield.h> +#include <linux/circ_buf.h> + +#include <drm/drm_exec.h> +#include <drm/drm_managed.h> +#include <drm/ttm/ttm_execbuf_util.h> + +#include "abi/guc_actions_abi.h" +#include "xe_bo.h" +#include "xe_gt.h" +#include "xe_gt_tlb_invalidation.h" +#include "xe_guc.h" +#include "xe_guc_ct.h" +#include "xe_migrate.h" +#include "xe_pt.h" +#include "xe_trace.h" +#include "xe_vm.h" + +struct pagefault { + u64 page_addr; + u32 asid; + u16 pdata; + u8 vfid; + u8 access_type; + u8 fault_type; + u8 fault_level; + u8 engine_class; + u8 engine_instance; + u8 fault_unsuccessful; + bool trva_fault; +}; + +enum access_type { + ACCESS_TYPE_READ = 0, + ACCESS_TYPE_WRITE = 1, + ACCESS_TYPE_ATOMIC = 2, + ACCESS_TYPE_RESERVED = 3, +}; + +enum fault_type { + NOT_PRESENT = 0, + WRITE_ACCESS_VIOLATION = 1, + ATOMIC_ACCESS_VIOLATION = 2, +}; + +struct acc { + u64 va_range_base; + u32 asid; + u32 sub_granularity; + u8 granularity; + u8 vfid; + u8 access_type; + u8 engine_class; + u8 engine_instance; +}; + +static bool access_is_atomic(enum access_type access_type) +{ + return access_type == ACCESS_TYPE_ATOMIC; +} + +static bool vma_is_valid(struct xe_tile *tile, struct xe_vma *vma) +{ + return BIT(tile->id) & vma->tile_present && + !(BIT(tile->id) & vma->usm.tile_invalidated); +} + +static bool vma_matches(struct xe_vma *vma, u64 page_addr) +{ + if (page_addr > xe_vma_end(vma) - 1 || + page_addr + SZ_4K - 1 < xe_vma_start(vma)) + return false; + + return true; +} + +static struct xe_vma *lookup_vma(struct xe_vm *vm, u64 page_addr) +{ + struct xe_vma *vma = NULL; + + if (vm->usm.last_fault_vma) { /* Fast lookup */ + if 
(vma_matches(vm->usm.last_fault_vma, page_addr)) + vma = vm->usm.last_fault_vma; + } + if (!vma) + vma = xe_vm_find_overlapping_vma(vm, page_addr, SZ_4K); + + return vma; +} + +static int xe_pf_begin(struct drm_exec *exec, struct xe_vma *vma, + bool atomic, unsigned int id) +{ + struct xe_bo *bo = xe_vma_bo(vma); + struct xe_vm *vm = xe_vma_vm(vma); + unsigned int num_shared = 2; /* slots for bind + move */ + int err; + + err = xe_vm_prepare_vma(exec, vma, num_shared); + if (err) + return err; + + if (atomic && IS_DGFX(vm->xe)) { + if (xe_vma_is_userptr(vma)) { + err = -EACCES; + return err; + } + + /* Migrate to VRAM, move should invalidate the VMA first */ + err = xe_bo_migrate(bo, XE_PL_VRAM0 + id); + if (err) + return err; + } else if (bo) { + /* Create backing store if needed */ + err = xe_bo_validate(bo, vm, true); + if (err) + return err; + } + + return 0; +} + +static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf) +{ + struct xe_device *xe = gt_to_xe(gt); + struct xe_tile *tile = gt_to_tile(gt); + struct drm_exec exec; + struct xe_vm *vm; + struct xe_vma *vma = NULL; + struct dma_fence *fence; + bool write_locked; + int ret = 0; + bool atomic; + + /* SW isn't expected to handle TRTT faults */ + if (pf->trva_fault) + return -EFAULT; + + /* ASID to VM */ + mutex_lock(&xe->usm.lock); + vm = xa_load(&xe->usm.asid_to_vm, pf->asid); + if (vm) + xe_vm_get(vm); + mutex_unlock(&xe->usm.lock); + if (!vm || !xe_vm_in_fault_mode(vm)) + return -EINVAL; + +retry_userptr: + /* + * TODO: Avoid exclusive lock if VM doesn't have userptrs, or + * start out read-locked? + */ + down_write(&vm->lock); + write_locked = true; + vma = lookup_vma(vm, pf->page_addr); + if (!vma) { + ret = -EINVAL; + goto unlock_vm; + } + + if (!xe_vma_is_userptr(vma) || !xe_vma_userptr_check_repin(vma)) { + downgrade_write(&vm->lock); + write_locked = false; + } + + trace_xe_vma_pagefault(vma); + + atomic = access_is_atomic(pf->access_type); + + /* Check if VMA is valid */ + if (vma_is_valid(tile, vma) && !atomic) + goto unlock_vm; + + /* TODO: Validate fault */ + + if (xe_vma_is_userptr(vma) && write_locked) { + spin_lock(&vm->userptr.invalidated_lock); + list_del_init(&vma->userptr.invalidate_link); + spin_unlock(&vm->userptr.invalidated_lock); + + ret = xe_vma_userptr_pin_pages(vma); + if (ret) + goto unlock_vm; + + downgrade_write(&vm->lock); + write_locked = false; + } + + /* Lock VM and BOs dma-resv */ + drm_exec_init(&exec, 0, 0); + drm_exec_until_all_locked(&exec) { + ret = xe_pf_begin(&exec, vma, atomic, tile->id); + drm_exec_retry_on_contention(&exec); + if (ret) + goto unlock_dma_resv; + } + + /* Bind VMA only to the GT that has faulted */ + trace_xe_vma_pf_bind(vma); + fence = __xe_pt_bind_vma(tile, vma, xe_tile_migrate_engine(tile), NULL, 0, + vma->tile_present & BIT(tile->id)); + if (IS_ERR(fence)) { + ret = PTR_ERR(fence); + goto unlock_dma_resv; + } + + /* + * XXX: Should we drop the lock before waiting? This only helps if doing + * GPU binds which is currently only done if we have to wait for more + * than 10ms on a move. 
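[Editor's illustration, not part of the diff: vma_matches() earlier in this file decides whether the faulting 4K page overlaps a mapping's [start, end) range. The same test with plain integers instead of a VMA, invented names only:]

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        #define SZ_4K 0x1000ULL

        /* Overlap test used for the fast last-fault-VMA check. */
        static bool page_overlaps(uint64_t page_addr, uint64_t start, uint64_t end)
        {
                return !(page_addr > end - 1 || page_addr + SZ_4K - 1 < start);
        }

        int main(void)
        {
                /* Mapping covering [0x10000, 0x20000) */
                printf("%d\n", page_overlaps(0x1F000, 0x10000, 0x20000)); /* 1 */
                printf("%d\n", page_overlaps(0x20000, 0x10000, 0x20000)); /* 0 */
                return 0;
        }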
+ */ + dma_fence_wait(fence, false); + dma_fence_put(fence); + + if (xe_vma_is_userptr(vma)) + ret = xe_vma_userptr_check_repin(vma); + vma->usm.tile_invalidated &= ~BIT(tile->id); + +unlock_dma_resv: + drm_exec_fini(&exec); +unlock_vm: + if (!ret) + vm->usm.last_fault_vma = vma; + if (write_locked) + up_write(&vm->lock); + else + up_read(&vm->lock); + if (ret == -EAGAIN) + goto retry_userptr; + + if (!ret) { + ret = xe_gt_tlb_invalidation_vma(gt, NULL, vma); + if (ret >= 0) + ret = 0; + } + xe_vm_put(vm); + + return ret; +} + +static int send_pagefault_reply(struct xe_guc *guc, + struct xe_guc_pagefault_reply *reply) +{ + u32 action[] = { + XE_GUC_ACTION_PAGE_FAULT_RES_DESC, + reply->dw0, + reply->dw1, + }; + + return xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action), 0, 0); +} + +static void print_pagefault(struct xe_device *xe, struct pagefault *pf) +{ + drm_dbg(&xe->drm, "\n\tASID: %d\n" + "\tVFID: %d\n" + "\tPDATA: 0x%04x\n" + "\tFaulted Address: 0x%08x%08x\n" + "\tFaultType: %d\n" + "\tAccessType: %d\n" + "\tFaultLevel: %d\n" + "\tEngineClass: %d\n" + "\tEngineInstance: %d\n", + pf->asid, pf->vfid, pf->pdata, upper_32_bits(pf->page_addr), + lower_32_bits(pf->page_addr), + pf->fault_type, pf->access_type, pf->fault_level, + pf->engine_class, pf->engine_instance); +} + +#define PF_MSG_LEN_DW 4 + +static bool get_pagefault(struct pf_queue *pf_queue, struct pagefault *pf) +{ + const struct xe_guc_pagefault_desc *desc; + bool ret = false; + + spin_lock_irq(&pf_queue->lock); + if (pf_queue->head != pf_queue->tail) { + desc = (const struct xe_guc_pagefault_desc *) + (pf_queue->data + pf_queue->head); + + pf->fault_level = FIELD_GET(PFD_FAULT_LEVEL, desc->dw0); + pf->trva_fault = FIELD_GET(XE2_PFD_TRVA_FAULT, desc->dw0); + pf->engine_class = FIELD_GET(PFD_ENG_CLASS, desc->dw0); + pf->engine_instance = FIELD_GET(PFD_ENG_INSTANCE, desc->dw0); + pf->pdata = FIELD_GET(PFD_PDATA_HI, desc->dw1) << + PFD_PDATA_HI_SHIFT; + pf->pdata |= FIELD_GET(PFD_PDATA_LO, desc->dw0); + pf->asid = FIELD_GET(PFD_ASID, desc->dw1); + pf->vfid = FIELD_GET(PFD_VFID, desc->dw2); + pf->access_type = FIELD_GET(PFD_ACCESS_TYPE, desc->dw2); + pf->fault_type = FIELD_GET(PFD_FAULT_TYPE, desc->dw2); + pf->page_addr = (u64)(FIELD_GET(PFD_VIRTUAL_ADDR_HI, desc->dw3)) << + PFD_VIRTUAL_ADDR_HI_SHIFT; + pf->page_addr |= FIELD_GET(PFD_VIRTUAL_ADDR_LO, desc->dw2) << + PFD_VIRTUAL_ADDR_LO_SHIFT; + + pf_queue->head = (pf_queue->head + PF_MSG_LEN_DW) % + PF_QUEUE_NUM_DW; + ret = true; + } + spin_unlock_irq(&pf_queue->lock); + + return ret; +} + +static bool pf_queue_full(struct pf_queue *pf_queue) +{ + lockdep_assert_held(&pf_queue->lock); + + return CIRC_SPACE(pf_queue->tail, pf_queue->head, PF_QUEUE_NUM_DW) <= + PF_MSG_LEN_DW; +} + +int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len) +{ + struct xe_gt *gt = guc_to_gt(guc); + struct xe_device *xe = gt_to_xe(gt); + struct pf_queue *pf_queue; + unsigned long flags; + u32 asid; + bool full; + + if (unlikely(len != PF_MSG_LEN_DW)) + return -EPROTO; + + asid = FIELD_GET(PFD_ASID, msg[1]); + pf_queue = >->usm.pf_queue[asid % NUM_PF_QUEUE]; + + spin_lock_irqsave(&pf_queue->lock, flags); + full = pf_queue_full(pf_queue); + if (!full) { + memcpy(pf_queue->data + pf_queue->tail, msg, len * sizeof(u32)); + pf_queue->tail = (pf_queue->tail + len) % PF_QUEUE_NUM_DW; + queue_work(gt->usm.pf_wq, &pf_queue->worker); + } else { + drm_warn(&xe->drm, "PF Queue full, shouldn't be possible"); + } + spin_unlock_irqrestore(&pf_queue->lock, flags); + + return full ? 
-ENOSPC : 0; +} + +#define USM_QUEUE_MAX_RUNTIME_MS 20 + +static void pf_queue_work_func(struct work_struct *w) +{ + struct pf_queue *pf_queue = container_of(w, struct pf_queue, worker); + struct xe_gt *gt = pf_queue->gt; + struct xe_device *xe = gt_to_xe(gt); + struct xe_guc_pagefault_reply reply = {}; + struct pagefault pf = {}; + unsigned long threshold; + int ret; + + threshold = jiffies + msecs_to_jiffies(USM_QUEUE_MAX_RUNTIME_MS); + + while (get_pagefault(pf_queue, &pf)) { + ret = handle_pagefault(gt, &pf); + if (unlikely(ret)) { + print_pagefault(xe, &pf); + pf.fault_unsuccessful = 1; + drm_dbg(&xe->drm, "Fault response: Unsuccessful %d\n", ret); + } + + reply.dw0 = FIELD_PREP(PFR_VALID, 1) | + FIELD_PREP(PFR_SUCCESS, pf.fault_unsuccessful) | + FIELD_PREP(PFR_REPLY, PFR_ACCESS) | + FIELD_PREP(PFR_DESC_TYPE, FAULT_RESPONSE_DESC) | + FIELD_PREP(PFR_ASID, pf.asid); + + reply.dw1 = FIELD_PREP(PFR_VFID, pf.vfid) | + FIELD_PREP(PFR_ENG_INSTANCE, pf.engine_instance) | + FIELD_PREP(PFR_ENG_CLASS, pf.engine_class) | + FIELD_PREP(PFR_PDATA, pf.pdata); + + send_pagefault_reply(>->uc.guc, &reply); + + if (time_after(jiffies, threshold) && + pf_queue->head != pf_queue->tail) { + queue_work(gt->usm.pf_wq, w); + break; + } + } +} + +static void acc_queue_work_func(struct work_struct *w); + +int xe_gt_pagefault_init(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + int i; + + if (!xe->info.has_usm) + return 0; + + for (i = 0; i < NUM_PF_QUEUE; ++i) { + gt->usm.pf_queue[i].gt = gt; + spin_lock_init(>->usm.pf_queue[i].lock); + INIT_WORK(>->usm.pf_queue[i].worker, pf_queue_work_func); + } + for (i = 0; i < NUM_ACC_QUEUE; ++i) { + gt->usm.acc_queue[i].gt = gt; + spin_lock_init(>->usm.acc_queue[i].lock); + INIT_WORK(>->usm.acc_queue[i].worker, acc_queue_work_func); + } + + gt->usm.pf_wq = alloc_workqueue("xe_gt_page_fault_work_queue", + WQ_UNBOUND | WQ_HIGHPRI, NUM_PF_QUEUE); + if (!gt->usm.pf_wq) + return -ENOMEM; + + gt->usm.acc_wq = alloc_workqueue("xe_gt_access_counter_work_queue", + WQ_UNBOUND | WQ_HIGHPRI, + NUM_ACC_QUEUE); + if (!gt->usm.acc_wq) + return -ENOMEM; + + return 0; +} + +void xe_gt_pagefault_reset(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + int i; + + if (!xe->info.has_usm) + return; + + for (i = 0; i < NUM_PF_QUEUE; ++i) { + spin_lock_irq(>->usm.pf_queue[i].lock); + gt->usm.pf_queue[i].head = 0; + gt->usm.pf_queue[i].tail = 0; + spin_unlock_irq(>->usm.pf_queue[i].lock); + } + + for (i = 0; i < NUM_ACC_QUEUE; ++i) { + spin_lock(>->usm.acc_queue[i].lock); + gt->usm.acc_queue[i].head = 0; + gt->usm.acc_queue[i].tail = 0; + spin_unlock(>->usm.acc_queue[i].lock); + } +} + +static int granularity_in_byte(int val) +{ + switch (val) { + case 0: + return SZ_128K; + case 1: + return SZ_2M; + case 2: + return SZ_16M; + case 3: + return SZ_64M; + default: + return 0; + } +} + +static int sub_granularity_in_byte(int val) +{ + return (granularity_in_byte(val) / 32); +} + +static void print_acc(struct xe_device *xe, struct acc *acc) +{ + drm_warn(&xe->drm, "Access counter request:\n" + "\tType: %s\n" + "\tASID: %d\n" + "\tVFID: %d\n" + "\tEngine: %d:%d\n" + "\tGranularity: 0x%x KB Region/ %d KB sub-granularity\n" + "\tSub_Granularity Vector: 0x%08x\n" + "\tVA Range base: 0x%016llx\n", + acc->access_type ? 
"AC_NTFY_VAL" : "AC_TRIG_VAL", + acc->asid, acc->vfid, acc->engine_class, acc->engine_instance, + granularity_in_byte(acc->granularity) / SZ_1K, + sub_granularity_in_byte(acc->granularity) / SZ_1K, + acc->sub_granularity, acc->va_range_base); +} + +static struct xe_vma *get_acc_vma(struct xe_vm *vm, struct acc *acc) +{ + u64 page_va = acc->va_range_base + (ffs(acc->sub_granularity) - 1) * + sub_granularity_in_byte(acc->granularity); + + return xe_vm_find_overlapping_vma(vm, page_va, SZ_4K); +} + +static int handle_acc(struct xe_gt *gt, struct acc *acc) +{ + struct xe_device *xe = gt_to_xe(gt); + struct xe_tile *tile = gt_to_tile(gt); + struct drm_exec exec; + struct xe_vm *vm; + struct xe_vma *vma; + int ret = 0; + + /* We only support ACC_TRIGGER at the moment */ + if (acc->access_type != ACC_TRIGGER) + return -EINVAL; + + /* ASID to VM */ + mutex_lock(&xe->usm.lock); + vm = xa_load(&xe->usm.asid_to_vm, acc->asid); + if (vm) + xe_vm_get(vm); + mutex_unlock(&xe->usm.lock); + if (!vm || !xe_vm_in_fault_mode(vm)) + return -EINVAL; + + down_read(&vm->lock); + + /* Lookup VMA */ + vma = get_acc_vma(vm, acc); + if (!vma) { + ret = -EINVAL; + goto unlock_vm; + } + + trace_xe_vma_acc(vma); + + /* Userptr or null can't be migrated, nothing to do */ + if (xe_vma_has_no_bo(vma)) + goto unlock_vm; + + /* Lock VM and BOs dma-resv */ + drm_exec_init(&exec, 0, 0); + drm_exec_until_all_locked(&exec) { + ret = xe_pf_begin(&exec, vma, true, tile->id); + drm_exec_retry_on_contention(&exec); + if (ret) + break; + } + + drm_exec_fini(&exec); +unlock_vm: + up_read(&vm->lock); + xe_vm_put(vm); + + return ret; +} + +#define make_u64(hi__, low__) ((u64)(hi__) << 32 | (u64)(low__)) + +#define ACC_MSG_LEN_DW 4 + +static bool get_acc(struct acc_queue *acc_queue, struct acc *acc) +{ + const struct xe_guc_acc_desc *desc; + bool ret = false; + + spin_lock(&acc_queue->lock); + if (acc_queue->head != acc_queue->tail) { + desc = (const struct xe_guc_acc_desc *) + (acc_queue->data + acc_queue->head); + + acc->granularity = FIELD_GET(ACC_GRANULARITY, desc->dw2); + acc->sub_granularity = FIELD_GET(ACC_SUBG_HI, desc->dw1) << 31 | + FIELD_GET(ACC_SUBG_LO, desc->dw0); + acc->engine_class = FIELD_GET(ACC_ENG_CLASS, desc->dw1); + acc->engine_instance = FIELD_GET(ACC_ENG_INSTANCE, desc->dw1); + acc->asid = FIELD_GET(ACC_ASID, desc->dw1); + acc->vfid = FIELD_GET(ACC_VFID, desc->dw2); + acc->access_type = FIELD_GET(ACC_TYPE, desc->dw0); + acc->va_range_base = make_u64(desc->dw3 & ACC_VIRTUAL_ADDR_RANGE_HI, + desc->dw2 & ACC_VIRTUAL_ADDR_RANGE_LO); + + acc_queue->head = (acc_queue->head + ACC_MSG_LEN_DW) % + ACC_QUEUE_NUM_DW; + ret = true; + } + spin_unlock(&acc_queue->lock); + + return ret; +} + +static void acc_queue_work_func(struct work_struct *w) +{ + struct acc_queue *acc_queue = container_of(w, struct acc_queue, worker); + struct xe_gt *gt = acc_queue->gt; + struct xe_device *xe = gt_to_xe(gt); + struct acc acc = {}; + unsigned long threshold; + int ret; + + threshold = jiffies + msecs_to_jiffies(USM_QUEUE_MAX_RUNTIME_MS); + + while (get_acc(acc_queue, &acc)) { + ret = handle_acc(gt, &acc); + if (unlikely(ret)) { + print_acc(xe, &acc); + drm_warn(&xe->drm, "ACC: Unsuccessful %d\n", ret); + } + + if (time_after(jiffies, threshold) && + acc_queue->head != acc_queue->tail) { + queue_work(gt->usm.acc_wq, w); + break; + } + } +} + +static bool acc_queue_full(struct acc_queue *acc_queue) +{ + lockdep_assert_held(&acc_queue->lock); + + return CIRC_SPACE(acc_queue->tail, acc_queue->head, ACC_QUEUE_NUM_DW) <= + ACC_MSG_LEN_DW; +} + 
+int xe_guc_access_counter_notify_handler(struct xe_guc *guc, u32 *msg, u32 len) +{ + struct xe_gt *gt = guc_to_gt(guc); + struct acc_queue *acc_queue; + u32 asid; + bool full; + + if (unlikely(len != ACC_MSG_LEN_DW)) + return -EPROTO; + + asid = FIELD_GET(ACC_ASID, msg[1]); + acc_queue = >->usm.acc_queue[asid % NUM_ACC_QUEUE]; + + spin_lock(&acc_queue->lock); + full = acc_queue_full(acc_queue); + if (!full) { + memcpy(acc_queue->data + acc_queue->tail, msg, + len * sizeof(u32)); + acc_queue->tail = (acc_queue->tail + len) % ACC_QUEUE_NUM_DW; + queue_work(gt->usm.acc_wq, &acc_queue->worker); + } else { + drm_warn(>_to_xe(gt)->drm, "ACC Queue full, dropping ACC"); + } + spin_unlock(&acc_queue->lock); + + return full ? -ENOSPC : 0; +} diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.h b/drivers/gpu/drm/xe/xe_gt_pagefault.h new file mode 100644 index 000000000000..839c065a5e4c --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GT_PAGEFAULT_H_ +#define _XE_GT_PAGEFAULT_H_ + +#include <linux/types.h> + +struct xe_gt; +struct xe_guc; + +int xe_gt_pagefault_init(struct xe_gt *gt); +void xe_gt_pagefault_reset(struct xe_gt *gt); +int xe_guc_pagefault_handler(struct xe_guc *guc, u32 *msg, u32 len); +int xe_guc_access_counter_notify_handler(struct xe_guc *guc, u32 *msg, u32 len); + +#endif /* _XE_GT_PAGEFAULT_ */ diff --git a/drivers/gpu/drm/xe/xe_gt_printk.h b/drivers/gpu/drm/xe/xe_gt_printk.h new file mode 100644 index 000000000000..5991bcadd47e --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_printk.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GT_PRINTK_H_ +#define _XE_GT_PRINTK_H_ + +#include <drm/drm_print.h> + +#include "xe_device_types.h" + +#define xe_gt_printk(_gt, _level, _fmt, ...) \ + drm_##_level(>_to_xe(_gt)->drm, "GT%u: " _fmt, (_gt)->info.id, ##__VA_ARGS__) + +#define xe_gt_err(_gt, _fmt, ...) \ + xe_gt_printk((_gt), err, _fmt, ##__VA_ARGS__) + +#define xe_gt_warn(_gt, _fmt, ...) \ + xe_gt_printk((_gt), warn, _fmt, ##__VA_ARGS__) + +#define xe_gt_notice(_gt, _fmt, ...) \ + xe_gt_printk((_gt), notice, _fmt, ##__VA_ARGS__) + +#define xe_gt_info(_gt, _fmt, ...) \ + xe_gt_printk((_gt), info, _fmt, ##__VA_ARGS__) + +#define xe_gt_dbg(_gt, _fmt, ...) \ + xe_gt_printk((_gt), dbg, _fmt, ##__VA_ARGS__) + +#define xe_gt_err_ratelimited(_gt, _fmt, ...) \ + xe_gt_printk((_gt), err_ratelimited, _fmt, ##__VA_ARGS__) + +#define xe_gt_WARN(_gt, _condition, _fmt, ...) \ + drm_WARN(>_to_xe(_gt)->drm, _condition, "GT%u: " _fmt, (_gt)->info.id, ##__VA_ARGS__) + +#define xe_gt_WARN_ONCE(_gt, _condition, _fmt, ...) 
\ + drm_WARN_ONCE(>_to_xe(_gt)->drm, _condition, "GT%u: " _fmt, (_gt)->info.id, ##__VA_ARGS__) + +#define xe_gt_WARN_ON(_gt, _condition) \ + xe_gt_WARN((_gt), _condition, "%s(%s)", "gt_WARN_ON", __stringify(_condition)) + +#define xe_gt_WARN_ON_ONCE(_gt, _condition) \ + xe_gt_WARN_ONCE((_gt), _condition, "%s(%s)", "gt_WARN_ON_ONCE", __stringify(_condition)) + +#endif diff --git a/drivers/gpu/drm/xe/xe_gt_sysfs.c b/drivers/gpu/drm/xe/xe_gt_sysfs.c new file mode 100644 index 000000000000..c69d2e8a0fe1 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_sysfs.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_gt_sysfs.h" + +#include <linux/kobject.h> +#include <linux/sysfs.h> + +#include <drm/drm_managed.h> + +#include "xe_gt.h" + +static void xe_gt_sysfs_kobj_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static const struct kobj_type xe_gt_sysfs_kobj_type = { + .release = xe_gt_sysfs_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, +}; + +static void gt_sysfs_fini(struct drm_device *drm, void *arg) +{ + struct xe_gt *gt = arg; + + kobject_put(gt->sysfs); +} + +void xe_gt_sysfs_init(struct xe_gt *gt) +{ + struct xe_tile *tile = gt_to_tile(gt); + struct xe_device *xe = gt_to_xe(gt); + struct kobj_gt *kg; + int err; + + kg = kzalloc(sizeof(*kg), GFP_KERNEL); + if (!kg) + return; + + kobject_init(&kg->base, &xe_gt_sysfs_kobj_type); + kg->gt = gt; + + err = kobject_add(&kg->base, tile->sysfs, "gt%d", gt->info.id); + if (err) { + drm_warn(&xe->drm, "failed to add GT sysfs directory, err: %d\n", err); + kobject_put(&kg->base); + return; + } + + gt->sysfs = &kg->base; + + err = drmm_add_action_or_reset(&xe->drm, gt_sysfs_fini, gt); + if (err) { + drm_warn(&xe->drm, "%s: drmm_add_action_or_reset failed, err: %d\n", + __func__, err); + return; + } +} diff --git a/drivers/gpu/drm/xe/xe_gt_sysfs.h b/drivers/gpu/drm/xe/xe_gt_sysfs.h new file mode 100644 index 000000000000..e3ec278ca0be --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_sysfs.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GT_SYSFS_H_ +#define _XE_GT_SYSFS_H_ + +#include "xe_gt_sysfs_types.h" + +void xe_gt_sysfs_init(struct xe_gt *gt); + +static inline struct xe_gt * +kobj_to_gt(struct kobject *kobj) +{ + return container_of(kobj, struct kobj_gt, base)->gt; +} + +#endif /* _XE_GT_SYSFS_H_ */ diff --git a/drivers/gpu/drm/xe/xe_gt_sysfs_types.h b/drivers/gpu/drm/xe/xe_gt_sysfs_types.h new file mode 100644 index 000000000000..d3bc6b83360f --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_sysfs_types.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GT_SYSFS_TYPES_H_ +#define _XE_GT_SYSFS_TYPES_H_ + +#include <linux/kobject.h> + +struct xe_gt; + +/** + * struct kobj_gt - A GT's kobject struct that connects the kobject and the GT + * + * When dealing with multiple GTs, this struct helps to understand which GT + * needs to be addressed on a given sysfs call. 
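Editorial aside: kobj_to_gt() in xe_gt_sysfs.h above recovers the wrapping kobj_gt from the embedded kobject with container_of(), which is how sysfs callbacks that only receive the kobject get back to the GT it belongs to. A small user-space illustration of that embedding pattern (container_of spelled out via offsetof; the names are illustrative, not from the patch):

#include <stddef.h>
#include <stdio.h>

#define demo_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_kobject {
	const char *name;
};

struct demo_kobj_gt {
	struct demo_kobject base;	/* embedded kobject */
	int gt_id;			/* payload carried alongside it */
};

static void show(struct demo_kobject *kobj)
{
	/* The callback only sees the embedded member; recover the wrapper. */
	struct demo_kobj_gt *kg =
		demo_container_of(kobj, struct demo_kobj_gt, base);

	printf("%s belongs to gt%d\n", kobj->name, kg->gt_id);
}

int main(void)
{
	struct demo_kobj_gt kg = { { "gt0" }, 0 };

	show(&kg.base);
	return 0;
}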
+ */ +struct kobj_gt { + /** @base: The actual kobject */ + struct kobject base; + /** @gt: A pointer to the GT itself */ + struct xe_gt *gt; +}; + +#endif /* _XE_GT_SYSFS_TYPES_H_ */ diff --git a/drivers/gpu/drm/xe/xe_gt_throttle_sysfs.c b/drivers/gpu/drm/xe/xe_gt_throttle_sysfs.c new file mode 100644 index 000000000000..63d640591a52 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_throttle_sysfs.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include <drm/drm_managed.h> + +#include <regs/xe_gt_regs.h> +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_gt_sysfs.h" +#include "xe_gt_throttle_sysfs.h" +#include "xe_mmio.h" + +/** + * DOC: Xe GT Throttle + * + * Provides sysfs entries for frequency throttle reasons in GT + * + * device/gt#/freq0/throttle/status - Overall status + * device/gt#/freq0/throttle/reason_pl1 - Frequency throttle due to PL1 + * device/gt#/freq0/throttle/reason_pl2 - Frequency throttle due to PL2 + * device/gt#/freq0/throttle/reason_pl4 - Frequency throttle due to PL4, Iccmax etc. + * device/gt#/freq0/throttle/reason_thermal - Frequency throttle due to thermal + * device/gt#/freq0/throttle/reason_prochot - Frequency throttle due to prochot + * device/gt#/freq0/throttle/reason_ratl - Frequency throttle due to RATL + * device/gt#/freq0/throttle/reason_vr_thermalert - Frequency throttle due to VR THERMALERT + * device/gt#/freq0/throttle/reason_vr_tdc - Frequency throttle due to VR TDC + */ + +static struct xe_gt * +dev_to_gt(struct device *dev) +{ + return kobj_to_gt(dev->kobj.parent); +} + +static u32 read_perf_limit_reasons(struct xe_gt *gt) +{ + u32 reg; + + if (xe_gt_is_media_type(gt)) + reg = xe_mmio_read32(gt, MTL_MEDIA_PERF_LIMIT_REASONS); + else + reg = xe_mmio_read32(gt, GT0_PERF_LIMIT_REASONS); + + return reg; +} + +static u32 read_status(struct xe_gt *gt) +{ + u32 status = read_perf_limit_reasons(gt) & GT0_PERF_LIMIT_REASONS_MASK; + + return status; +} + +static u32 read_reason_pl1(struct xe_gt *gt) +{ + u32 pl1 = read_perf_limit_reasons(gt) & POWER_LIMIT_1_MASK; + + return pl1; +} + +static u32 read_reason_pl2(struct xe_gt *gt) +{ + u32 pl2 = read_perf_limit_reasons(gt) & POWER_LIMIT_2_MASK; + + return pl2; +} + +static u32 read_reason_pl4(struct xe_gt *gt) +{ + u32 pl4 = read_perf_limit_reasons(gt) & POWER_LIMIT_4_MASK; + + return pl4; +} + +static u32 read_reason_thermal(struct xe_gt *gt) +{ + u32 thermal = read_perf_limit_reasons(gt) & THERMAL_LIMIT_MASK; + + return thermal; +} + +static u32 read_reason_prochot(struct xe_gt *gt) +{ + u32 prochot = read_perf_limit_reasons(gt) & PROCHOT_MASK; + + return prochot; +} + +static u32 read_reason_ratl(struct xe_gt *gt) +{ + u32 ratl = read_perf_limit_reasons(gt) & RATL_MASK; + + return ratl; +} + +static u32 read_reason_vr_thermalert(struct xe_gt *gt) +{ + u32 thermalert = read_perf_limit_reasons(gt) & VR_THERMALERT_MASK; + + return thermalert; +} + +static u32 read_reason_vr_tdc(struct xe_gt *gt) +{ + u32 tdc = read_perf_limit_reasons(gt) & VR_TDC_MASK; + + return tdc; +} + +static ssize_t status_show(struct device *dev, + struct device_attribute *attr, + char *buff) +{ + struct xe_gt *gt = dev_to_gt(dev); + bool status = !!read_status(gt); + + return sysfs_emit(buff, "%u\n", status); +} +static DEVICE_ATTR_RO(status); + +static ssize_t reason_pl1_show(struct device *dev, + struct device_attribute *attr, + char *buff) +{ + struct xe_gt *gt = dev_to_gt(dev); + bool pl1 = !!read_reason_pl1(gt); + + return sysfs_emit(buff, "%u\n", pl1); +} +static 
DEVICE_ATTR_RO(reason_pl1); + +static ssize_t reason_pl2_show(struct device *dev, + struct device_attribute *attr, + char *buff) +{ + struct xe_gt *gt = dev_to_gt(dev); + bool pl2 = !!read_reason_pl2(gt); + + return sysfs_emit(buff, "%u\n", pl2); +} +static DEVICE_ATTR_RO(reason_pl2); + +static ssize_t reason_pl4_show(struct device *dev, + struct device_attribute *attr, + char *buff) +{ + struct xe_gt *gt = dev_to_gt(dev); + bool pl4 = !!read_reason_pl4(gt); + + return sysfs_emit(buff, "%u\n", pl4); +} +static DEVICE_ATTR_RO(reason_pl4); + +static ssize_t reason_thermal_show(struct device *dev, + struct device_attribute *attr, + char *buff) +{ + struct xe_gt *gt = dev_to_gt(dev); + bool thermal = !!read_reason_thermal(gt); + + return sysfs_emit(buff, "%u\n", thermal); +} +static DEVICE_ATTR_RO(reason_thermal); + +static ssize_t reason_prochot_show(struct device *dev, + struct device_attribute *attr, + char *buff) +{ + struct xe_gt *gt = dev_to_gt(dev); + bool prochot = !!read_reason_prochot(gt); + + return sysfs_emit(buff, "%u\n", prochot); +} +static DEVICE_ATTR_RO(reason_prochot); + +static ssize_t reason_ratl_show(struct device *dev, + struct device_attribute *attr, + char *buff) +{ + struct xe_gt *gt = dev_to_gt(dev); + bool ratl = !!read_reason_ratl(gt); + + return sysfs_emit(buff, "%u\n", ratl); +} +static DEVICE_ATTR_RO(reason_ratl); + +static ssize_t reason_vr_thermalert_show(struct device *dev, + struct device_attribute *attr, + char *buff) +{ + struct xe_gt *gt = dev_to_gt(dev); + bool thermalert = !!read_reason_vr_thermalert(gt); + + return sysfs_emit(buff, "%u\n", thermalert); +} +static DEVICE_ATTR_RO(reason_vr_thermalert); + +static ssize_t reason_vr_tdc_show(struct device *dev, + struct device_attribute *attr, + char *buff) +{ + struct xe_gt *gt = dev_to_gt(dev); + bool tdc = !!read_reason_vr_tdc(gt); + + return sysfs_emit(buff, "%u\n", tdc); +} +static DEVICE_ATTR_RO(reason_vr_tdc); + +static struct attribute *throttle_attrs[] = { + &dev_attr_status.attr, + &dev_attr_reason_pl1.attr, + &dev_attr_reason_pl2.attr, + &dev_attr_reason_pl4.attr, + &dev_attr_reason_thermal.attr, + &dev_attr_reason_prochot.attr, + &dev_attr_reason_ratl.attr, + &dev_attr_reason_vr_thermalert.attr, + &dev_attr_reason_vr_tdc.attr, + NULL +}; + +static const struct attribute_group throttle_group_attrs = { + .name = "throttle", + .attrs = throttle_attrs, +}; + +static void gt_throttle_sysfs_fini(struct drm_device *drm, void *arg) +{ + struct xe_gt *gt = arg; + + sysfs_remove_group(gt->freq, &throttle_group_attrs); +} + +void xe_gt_throttle_sysfs_init(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + int err; + + err = sysfs_create_group(gt->freq, &throttle_group_attrs); + if (err) { + drm_warn(&xe->drm, "failed to register throttle sysfs, err: %d\n", err); + return; + } + + err = drmm_add_action_or_reset(&xe->drm, gt_throttle_sysfs_fini, gt); + if (err) + drm_warn(&xe->drm, "%s: drmm_add_action_or_reset failed, err: %d\n", + __func__, err); +} diff --git a/drivers/gpu/drm/xe/xe_gt_throttle_sysfs.h b/drivers/gpu/drm/xe/xe_gt_throttle_sysfs.h new file mode 100644 index 000000000000..3ecfd4beffe1 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_throttle_sysfs.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GT_THROTTLE_SYSFS_H_ +#define _XE_GT_THROTTLE_SYSFS_H_ + +#include <drm/drm_managed.h> + +struct xe_gt; + +void xe_gt_throttle_sysfs_init(struct xe_gt *gt); + +#endif /* _XE_GT_THROTTLE_SYSFS_H_ */ + diff --git 
a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c new file mode 100644 index 000000000000..7eef23a00d77 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include "xe_gt_tlb_invalidation.h" + +#include "abi/guc_actions_abi.h" +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_guc.h" +#include "xe_guc_ct.h" +#include "xe_trace.h" + +#define TLB_TIMEOUT (HZ / 4) + +static void xe_gt_tlb_fence_timeout(struct work_struct *work) +{ + struct xe_gt *gt = container_of(work, struct xe_gt, + tlb_invalidation.fence_tdr.work); + struct xe_gt_tlb_invalidation_fence *fence, *next; + + spin_lock_irq(>->tlb_invalidation.pending_lock); + list_for_each_entry_safe(fence, next, + >->tlb_invalidation.pending_fences, link) { + s64 since_inval_ms = ktime_ms_delta(ktime_get(), + fence->invalidation_time); + + if (msecs_to_jiffies(since_inval_ms) < TLB_TIMEOUT) + break; + + trace_xe_gt_tlb_invalidation_fence_timeout(fence); + drm_err(>_to_xe(gt)->drm, "gt%d: TLB invalidation fence timeout, seqno=%d recv=%d", + gt->info.id, fence->seqno, gt->tlb_invalidation.seqno_recv); + + list_del(&fence->link); + fence->base.error = -ETIME; + dma_fence_signal(&fence->base); + dma_fence_put(&fence->base); + } + if (!list_empty(>->tlb_invalidation.pending_fences)) + queue_delayed_work(system_wq, + >->tlb_invalidation.fence_tdr, + TLB_TIMEOUT); + spin_unlock_irq(>->tlb_invalidation.pending_lock); +} + +/** + * xe_gt_tlb_invalidation_init - Initialize GT TLB invalidation state + * @gt: graphics tile + * + * Initialize GT TLB invalidation state, purely software initialization, should + * be called once during driver load. + * + * Return: 0 on success, negative error code on error. + */ +int xe_gt_tlb_invalidation_init(struct xe_gt *gt) +{ + gt->tlb_invalidation.seqno = 1; + INIT_LIST_HEAD(>->tlb_invalidation.pending_fences); + spin_lock_init(>->tlb_invalidation.pending_lock); + spin_lock_init(>->tlb_invalidation.lock); + gt->tlb_invalidation.fence_context = dma_fence_context_alloc(1); + INIT_DELAYED_WORK(>->tlb_invalidation.fence_tdr, + xe_gt_tlb_fence_timeout); + + return 0; +} + +static void +__invalidation_fence_signal(struct xe_gt_tlb_invalidation_fence *fence) +{ + trace_xe_gt_tlb_invalidation_fence_signal(fence); + dma_fence_signal(&fence->base); + dma_fence_put(&fence->base); +} + +static void +invalidation_fence_signal(struct xe_gt_tlb_invalidation_fence *fence) +{ + list_del(&fence->link); + __invalidation_fence_signal(fence); +} + +/** + * xe_gt_tlb_invalidation_reset - Initialize GT TLB invalidation reset + * @gt: graphics tile + * + * Signal any pending invalidation fences, should be called during a GT reset + */ +void xe_gt_tlb_invalidation_reset(struct xe_gt *gt) +{ + struct xe_gt_tlb_invalidation_fence *fence, *next; + struct xe_guc *guc = >->uc.guc; + int pending_seqno; + + /* + * CT channel is already disabled at this point. No new TLB requests can + * appear. + */ + + mutex_lock(>->uc.guc.ct.lock); + spin_lock_irq(>->tlb_invalidation.pending_lock); + cancel_delayed_work(>->tlb_invalidation.fence_tdr); + /* + * We might have various kworkers waiting for TLB flushes to complete + * which are not tracked with an explicit TLB fence, however at this + * stage that will never happen since the CT is already disabled, so + * make sure we signal them here under the assumption that we have + * completed a full GT reset. 
+ */ + if (gt->tlb_invalidation.seqno == 1) + pending_seqno = TLB_INVALIDATION_SEQNO_MAX - 1; + else + pending_seqno = gt->tlb_invalidation.seqno - 1; + WRITE_ONCE(gt->tlb_invalidation.seqno_recv, pending_seqno); + wake_up_all(&guc->ct.wq); + + list_for_each_entry_safe(fence, next, + >->tlb_invalidation.pending_fences, link) + invalidation_fence_signal(fence); + spin_unlock_irq(>->tlb_invalidation.pending_lock); + mutex_unlock(>->uc.guc.ct.lock); +} + +static bool tlb_invalidation_seqno_past(struct xe_gt *gt, int seqno) +{ + int seqno_recv = READ_ONCE(gt->tlb_invalidation.seqno_recv); + + if (seqno - seqno_recv < -(TLB_INVALIDATION_SEQNO_MAX / 2)) + return false; + + if (seqno - seqno_recv > (TLB_INVALIDATION_SEQNO_MAX / 2)) + return true; + + return seqno_recv >= seqno; +} + +static int send_tlb_invalidation(struct xe_guc *guc, + struct xe_gt_tlb_invalidation_fence *fence, + u32 *action, int len) +{ + struct xe_gt *gt = guc_to_gt(guc); + int seqno; + int ret; + + /* + * XXX: The seqno algorithm relies on TLB invalidation being processed + * in order which they currently are, if that changes the algorithm will + * need to be updated. + */ + + mutex_lock(&guc->ct.lock); + seqno = gt->tlb_invalidation.seqno; + if (fence) { + fence->seqno = seqno; + trace_xe_gt_tlb_invalidation_fence_send(fence); + } + action[1] = seqno; + ret = xe_guc_ct_send_locked(&guc->ct, action, len, + G2H_LEN_DW_TLB_INVALIDATE, 1); + if (!ret && fence) { + spin_lock_irq(>->tlb_invalidation.pending_lock); + /* + * We haven't actually published the TLB fence as per + * pending_fences, but in theory our seqno could have already + * been written as we acquired the pending_lock. In such a case + * we can just go ahead and signal the fence here. + */ + if (tlb_invalidation_seqno_past(gt, seqno)) { + __invalidation_fence_signal(fence); + } else { + fence->invalidation_time = ktime_get(); + list_add_tail(&fence->link, + >->tlb_invalidation.pending_fences); + + if (list_is_singular(>->tlb_invalidation.pending_fences)) + queue_delayed_work(system_wq, + >->tlb_invalidation.fence_tdr, + TLB_TIMEOUT); + } + spin_unlock_irq(>->tlb_invalidation.pending_lock); + } else if (ret < 0 && fence) { + __invalidation_fence_signal(fence); + } + if (!ret) { + gt->tlb_invalidation.seqno = (gt->tlb_invalidation.seqno + 1) % + TLB_INVALIDATION_SEQNO_MAX; + if (!gt->tlb_invalidation.seqno) + gt->tlb_invalidation.seqno = 1; + ret = seqno; + } + mutex_unlock(&guc->ct.lock); + + return ret; +} + +#define MAKE_INVAL_OP(type) ((type << XE_GUC_TLB_INVAL_TYPE_SHIFT) | \ + XE_GUC_TLB_INVAL_MODE_HEAVY << XE_GUC_TLB_INVAL_MODE_SHIFT | \ + XE_GUC_TLB_INVAL_FLUSH_CACHE) + +/** + * xe_gt_tlb_invalidation_guc - Issue a TLB invalidation on this GT for the GuC + * @gt: graphics tile + * + * Issue a TLB invalidation for the GuC. Completion of TLB is asynchronous and + * caller can use seqno + xe_gt_tlb_invalidation_wait to wait for completion. + * + * Return: Seqno which can be passed to xe_gt_tlb_invalidation_wait on success, + * negative error code on error. 
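Editorial aside: send_tlb_invalidation() above hands out seqnos from a space of TLB_INVALIDATION_SEQNO_MAX values, skipping 0 on wrap, and tlb_invalidation_seqno_past() compares two seqnos through a half-range window so the answer stays correct across the wrap. A user-space sketch of both helpers with a few spot checks (it assumes only the 0x100000 seqno space defined in this patch; everything else is illustrative):

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define SEQNO_MAX 0x100000	/* TLB_INVALIDATION_SEQNO_MAX */

/* Allocation step done under the CT lock: wrap and skip 0. */
static int next_seqno(int seqno)
{
	seqno = (seqno + 1) % SEQNO_MAX;
	return seqno ? seqno : 1;
}

/* Same windowed comparison as tlb_invalidation_seqno_past(). */
static bool seqno_past(int seqno_recv, int seqno)
{
	if (seqno - seqno_recv < -(SEQNO_MAX / 2))
		return false;
	if (seqno - seqno_recv > SEQNO_MAX / 2)
		return true;
	return seqno_recv >= seqno;
}

int main(void)
{
	/* Plain case: 3 was issued before the last received seqno 5. */
	assert(seqno_past(5, 3));
	/* Not yet received. */
	assert(!seqno_past(5, 6));
	/* Across the wrap: 2 is newer than SEQNO_MAX - 2, so not past... */
	assert(!seqno_past(SEQNO_MAX - 2, 2));
	/* ...while SEQNO_MAX - 2 is older than a received seqno of 2. */
	assert(seqno_past(2, SEQNO_MAX - 2));
	/* The allocator never hands out 0. */
	assert(next_seqno(SEQNO_MAX - 1) == 1);

	printf("seqno window checks passed\n");
	return 0;
}

The half-range window is what lets a plain integer comparison survive the modulo wrap, as long as no more than SEQNO_MAX / 2 invalidations are ever outstanding at once.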
+ */ +int xe_gt_tlb_invalidation_guc(struct xe_gt *gt) +{ + u32 action[] = { + XE_GUC_ACTION_TLB_INVALIDATION, + 0, /* seqno, replaced in send_tlb_invalidation */ + MAKE_INVAL_OP(XE_GUC_TLB_INVAL_GUC), + }; + + return send_tlb_invalidation(>->uc.guc, NULL, action, + ARRAY_SIZE(action)); +} + +/** + * xe_gt_tlb_invalidation_vma - Issue a TLB invalidation on this GT for a VMA + * @gt: graphics tile + * @fence: invalidation fence which will be signal on TLB invalidation + * completion, can be NULL + * @vma: VMA to invalidate + * + * Issue a range based TLB invalidation if supported, if not fallback to a full + * TLB invalidation. Completion of TLB is asynchronous and caller can either use + * the invalidation fence or seqno + xe_gt_tlb_invalidation_wait to wait for + * completion. + * + * Return: Seqno which can be passed to xe_gt_tlb_invalidation_wait on success, + * negative error code on error. + */ +int xe_gt_tlb_invalidation_vma(struct xe_gt *gt, + struct xe_gt_tlb_invalidation_fence *fence, + struct xe_vma *vma) +{ + struct xe_device *xe = gt_to_xe(gt); +#define MAX_TLB_INVALIDATION_LEN 7 + u32 action[MAX_TLB_INVALIDATION_LEN]; + int len = 0; + + xe_gt_assert(gt, vma); + + action[len++] = XE_GUC_ACTION_TLB_INVALIDATION; + action[len++] = 0; /* seqno, replaced in send_tlb_invalidation */ + if (!xe->info.has_range_tlb_invalidation) { + action[len++] = MAKE_INVAL_OP(XE_GUC_TLB_INVAL_FULL); + } else { + u64 start = xe_vma_start(vma); + u64 length = xe_vma_size(vma); + u64 align, end; + + if (length < SZ_4K) + length = SZ_4K; + + /* + * We need to invalidate a higher granularity if start address + * is not aligned to length. When start is not aligned with + * length we need to find the length large enough to create an + * address mask covering the required range. + */ + align = roundup_pow_of_two(length); + start = ALIGN_DOWN(xe_vma_start(vma), align); + end = ALIGN(xe_vma_end(vma), align); + length = align; + while (start + length < end) { + length <<= 1; + start = ALIGN_DOWN(xe_vma_start(vma), length); + } + + /* + * Minimum invalidation size for a 2MB page that the hardware + * expects is 16MB + */ + if (length >= SZ_2M) { + length = max_t(u64, SZ_16M, length); + start = ALIGN_DOWN(xe_vma_start(vma), length); + } + + xe_gt_assert(gt, length >= SZ_4K); + xe_gt_assert(gt, is_power_of_2(length)); + xe_gt_assert(gt, !(length & GENMASK(ilog2(SZ_16M) - 1, ilog2(SZ_2M) + 1))); + xe_gt_assert(gt, IS_ALIGNED(start, length)); + + action[len++] = MAKE_INVAL_OP(XE_GUC_TLB_INVAL_PAGE_SELECTIVE); + action[len++] = xe_vma_vm(vma)->usm.asid; + action[len++] = lower_32_bits(start); + action[len++] = upper_32_bits(start); + action[len++] = ilog2(length) - ilog2(SZ_4K); + } + + xe_gt_assert(gt, len <= MAX_TLB_INVALIDATION_LEN); + + return send_tlb_invalidation(>->uc.guc, fence, action, len); +} + +/** + * xe_gt_tlb_invalidation_wait - Wait for TLB to complete + * @gt: graphics tile + * @seqno: seqno to wait which was returned from xe_gt_tlb_invalidation + * + * Wait for 200ms for a TLB invalidation to complete, in practice we always + * should receive the TLB invalidation within 200ms. 
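Editorial aside: the range path in xe_gt_tlb_invalidation_vma() above has to describe the VMA with a single (base, power-of-two length) pair, so it grows and re-aligns the range until it covers the whole VMA and then encodes the length as a 4KiB page-order. A user-space version of just that address math, with one worked example (the helper names here are illustrative):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_4K  0x1000ULL
#define SZ_2M  0x200000ULL
#define SZ_16M 0x1000000ULL

static uint64_t roundup_pow_of_two(uint64_t v)
{
	uint64_t r = 1;

	while (r < v)
		r <<= 1;
	return r;
}

static uint64_t align_down(uint64_t v, uint64_t a) { return v & ~(a - 1); }
static uint64_t align_up(uint64_t v, uint64_t a)   { return (v + a - 1) & ~(a - 1); }
static unsigned int ilog2_u64(uint64_t v)          { return 63 - __builtin_clzll(v); }

/* Follows the start/length selection used for PAGE_SELECTIVE above. */
static void pick_range(uint64_t vma_start, uint64_t vma_end,
		       uint64_t *out_start, uint64_t *out_length)
{
	uint64_t length = vma_end - vma_start;
	uint64_t align, end, start;

	if (length < SZ_4K)
		length = SZ_4K;

	align = roundup_pow_of_two(length);
	start = align_down(vma_start, align);
	end = align_up(vma_end, align);
	length = align;
	while (start + length < end) {
		length <<= 1;
		start = align_down(vma_start, length);
	}

	/* 2MiB mappings need at least a 16MiB invalidation. */
	if (length >= SZ_2M) {
		if (length < SZ_16M)
			length = SZ_16M;
		start = align_down(vma_start, length);
	}

	*out_start = start;
	*out_length = length;
}

int main(void)
{
	uint64_t start, length;

	/* A 12KiB VMA at 0x101000 becomes a 16KiB block at 0x100000. */
	pick_range(0x101000, 0x104000, &start, &length);
	assert(start == 0x100000 && length == 0x4000);

	/* The dword sent to the GuC is the length as a 4KiB page-order. */
	printf("start=0x%llx length=0x%llx encoded=%u\n",
	       (unsigned long long)start, (unsigned long long)length,
	       ilog2_u64(length) - ilog2_u64(SZ_4K));
	return 0;
}

Growing the length (rather than issuing several smaller invalidations) trades some over-invalidation for a single GuC message per fault.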
+ * + * Return: 0 on success, -ETIME on TLB invalidation timeout + */ +int xe_gt_tlb_invalidation_wait(struct xe_gt *gt, int seqno) +{ + struct xe_device *xe = gt_to_xe(gt); + struct xe_guc *guc = >->uc.guc; + struct drm_printer p = drm_err_printer(__func__); + int ret; + + /* + * XXX: See above, this algorithm only works if seqno are always in + * order + */ + ret = wait_event_timeout(guc->ct.wq, + tlb_invalidation_seqno_past(gt, seqno), + TLB_TIMEOUT); + if (!ret) { + drm_err(&xe->drm, "gt%d: TLB invalidation time'd out, seqno=%d, recv=%d\n", + gt->info.id, seqno, gt->tlb_invalidation.seqno_recv); + xe_guc_ct_print(&guc->ct, &p, true); + return -ETIME; + } + + return 0; +} + +/** + * xe_guc_tlb_invalidation_done_handler - TLB invalidation done handler + * @guc: guc + * @msg: message indicating TLB invalidation done + * @len: length of message + * + * Parse seqno of TLB invalidation, wake any waiters for seqno, and signal any + * invalidation fences for seqno. Algorithm for this depends on seqno being + * received in-order and asserts this assumption. + * + * Return: 0 on success, -EPROTO for malformed messages. + */ +int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len) +{ + struct xe_gt *gt = guc_to_gt(guc); + struct xe_gt_tlb_invalidation_fence *fence, *next; + unsigned long flags; + + if (unlikely(len != 1)) + return -EPROTO; + + /* + * This can also be run both directly from the IRQ handler and also in + * process_g2h_msg(). Only one may process any individual CT message, + * however the order they are processed here could result in skipping a + * seqno. To handle that we just process all the seqnos from the last + * seqno_recv up to and including the one in msg[0]. The delta should be + * very small so there shouldn't be much of pending_fences we actually + * need to iterate over here. + * + * From GuC POV we expect the seqnos to always appear in-order, so if we + * see something later in the timeline we can be sure that anything + * appearing earlier has already signalled, just that we have yet to + * officially process the CT message like if racing against + * process_g2h_msg(). + */ + spin_lock_irqsave(>->tlb_invalidation.pending_lock, flags); + if (tlb_invalidation_seqno_past(gt, msg[0])) { + spin_unlock_irqrestore(>->tlb_invalidation.pending_lock, flags); + return 0; + } + + /* + * wake_up_all() and wait_event_timeout() already have the correct + * barriers. 
+ */ + WRITE_ONCE(gt->tlb_invalidation.seqno_recv, msg[0]); + wake_up_all(&guc->ct.wq); + + list_for_each_entry_safe(fence, next, + >->tlb_invalidation.pending_fences, link) { + trace_xe_gt_tlb_invalidation_fence_recv(fence); + + if (!tlb_invalidation_seqno_past(gt, fence->seqno)) + break; + + invalidation_fence_signal(fence); + } + + if (!list_empty(>->tlb_invalidation.pending_fences)) + mod_delayed_work(system_wq, + >->tlb_invalidation.fence_tdr, + TLB_TIMEOUT); + else + cancel_delayed_work(>->tlb_invalidation.fence_tdr); + + spin_unlock_irqrestore(>->tlb_invalidation.pending_lock, flags); + + return 0; +} diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h new file mode 100644 index 000000000000..b333c1709397 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GT_TLB_INVALIDATION_H_ +#define _XE_GT_TLB_INVALIDATION_H_ + +#include <linux/types.h> + +#include "xe_gt_tlb_invalidation_types.h" + +struct xe_gt; +struct xe_guc; +struct xe_vma; + +int xe_gt_tlb_invalidation_init(struct xe_gt *gt); +void xe_gt_tlb_invalidation_reset(struct xe_gt *gt); +int xe_gt_tlb_invalidation_guc(struct xe_gt *gt); +int xe_gt_tlb_invalidation_vma(struct xe_gt *gt, + struct xe_gt_tlb_invalidation_fence *fence, + struct xe_vma *vma); +int xe_gt_tlb_invalidation_wait(struct xe_gt *gt, int seqno); +int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len); + +#endif /* _XE_GT_TLB_INVALIDATION_ */ diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation_types.h b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation_types.h new file mode 100644 index 000000000000..934c828efe31 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation_types.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GT_TLB_INVALIDATION_TYPES_H_ +#define _XE_GT_TLB_INVALIDATION_TYPES_H_ + +#include <linux/dma-fence.h> + +/** + * struct xe_gt_tlb_invalidation_fence - XE GT TLB invalidation fence + * + * Optionally passed to xe_gt_tlb_invalidation and will be signaled upon TLB + * invalidation completion. + */ +struct xe_gt_tlb_invalidation_fence { + /** @base: dma fence base */ + struct dma_fence base; + /** @link: link into list of pending tlb fences */ + struct list_head link; + /** @seqno: seqno of TLB invalidation to signal fence one */ + int seqno; + /** @invalidation_time: time of TLB invalidation */ + ktime_t invalidation_time; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_gt_topology.c b/drivers/gpu/drm/xe/xe_gt_topology.c new file mode 100644 index 000000000000..a8d7f272c30a --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_topology.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_gt_topology.h" + +#include <linux/bitmap.h> + +#include "regs/xe_gt_regs.h" +#include "xe_gt.h" +#include "xe_mmio.h" + +#define XE_MAX_DSS_FUSE_BITS (32 * XE_MAX_DSS_FUSE_REGS) +#define XE_MAX_EU_FUSE_BITS (32 * XE_MAX_EU_FUSE_REGS) + +static void +load_dss_mask(struct xe_gt *gt, xe_dss_mask_t mask, int numregs, ...) 
+{ + va_list argp; + u32 fuse_val[XE_MAX_DSS_FUSE_REGS] = {}; + int i; + + if (drm_WARN_ON(>_to_xe(gt)->drm, numregs > XE_MAX_DSS_FUSE_REGS)) + numregs = XE_MAX_DSS_FUSE_REGS; + + va_start(argp, numregs); + for (i = 0; i < numregs; i++) + fuse_val[i] = xe_mmio_read32(gt, va_arg(argp, struct xe_reg)); + va_end(argp); + + bitmap_from_arr32(mask, fuse_val, numregs * 32); +} + +static void +load_eu_mask(struct xe_gt *gt, xe_eu_mask_t mask) +{ + struct xe_device *xe = gt_to_xe(gt); + u32 reg_val = xe_mmio_read32(gt, XELP_EU_ENABLE); + u32 val = 0; + int i; + + BUILD_BUG_ON(XE_MAX_EU_FUSE_REGS > 1); + + /* + * Pre-Xe_HP platforms inverted the bit meaning (disable instead + * of enable). + */ + if (GRAPHICS_VERx100(xe) < 1250) + reg_val = ~reg_val & XELP_EU_MASK; + + /* On PVC, one bit = one EU */ + if (GRAPHICS_VERx100(xe) == 1260) { + val = reg_val; + } else { + /* All other platforms, one bit = 2 EU */ + for (i = 0; i < fls(reg_val); i++) + if (reg_val & BIT(i)) + val |= 0x3 << 2 * i; + } + + bitmap_from_arr32(mask, &val, XE_MAX_EU_FUSE_BITS); +} + +static void +get_num_dss_regs(struct xe_device *xe, int *geometry_regs, int *compute_regs) +{ + if (GRAPHICS_VER(xe) > 20) { + *geometry_regs = 3; + *compute_regs = 3; + } else if (GRAPHICS_VERx100(xe) == 1260) { + *geometry_regs = 0; + *compute_regs = 2; + } else if (GRAPHICS_VERx100(xe) >= 1250) { + *geometry_regs = 1; + *compute_regs = 1; + } else { + *geometry_regs = 1; + *compute_regs = 0; + } +} + +void +xe_gt_topology_init(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + struct drm_printer p = drm_debug_printer("GT topology"); + int num_geometry_regs, num_compute_regs; + + get_num_dss_regs(xe, &num_geometry_regs, &num_compute_regs); + + /* + * Register counts returned shouldn't exceed the number of registers + * passed as parameters below. + */ + drm_WARN_ON(&xe->drm, num_geometry_regs > 3); + drm_WARN_ON(&xe->drm, num_compute_regs > 3); + + load_dss_mask(gt, gt->fuse_topo.g_dss_mask, + num_geometry_regs, + XELP_GT_GEOMETRY_DSS_ENABLE, + XE2_GT_GEOMETRY_DSS_1, + XE2_GT_GEOMETRY_DSS_2); + load_dss_mask(gt, gt->fuse_topo.c_dss_mask, num_compute_regs, + XEHP_GT_COMPUTE_DSS_ENABLE, + XEHPC_GT_COMPUTE_DSS_ENABLE_EXT, + XE2_GT_COMPUTE_DSS_2); + load_eu_mask(gt, gt->fuse_topo.eu_mask_per_dss); + + xe_gt_topology_dump(gt, &p); +} + +void +xe_gt_topology_dump(struct xe_gt *gt, struct drm_printer *p) +{ + drm_printf(p, "dss mask (geometry): %*pb\n", XE_MAX_DSS_FUSE_BITS, + gt->fuse_topo.g_dss_mask); + drm_printf(p, "dss mask (compute): %*pb\n", XE_MAX_DSS_FUSE_BITS, + gt->fuse_topo.c_dss_mask); + + drm_printf(p, "EU mask per DSS: %*pb\n", XE_MAX_EU_FUSE_BITS, + gt->fuse_topo.eu_mask_per_dss); + +} + +/* + * Used to obtain the index of the first DSS. Can start searching from the + * beginning of a specific dss group (e.g., gslice, cslice, etc.) if + * groupsize and groupnum are non-zero. + */ +unsigned int +xe_dss_mask_group_ffs(const xe_dss_mask_t mask, int groupsize, int groupnum) +{ + return find_next_bit(mask, XE_MAX_DSS_FUSE_BITS, groupnum * groupsize); +} + +bool xe_dss_mask_empty(const xe_dss_mask_t mask) +{ + return bitmap_empty(mask, XE_MAX_DSS_FUSE_BITS); +} + +/** + * xe_gt_topology_has_dss_in_quadrant - check fusing of DSS in GT quadrant + * @gt: GT to check + * @quad: Which quadrant of the DSS space to check + * + * Since Xe_HP platforms can have up to four CCS engines, those engines + * are each logically associated with a quarter of the possible DSS. 
If there + * are no DSS present in one of the four quadrants of the DSS space, the + * corresponding CCS engine is also not available for use. + * + * Returns false if all DSS in a quadrant of the GT are fused off, else true. + */ +bool xe_gt_topology_has_dss_in_quadrant(struct xe_gt *gt, int quad) +{ + struct xe_device *xe = gt_to_xe(gt); + xe_dss_mask_t all_dss; + int g_dss_regs, c_dss_regs, dss_per_quad, quad_first; + + bitmap_or(all_dss, gt->fuse_topo.g_dss_mask, gt->fuse_topo.c_dss_mask, + XE_MAX_DSS_FUSE_BITS); + + get_num_dss_regs(xe, &g_dss_regs, &c_dss_regs); + dss_per_quad = 32 * max(g_dss_regs, c_dss_regs) / 4; + + quad_first = xe_dss_mask_group_ffs(all_dss, dss_per_quad, quad); + + return quad_first < (quad + 1) * dss_per_quad; +} diff --git a/drivers/gpu/drm/xe/xe_gt_topology.h b/drivers/gpu/drm/xe/xe_gt_topology.h new file mode 100644 index 000000000000..d1b54fb52ea6 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_topology.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GT_TOPOLOGY_H_ +#define _XE_GT_TOPOLOGY_H_ + +#include "xe_gt_types.h" + +struct drm_printer; + +void xe_gt_topology_init(struct xe_gt *gt); + +void xe_gt_topology_dump(struct xe_gt *gt, struct drm_printer *p); + +unsigned int +xe_dss_mask_group_ffs(const xe_dss_mask_t mask, int groupsize, int groupnum); + +bool xe_dss_mask_empty(const xe_dss_mask_t mask); + +bool +xe_gt_topology_has_dss_in_quadrant(struct xe_gt *gt, int quad); + +#endif /* _XE_GT_TOPOLOGY_H_ */ diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h new file mode 100644 index 000000000000..f74684660475 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_gt_types.h @@ -0,0 +1,363 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022-2023 Intel Corporation + */ + +#ifndef _XE_GT_TYPES_H_ +#define _XE_GT_TYPES_H_ + +#include "xe_force_wake_types.h" +#include "xe_gt_idle_types.h" +#include "xe_hw_engine_types.h" +#include "xe_hw_fence_types.h" +#include "xe_reg_sr_types.h" +#include "xe_sa_types.h" +#include "xe_uc_types.h" + +struct xe_exec_queue_ops; +struct xe_migrate; +struct xe_ring_ops; + +enum xe_gt_type { + XE_GT_TYPE_UNINITIALIZED, + XE_GT_TYPE_MAIN, + XE_GT_TYPE_MEDIA, +}; + +#define XE_MAX_DSS_FUSE_REGS 3 +#define XE_MAX_EU_FUSE_REGS 1 + +typedef unsigned long xe_dss_mask_t[BITS_TO_LONGS(32 * XE_MAX_DSS_FUSE_REGS)]; +typedef unsigned long xe_eu_mask_t[BITS_TO_LONGS(32 * XE_MAX_EU_FUSE_REGS)]; + +struct xe_mmio_range { + u32 start; + u32 end; +}; + +/* + * The hardware has multiple kinds of multicast register ranges that need + * special register steering (and future platforms are expected to add + * additional types). + * + * During driver startup, we initialize the steering control register to + * direct reads to a slice/subslice that are valid for the 'subslice' class + * of multicast registers. If another type of steering does not have any + * overlap in valid steering targets with 'subslice' style registers, we will + * need to explicitly re-steer reads of registers of the other type. + * + * Only the replication types that may need additional non-default steering + * are listed here. + */ +enum xe_steering_type { + L3BANK, + MSLICE, + LNCF, + DSS, + OADDRM, + SQIDI_PSMI, + + /* + * On some platforms there are multiple types of MCR registers that + * will always return a non-terminated value at instance (0, 0). We'll + * lump those all into a single category to keep things simple. 
+ */ + INSTANCE0, + + /* + * Register ranges that don't need special steering for each register: + * it's sufficient to keep the HW-default for the selector, or only + * change it once, on GT initialization. This needs to be the last + * steering type. + */ + IMPLICIT_STEERING, + NUM_STEERING_TYPES +}; + +#define gt_to_tile(gt__) \ + _Generic(gt__, \ + const struct xe_gt * : (const struct xe_tile *)((gt__)->tile), \ + struct xe_gt * : (gt__)->tile) + +#define gt_to_xe(gt__) \ + _Generic(gt__, \ + const struct xe_gt * : (const struct xe_device *)(gt_to_tile(gt__)->xe), \ + struct xe_gt * : gt_to_tile(gt__)->xe) + +/** + * struct xe_gt - A "Graphics Technology" unit of the GPU + * + * A GT ("Graphics Technology") is the subset of a GPU primarily responsible + * for implementing the graphics, compute, and/or media IP. It encapsulates + * the hardware engines, programmable execution units, and GuC. Each GT has + * its own handling of power management (RC6+forcewake) and multicast register + * steering. + * + * A GPU/tile may have a single GT that supplies all graphics, compute, and + * media functionality, or the graphics/compute and media may be split into + * separate GTs within a tile. + */ +struct xe_gt { + /** @tile: Backpointer to GT's tile */ + struct xe_tile *tile; + + /** @info: GT info */ + struct { + /** @type: type of GT */ + enum xe_gt_type type; + /** @id: Unique ID of this GT within the PCI Device */ + u8 id; + /** @reference_clock: clock frequency */ + u32 reference_clock; + /** @engine_mask: mask of engines present on GT */ + u64 engine_mask; + /** + * @__engine_mask: mask of engines present on GT read from + * xe_pci.c, used to fake reading the engine_mask from the + * hwconfig blob. + */ + u64 __engine_mask; + } info; + + /** + * @mmio: mmio info for GT. All GTs within a tile share the same + * register space, but have their own copy of GSI registers at a + * specific offset, as well as their own forcewake handling. + */ + struct { + /** @fw: force wake for GT */ + struct xe_force_wake fw; + /** + * @adj_limit: adjust MMIO address if address is below this + * value + */ + u32 adj_limit; + /** @adj_offset: offect to add to MMIO address when adjusting */ + u32 adj_offset; + } mmio; + + /** + * @reg_sr: table with registers to be restored on GT init/resume/reset + */ + struct xe_reg_sr reg_sr; + + /** @reset: state for GT resets */ + struct { + /** + * @worker: work so GT resets can done async allowing to reset + * code to safely flush all code paths + */ + struct work_struct worker; + } reset; + + /** @tlb_invalidation: TLB invalidation state */ + struct { + /** @seqno: TLB invalidation seqno, protected by CT lock */ +#define TLB_INVALIDATION_SEQNO_MAX 0x100000 + int seqno; + /** + * @seqno_recv: last received TLB invalidation seqno, protected by CT lock + */ + int seqno_recv; + /** + * @pending_fences: list of pending fences waiting TLB + * invaliations, protected by CT lock + */ + struct list_head pending_fences; + /** + * @pending_lock: protects @pending_fences and updating + * @seqno_recv. + */ + spinlock_t pending_lock; + /** + * @fence_tdr: schedules a delayed call to + * xe_gt_tlb_fence_timeout after the timeut interval is over. 
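Editorial aside: the gt_to_tile()/gt_to_xe() macros above use C11 _Generic so that a const struct xe_gt * yields const pointers to the tile and device while a non-const pointer keeps full access, something a single inline function could not express. A minimal compile-and-run illustration of the trick with stand-in types:

#include <stdio.h>

struct demo_tile { int id; };
struct demo_gt { struct demo_tile *tile; };

/* Same shape as gt_to_tile(): the selected branch depends on constness. */
#define demo_gt_to_tile(gt__) \
	_Generic(gt__, \
		 const struct demo_gt * : (const struct demo_tile *)((gt__)->tile), \
		 struct demo_gt * : (gt__)->tile)

int main(void)
{
	struct demo_tile tile = { .id = 0 };
	struct demo_gt gt = { .tile = &tile };
	const struct demo_gt *cgt = &gt;

	/* Mutable path: the result is struct demo_tile *. */
	demo_gt_to_tile(&gt)->id = 1;

	/* Const path: the result is const struct demo_tile *, so only reads
	 * compile; a write through demo_gt_to_tile(cgt) would be a build
	 * error.
	 */
	printf("tile id via const gt: %d\n", demo_gt_to_tile(cgt)->id);
	return 0;
}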
+ */ + struct delayed_work fence_tdr; + /** @fence_context: context for TLB invalidation fences */ + u64 fence_context; + /** + * @fence_seqno: seqno to TLB invalidation fences, protected by + * tlb_invalidation.lock + */ + u32 fence_seqno; + /** @lock: protects TLB invalidation fences */ + spinlock_t lock; + } tlb_invalidation; + + /** + * @ccs_mode: Number of compute engines enabled. + * Allows fixed mapping of available compute slices to compute engines. + * By default only the first available compute engine is enabled and all + * available compute slices are allocated to it. + */ + u32 ccs_mode; + + /** @usm: unified shared memory state */ + struct { + /** + * @bb_pool: Pool from which batchbuffers, for USM operations + * (e.g. migrations, fixing page tables), are allocated. + * Dedicated pool needed so USM operations to not get blocked + * behind any user operations which may have resulted in a + * fault. + */ + struct xe_sa_manager *bb_pool; + /** + * @reserved_bcs_instance: reserved BCS instance used for USM + * operations (e.g. mmigrations, fixing page tables) + */ + u16 reserved_bcs_instance; + /** @pf_wq: page fault work queue, unbound, high priority */ + struct workqueue_struct *pf_wq; + /** @acc_wq: access counter work queue, unbound, high priority */ + struct workqueue_struct *acc_wq; + /** + * @pf_queue: Page fault queue used to sync faults so faults can + * be processed not under the GuC CT lock. The queue is sized so + * it can sync all possible faults (1 per physical engine). + * Multiple queues exists for page faults from different VMs are + * be processed in parallel. + */ + struct pf_queue { + /** @gt: back pointer to GT */ + struct xe_gt *gt; +#define PF_QUEUE_NUM_DW 128 + /** @data: data in the page fault queue */ + u32 data[PF_QUEUE_NUM_DW]; + /** + * @head: head pointer in DWs for page fault queue, + * moved by worker which processes faults. + */ + u16 head; + /** + * @tail: tail pointer in DWs for page fault queue, + * moved by G2H handler. + */ + u16 tail; + /** @lock: protects page fault queue */ + spinlock_t lock; + /** @worker: to process page faults */ + struct work_struct worker; +#define NUM_PF_QUEUE 4 + } pf_queue[NUM_PF_QUEUE]; + /** + * @acc_queue: Same as page fault queue, cannot process access + * counters under CT lock. + */ + struct acc_queue { + /** @gt: back pointer to GT */ + struct xe_gt *gt; +#define ACC_QUEUE_NUM_DW 128 + /** @data: data in the page fault queue */ + u32 data[ACC_QUEUE_NUM_DW]; + /** + * @head: head pointer in DWs for page fault queue, + * moved by worker which processes faults. + */ + u16 head; + /** + * @tail: tail pointer in DWs for page fault queue, + * moved by G2H handler. 
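Editorial aside: pf_queue and acc_queue documented here are plain rings of dwords. The G2H handlers copy 4-dword messages in at tail, the workers consume at head, and the *_queue_full() checks earlier in the file refuse a push unless more than one message's worth of dwords is free, so the ring never wraps onto itself and head == tail always means empty. A self-contained user-space model of that accounting (the CIRC_* macros follow the definitions in include/linux/circ_buf.h; the rest is illustrative):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define QUEUE_NUM_DW	128	/* must stay a power of two for CIRC_* */
#define MSG_LEN_DW	4

/* Same definitions as include/linux/circ_buf.h */
#define CIRC_CNT(head, tail, size)   (((head) - (tail)) & ((size) - 1))
#define CIRC_SPACE(head, tail, size) CIRC_CNT((tail), ((head) + 1), (size))

struct demo_queue {
	uint32_t data[QUEUE_NUM_DW];
	uint16_t head;	/* consumer index, in dwords */
	uint16_t tail;	/* producer index, in dwords */
};

static bool queue_full(const struct demo_queue *q)
{
	/* Refuse when no more than one message's worth of dwords remains. */
	return CIRC_SPACE(q->tail, q->head, QUEUE_NUM_DW) <= MSG_LEN_DW;
}

static bool queue_push(struct demo_queue *q, const uint32_t msg[MSG_LEN_DW])
{
	if (queue_full(q))
		return false;
	memcpy(q->data + q->tail, msg, MSG_LEN_DW * sizeof(uint32_t));
	q->tail = (q->tail + MSG_LEN_DW) % QUEUE_NUM_DW;
	return true;
}

static bool queue_pop(struct demo_queue *q, uint32_t msg[MSG_LEN_DW])
{
	if (q->head == q->tail)
		return false;	/* empty */
	memcpy(msg, q->data + q->head, MSG_LEN_DW * sizeof(uint32_t));
	q->head = (q->head + MSG_LEN_DW) % QUEUE_NUM_DW;
	return true;
}

int main(void)
{
	static struct demo_queue q;
	uint32_t msg[MSG_LEN_DW] = { 1, 2, 3, 4 };
	int pushed = 0;

	while (queue_push(&q, msg))
		pushed++;
	/* With this check, 128 dwords hold 31 four-dword messages. */
	printf("queued %d messages before the queue reported full\n", pushed);
	assert(pushed == 31);

	while (queue_pop(&q, msg))
		pushed--;
	assert(pushed == 0);
	return 0;
}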
+ */ + u16 tail; + /** @lock: protects page fault queue */ + spinlock_t lock; + /** @worker: to process access counters */ + struct work_struct worker; +#define NUM_ACC_QUEUE 4 + } acc_queue[NUM_ACC_QUEUE]; + } usm; + + /** @ordered_wq: used to serialize GT resets and TDRs */ + struct workqueue_struct *ordered_wq; + + /** @uc: micro controllers on the GT */ + struct xe_uc uc; + + /** @gtidle: idle properties of GT */ + struct xe_gt_idle gtidle; + + /** @exec_queue_ops: submission backend exec queue operations */ + const struct xe_exec_queue_ops *exec_queue_ops; + + /** + * @ring_ops: ring operations for this hw engine (1 per engine class) + */ + const struct xe_ring_ops *ring_ops[XE_ENGINE_CLASS_MAX]; + + /** @fence_irq: fence IRQs (1 per engine class) */ + struct xe_hw_fence_irq fence_irq[XE_ENGINE_CLASS_MAX]; + + /** @default_lrc: default LRC state */ + void *default_lrc[XE_ENGINE_CLASS_MAX]; + + /** @hw_engines: hardware engines on the GT */ + struct xe_hw_engine hw_engines[XE_NUM_HW_ENGINES]; + + /** @eclass: per hardware engine class interface on the GT */ + struct xe_hw_engine_class_intf eclass[XE_ENGINE_CLASS_MAX]; + + /** @pcode: GT's PCODE */ + struct { + /** @lock: protecting GT's PCODE mailbox data */ + struct mutex lock; + } pcode; + + /** @sysfs: sysfs' kobj used by xe_gt_sysfs */ + struct kobject *sysfs; + + /** @freq: Main GT freq sysfs control */ + struct kobject *freq; + + /** @mocs: info */ + struct { + /** @uc_index: UC index */ + u8 uc_index; + /** @wb_index: WB index, only used on L3_CCS platforms */ + u8 wb_index; + } mocs; + + /** @fuse_topo: GT topology reported by fuse registers */ + struct { + /** @g_dss_mask: dual-subslices usable by geometry */ + xe_dss_mask_t g_dss_mask; + + /** @c_dss_mask: dual-subslices usable by compute */ + xe_dss_mask_t c_dss_mask; + + /** @eu_mask_per_dss: EU mask per DSS*/ + xe_eu_mask_t eu_mask_per_dss; + } fuse_topo; + + /** @steering: register steering for individual HW units */ + struct { + /* @ranges: register ranges used for this steering type */ + const struct xe_mmio_range *ranges; + + /** @group_target: target to steer accesses to */ + u16 group_target; + /** @instance_target: instance to steer accesses to */ + u16 instance_target; + } steering[NUM_STEERING_TYPES]; + + /** + * @mcr_lock: protects the MCR_SELECTOR register for the duration + * of a steered operation + */ + spinlock_t mcr_lock; + + /** @wa_active: keep track of active workarounds */ + struct { + /** @gt: bitmap with active GT workarounds */ + unsigned long *gt; + /** @engine: bitmap with active engine workarounds */ + unsigned long *engine; + /** @lrc: bitmap with active LRC workarounds */ + unsigned long *lrc; + /** @oob: bitmap with active OOB workaroudns */ + unsigned long *oob; + } wa_active; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c new file mode 100644 index 000000000000..482cb0df9f15 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc.c @@ -0,0 +1,911 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_guc.h" + +#include <drm/drm_managed.h> + +#include "abi/guc_actions_abi.h" +#include "abi/guc_errors_abi.h" +#include "generated/xe_wa_oob.h" +#include "regs/xe_gt_regs.h" +#include "regs/xe_guc_regs.h" +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_force_wake.h" +#include "xe_gt.h" +#include "xe_guc_ads.h" +#include "xe_guc_ct.h" +#include "xe_guc_hwconfig.h" +#include "xe_guc_log.h" +#include "xe_guc_pc.h" +#include "xe_guc_submit.h" +#include "xe_mmio.h" 
+#include "xe_platform_types.h" +#include "xe_uc.h" +#include "xe_uc_fw.h" +#include "xe_wa.h" +#include "xe_wopcm.h" + +/* GuC addresses above GUC_GGTT_TOP also don't map through the GTT */ +#define GUC_GGTT_TOP 0xFEE00000 +static u32 guc_bo_ggtt_addr(struct xe_guc *guc, + struct xe_bo *bo) +{ + struct xe_device *xe = guc_to_xe(guc); + u32 addr = xe_bo_ggtt_addr(bo); + + xe_assert(xe, addr >= xe_wopcm_size(guc_to_xe(guc))); + xe_assert(xe, addr < GUC_GGTT_TOP); + xe_assert(xe, bo->size <= GUC_GGTT_TOP - addr); + + return addr; +} + +static u32 guc_ctl_debug_flags(struct xe_guc *guc) +{ + u32 level = xe_guc_log_get_level(&guc->log); + u32 flags = 0; + + if (!GUC_LOG_LEVEL_IS_VERBOSE(level)) + flags |= GUC_LOG_DISABLED; + else + flags |= GUC_LOG_LEVEL_TO_VERBOSITY(level) << + GUC_LOG_VERBOSITY_SHIFT; + + return flags; +} + +static u32 guc_ctl_feature_flags(struct xe_guc *guc) +{ + return GUC_CTL_ENABLE_SLPC; +} + +static u32 guc_ctl_log_params_flags(struct xe_guc *guc) +{ + u32 offset = guc_bo_ggtt_addr(guc, guc->log.bo) >> PAGE_SHIFT; + u32 flags; + + #if (((CRASH_BUFFER_SIZE) % SZ_1M) == 0) + #define LOG_UNIT SZ_1M + #define LOG_FLAG GUC_LOG_LOG_ALLOC_UNITS + #else + #define LOG_UNIT SZ_4K + #define LOG_FLAG 0 + #endif + + #if (((CAPTURE_BUFFER_SIZE) % SZ_1M) == 0) + #define CAPTURE_UNIT SZ_1M + #define CAPTURE_FLAG GUC_LOG_CAPTURE_ALLOC_UNITS + #else + #define CAPTURE_UNIT SZ_4K + #define CAPTURE_FLAG 0 + #endif + + BUILD_BUG_ON(!CRASH_BUFFER_SIZE); + BUILD_BUG_ON(!IS_ALIGNED(CRASH_BUFFER_SIZE, LOG_UNIT)); + BUILD_BUG_ON(!DEBUG_BUFFER_SIZE); + BUILD_BUG_ON(!IS_ALIGNED(DEBUG_BUFFER_SIZE, LOG_UNIT)); + BUILD_BUG_ON(!CAPTURE_BUFFER_SIZE); + BUILD_BUG_ON(!IS_ALIGNED(CAPTURE_BUFFER_SIZE, CAPTURE_UNIT)); + + BUILD_BUG_ON((CRASH_BUFFER_SIZE / LOG_UNIT - 1) > + (GUC_LOG_CRASH_MASK >> GUC_LOG_CRASH_SHIFT)); + BUILD_BUG_ON((DEBUG_BUFFER_SIZE / LOG_UNIT - 1) > + (GUC_LOG_DEBUG_MASK >> GUC_LOG_DEBUG_SHIFT)); + BUILD_BUG_ON((CAPTURE_BUFFER_SIZE / CAPTURE_UNIT - 1) > + (GUC_LOG_CAPTURE_MASK >> GUC_LOG_CAPTURE_SHIFT)); + + flags = GUC_LOG_VALID | + GUC_LOG_NOTIFY_ON_HALF_FULL | + CAPTURE_FLAG | + LOG_FLAG | + ((CRASH_BUFFER_SIZE / LOG_UNIT - 1) << GUC_LOG_CRASH_SHIFT) | + ((DEBUG_BUFFER_SIZE / LOG_UNIT - 1) << GUC_LOG_DEBUG_SHIFT) | + ((CAPTURE_BUFFER_SIZE / CAPTURE_UNIT - 1) << + GUC_LOG_CAPTURE_SHIFT) | + (offset << GUC_LOG_BUF_ADDR_SHIFT); + + #undef LOG_UNIT + #undef LOG_FLAG + #undef CAPTURE_UNIT + #undef CAPTURE_FLAG + + return flags; +} + +static u32 guc_ctl_ads_flags(struct xe_guc *guc) +{ + u32 ads = guc_bo_ggtt_addr(guc, guc->ads.bo) >> PAGE_SHIFT; + u32 flags = ads << GUC_ADS_ADDR_SHIFT; + + return flags; +} + +static u32 guc_ctl_wa_flags(struct xe_guc *guc) +{ + struct xe_device *xe = guc_to_xe(guc); + struct xe_gt *gt = guc_to_gt(guc); + u32 flags = 0; + + if (XE_WA(gt, 22012773006)) + flags |= GUC_WA_POLLCS; + + if (XE_WA(gt, 16011759253)) + flags |= GUC_WA_GAM_CREDITS; + + if (XE_WA(gt, 14014475959)) + flags |= GUC_WA_HOLD_CCS_SWITCHOUT; + + if (XE_WA(gt, 22011391025) || XE_WA(gt, 14012197797)) + flags |= GUC_WA_DUAL_QUEUE; + + /* + * Wa_22011802037: FIXME - there's more to be done than simply setting + * this flag: make sure each CS is stopped when preparing for GT reset + * and wait for pending MI_FW. 
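Editorial aside: guc_ctl_log_params_flags() above encodes each log buffer's size for the firmware as a count of allocation units minus one, picking 1MiB units when the configured size is a whole number of megabytes and falling back to 4KiB units otherwise; the BUILD_BUG_ON()s guarantee the resulting count fits its bitfield. A user-space sketch of that unit selection and encoding (the shift and flag values below are placeholders for the demo, not the driver's real GUC_LOG_* layout):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_4K 0x1000u
#define SZ_1M 0x100000u

/* Placeholder field layout, for illustration only. */
#define DEMO_COUNT_SHIFT	4
#define DEMO_BIG_UNIT_FLAG	(1u << 0)

static uint32_t encode_log_buffer(uint32_t size)
{
	uint32_t unit = (size % SZ_1M == 0) ? SZ_1M : SZ_4K;
	uint32_t flags = (unit == SZ_1M) ? DEMO_BIG_UNIT_FLAG : 0;

	assert(size && size % unit == 0);	/* mirrors the BUILD_BUG_ONs */
	return flags | ((size / unit - 1) << DEMO_COUNT_SHIFT);
}

int main(void)
{
	/* An 8MiB buffer encodes as 7 units of 1MiB plus the unit flag. */
	printf("8MiB  -> 0x%x\n", encode_log_buffer(8 * SZ_1M));
	/* A 64KiB buffer is not MiB-aligned, so it is counted in 4KiB units. */
	printf("64KiB -> 0x%x\n", encode_log_buffer(64 * SZ_4K));
	return 0;
}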
+ */ + if (GRAPHICS_VERx100(xe) < 1270) + flags |= GUC_WA_PRE_PARSER; + + if (XE_WA(gt, 16011777198)) + flags |= GUC_WA_RCS_RESET_BEFORE_RC6; + + if (XE_WA(gt, 22012727170) || XE_WA(gt, 22012727685)) + flags |= GUC_WA_CONTEXT_ISOLATION; + + if ((XE_WA(gt, 16015675438) || XE_WA(gt, 18020744125)) && + !xe_hw_engine_mask_per_class(gt, XE_ENGINE_CLASS_RENDER)) + flags |= GUC_WA_RCS_REGS_IN_CCS_REGS_LIST; + + if (XE_WA(gt, 1509372804)) + flags |= GUC_WA_RENDER_RST_RC6_EXIT; + + return flags; +} + +static u32 guc_ctl_devid(struct xe_guc *guc) +{ + struct xe_device *xe = guc_to_xe(guc); + + return (((u32)xe->info.devid) << 16) | xe->info.revid; +} + +static void guc_init_params(struct xe_guc *guc) +{ + struct xe_device *xe = guc_to_xe(guc); + u32 *params = guc->params; + int i; + + BUILD_BUG_ON(sizeof(guc->params) != GUC_CTL_MAX_DWORDS * sizeof(u32)); + BUILD_BUG_ON(GUC_CTL_MAX_DWORDS + 2 != SOFT_SCRATCH_COUNT); + + params[GUC_CTL_LOG_PARAMS] = guc_ctl_log_params_flags(guc); + params[GUC_CTL_FEATURE] = 0; + params[GUC_CTL_DEBUG] = guc_ctl_debug_flags(guc); + params[GUC_CTL_ADS] = guc_ctl_ads_flags(guc); + params[GUC_CTL_WA] = 0; + params[GUC_CTL_DEVID] = guc_ctl_devid(guc); + + for (i = 0; i < GUC_CTL_MAX_DWORDS; i++) + drm_dbg(&xe->drm, "GuC param[%2d] = 0x%08x\n", i, params[i]); +} + +static void guc_init_params_post_hwconfig(struct xe_guc *guc) +{ + struct xe_device *xe = guc_to_xe(guc); + u32 *params = guc->params; + int i; + + BUILD_BUG_ON(sizeof(guc->params) != GUC_CTL_MAX_DWORDS * sizeof(u32)); + BUILD_BUG_ON(GUC_CTL_MAX_DWORDS + 2 != SOFT_SCRATCH_COUNT); + + params[GUC_CTL_LOG_PARAMS] = guc_ctl_log_params_flags(guc); + params[GUC_CTL_FEATURE] = guc_ctl_feature_flags(guc); + params[GUC_CTL_DEBUG] = guc_ctl_debug_flags(guc); + params[GUC_CTL_ADS] = guc_ctl_ads_flags(guc); + params[GUC_CTL_WA] = guc_ctl_wa_flags(guc); + params[GUC_CTL_DEVID] = guc_ctl_devid(guc); + + for (i = 0; i < GUC_CTL_MAX_DWORDS; i++) + drm_dbg(&xe->drm, "GuC param[%2d] = 0x%08x\n", i, params[i]); +} + +/* + * Initialize the GuC parameter block before starting the firmware + * transfer. These parameters are read by the firmware on startup + * and cannot be changed thereafter. 
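Editorial aside: guc_ctl_devid() above hands the firmware the PCI device and revision IDs packed into a single parameter dword, device ID in the upper 16 bits and revision in the lower. A trivial stand-alone round-trip of that packing (the IDs used are arbitrary examples, not a claim about any particular part):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t pack_devid(uint16_t devid, uint16_t revid)
{
	return ((uint32_t)devid << 16) | revid;
}

int main(void)
{
	uint32_t v = pack_devid(0x1234, 0x08);	/* example values only */

	assert((v >> 16) == 0x1234 && (v & 0xffff) == 0x08);
	printf("GUC_CTL_DEVID dword: 0x%08x\n", v);
	return 0;
}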
+ */ +static void guc_write_params(struct xe_guc *guc) +{ + struct xe_gt *gt = guc_to_gt(guc); + int i; + + xe_force_wake_assert_held(gt_to_fw(gt), XE_FW_GT); + + xe_mmio_write32(gt, SOFT_SCRATCH(0), 0); + + for (i = 0; i < GUC_CTL_MAX_DWORDS; i++) + xe_mmio_write32(gt, SOFT_SCRATCH(1 + i), guc->params[i]); +} + +static void guc_fini(struct drm_device *drm, void *arg) +{ + struct xe_guc *guc = arg; + + xe_force_wake_get(gt_to_fw(guc_to_gt(guc)), XE_FORCEWAKE_ALL); + xe_guc_pc_fini(&guc->pc); + xe_uc_fini_hw(&guc_to_gt(guc)->uc); + xe_force_wake_put(gt_to_fw(guc_to_gt(guc)), XE_FORCEWAKE_ALL); +} + +int xe_guc_init(struct xe_guc *guc) +{ + struct xe_device *xe = guc_to_xe(guc); + struct xe_gt *gt = guc_to_gt(guc); + int ret; + + guc->fw.type = XE_UC_FW_TYPE_GUC; + ret = xe_uc_fw_init(&guc->fw); + if (ret) + goto out; + + if (!xe_uc_fw_is_enabled(&guc->fw)) + return 0; + + ret = xe_guc_log_init(&guc->log); + if (ret) + goto out; + + ret = xe_guc_ads_init(&guc->ads); + if (ret) + goto out; + + ret = xe_guc_ct_init(&guc->ct); + if (ret) + goto out; + + ret = xe_guc_pc_init(&guc->pc); + if (ret) + goto out; + + ret = drmm_add_action_or_reset(>_to_xe(gt)->drm, guc_fini, guc); + if (ret) + goto out; + + guc_init_params(guc); + + if (xe_gt_is_media_type(gt)) + guc->notify_reg = MED_GUC_HOST_INTERRUPT; + else + guc->notify_reg = GUC_HOST_INTERRUPT; + + xe_uc_fw_change_status(&guc->fw, XE_UC_FIRMWARE_LOADABLE); + + return 0; + +out: + drm_err(&xe->drm, "GuC init failed with %d", ret); + return ret; +} + +/** + * xe_guc_init_post_hwconfig - initialize GuC post hwconfig load + * @guc: The GuC object + * + * Return: 0 on success, negative error code on error. + */ +int xe_guc_init_post_hwconfig(struct xe_guc *guc) +{ + guc_init_params_post_hwconfig(guc); + + return xe_guc_ads_init_post_hwconfig(&guc->ads); +} + +int xe_guc_post_load_init(struct xe_guc *guc) +{ + xe_guc_ads_populate_post_load(&guc->ads); + guc->submission_state.enabled = true; + + return 0; +} + +int xe_guc_reset(struct xe_guc *guc) +{ + struct xe_device *xe = guc_to_xe(guc); + struct xe_gt *gt = guc_to_gt(guc); + u32 guc_status, gdrst; + int ret; + + xe_force_wake_assert_held(gt_to_fw(gt), XE_FW_GT); + + xe_mmio_write32(gt, GDRST, GRDOM_GUC); + + ret = xe_mmio_wait32(gt, GDRST, GRDOM_GUC, 0, 5000, &gdrst, false); + if (ret) { + drm_err(&xe->drm, "GuC reset timed out, GDRST=0x%8x\n", + gdrst); + goto err_out; + } + + guc_status = xe_mmio_read32(gt, GUC_STATUS); + if (!(guc_status & GS_MIA_IN_RESET)) { + drm_err(&xe->drm, + "GuC status: 0x%x, MIA core expected to be in reset\n", + guc_status); + ret = -EIO; + goto err_out; + } + + return 0; + +err_out: + + return ret; +} + +static void guc_prepare_xfer(struct xe_guc *guc) +{ + struct xe_gt *gt = guc_to_gt(guc); + struct xe_device *xe = guc_to_xe(guc); + u32 shim_flags = GUC_ENABLE_READ_CACHE_LOGIC | + GUC_ENABLE_READ_CACHE_FOR_SRAM_DATA | + GUC_ENABLE_READ_CACHE_FOR_WOPCM_DATA | + GUC_ENABLE_MIA_CLOCK_GATING; + + if (GRAPHICS_VERx100(xe) < 1250) + shim_flags |= GUC_DISABLE_SRAM_INIT_TO_ZEROES | + GUC_ENABLE_MIA_CACHING; + + if (GRAPHICS_VER(xe) >= 20 || xe->info.platform == XE_PVC) + shim_flags |= REG_FIELD_PREP(GUC_MOCS_INDEX_MASK, gt->mocs.uc_index); + + /* Must program this register before loading the ucode with DMA */ + xe_mmio_write32(gt, GUC_SHIM_CONTROL, shim_flags); + + xe_mmio_write32(gt, GT_PM_CONFIG, GT_DOORBELL_ENABLE); +} + +/* + * Supporting MMIO & in memory RSA + */ +static int guc_xfer_rsa(struct xe_guc *guc) +{ + struct xe_gt *gt = guc_to_gt(guc); + u32 
rsa[UOS_RSA_SCRATCH_COUNT]; + size_t copied; + int i; + + if (guc->fw.rsa_size > 256) { + u32 rsa_ggtt_addr = xe_bo_ggtt_addr(guc->fw.bo) + + xe_uc_fw_rsa_offset(&guc->fw); + xe_mmio_write32(gt, UOS_RSA_SCRATCH(0), rsa_ggtt_addr); + return 0; + } + + copied = xe_uc_fw_copy_rsa(&guc->fw, rsa, sizeof(rsa)); + if (copied < sizeof(rsa)) + return -ENOMEM; + + for (i = 0; i < UOS_RSA_SCRATCH_COUNT; i++) + xe_mmio_write32(gt, UOS_RSA_SCRATCH(i), rsa[i]); + + return 0; +} + +static int guc_wait_ucode(struct xe_guc *guc) +{ + struct xe_device *xe = guc_to_xe(guc); + u32 status; + int ret; + + /* + * Wait for the GuC to start up. + * NB: Docs recommend not using the interrupt for completion. + * Measurements indicate this should take no more than 20ms + * (assuming the GT clock is at maximum frequency). So, a + * timeout here indicates that the GuC has failed and is unusable. + * (Higher levels of the driver may decide to reset the GuC and + * attempt the ucode load again if this happens.) + * + * FIXME: There is a known (but exceedingly unlikely) race condition + * where the asynchronous frequency management code could reduce + * the GT clock while a GuC reload is in progress (during a full + * GT reset). A fix is in progress but there are complex locking + * issues to be resolved. In the meantime bump the timeout to + * 200ms. Even at slowest clock, this should be sufficient. And + * in the working case, a larger timeout makes no difference. + */ + ret = xe_mmio_wait32(guc_to_gt(guc), GUC_STATUS, GS_UKERNEL_MASK, + FIELD_PREP(GS_UKERNEL_MASK, XE_GUC_LOAD_STATUS_READY), + 200000, &status, false); + + if (ret) { + struct drm_device *drm = &xe->drm; + struct drm_printer p = drm_info_printer(drm->dev); + + drm_info(drm, "GuC load failed: status = 0x%08X\n", status); + drm_info(drm, "GuC load failed: status: Reset = %d, BootROM = 0x%02X, UKernel = 0x%02X, MIA = 0x%02X, Auth = 0x%02X\n", + REG_FIELD_GET(GS_MIA_IN_RESET, status), + REG_FIELD_GET(GS_BOOTROM_MASK, status), + REG_FIELD_GET(GS_UKERNEL_MASK, status), + REG_FIELD_GET(GS_MIA_MASK, status), + REG_FIELD_GET(GS_AUTH_STATUS_MASK, status)); + + if ((status & GS_BOOTROM_MASK) == GS_BOOTROM_RSA_FAILED) { + drm_info(drm, "GuC firmware signature verification failed\n"); + ret = -ENOEXEC; + } + + if (REG_FIELD_GET(GS_UKERNEL_MASK, status) == + XE_GUC_LOAD_STATUS_EXCEPTION) { + drm_info(drm, "GuC firmware exception. EIP: %#x\n", + xe_mmio_read32(guc_to_gt(guc), + SOFT_SCRATCH(13))); + ret = -ENXIO; + } + + xe_guc_log_print(&guc->log, &p); + } else { + drm_dbg(&xe->drm, "GuC successfully loaded"); + } + + return ret; +} + +static int __xe_guc_upload(struct xe_guc *guc) +{ + int ret; + + guc_write_params(guc); + guc_prepare_xfer(guc); + + /* + * Note that GuC needs the CSS header plus uKernel code to be copied + * by the DMA engine in one operation, whereas the RSA signature is + * loaded separately, either by copying it to the UOS_RSA_SCRATCH + * register (if key size <= 256) or through a ggtt-pinned vma (if key + * size > 256). The RSA size and therefore the way we provide it to the + * HW is fixed for each platform and hard-coded in the bootrom. + */ + ret = guc_xfer_rsa(guc); + if (ret) + goto out; + /* + * Current uCode expects the code to be loaded at 8k; locations below + * this are used for the stack. 
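+	 * That is why xe_uc_fw_upload() below is passed a load offset of
+	 * 0x2000 (8K).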
+ */ + ret = xe_uc_fw_upload(&guc->fw, 0x2000, UOS_MOVE); + if (ret) + goto out; + + /* Wait for authentication */ + ret = guc_wait_ucode(guc); + if (ret) + goto out; + + xe_uc_fw_change_status(&guc->fw, XE_UC_FIRMWARE_RUNNING); + return 0; + +out: + xe_uc_fw_change_status(&guc->fw, XE_UC_FIRMWARE_LOAD_FAIL); + return 0 /* FIXME: ret, don't want to stop load currently */; +} + +/** + * xe_guc_min_load_for_hwconfig - load minimal GuC and read hwconfig table + * @guc: The GuC object + * + * This function uploads a minimal GuC that does not support submissions but + * in a state where the hwconfig table can be read. Next, it reads and parses + * the hwconfig table so it can be used for subsequent steps in the driver load. + * Lastly, it enables CT communication (XXX: this is needed for PFs/VFs only). + * + * Return: 0 on success, negative error code on error. + */ +int xe_guc_min_load_for_hwconfig(struct xe_guc *guc) +{ + int ret; + + xe_guc_ads_populate_minimal(&guc->ads); + + ret = __xe_guc_upload(guc); + if (ret) + return ret; + + ret = xe_guc_hwconfig_init(guc); + if (ret) + return ret; + + ret = xe_guc_enable_communication(guc); + if (ret) + return ret; + + return 0; +} + +int xe_guc_upload(struct xe_guc *guc) +{ + xe_guc_ads_populate(&guc->ads); + + return __xe_guc_upload(guc); +} + +static void guc_handle_mmio_msg(struct xe_guc *guc) +{ + struct xe_gt *gt = guc_to_gt(guc); + u32 msg; + + xe_force_wake_assert_held(gt_to_fw(gt), XE_FW_GT); + + msg = xe_mmio_read32(gt, SOFT_SCRATCH(15)); + msg &= XE_GUC_RECV_MSG_EXCEPTION | + XE_GUC_RECV_MSG_CRASH_DUMP_POSTED; + xe_mmio_write32(gt, SOFT_SCRATCH(15), 0); + + if (msg & XE_GUC_RECV_MSG_CRASH_DUMP_POSTED) + drm_err(&guc_to_xe(guc)->drm, + "Received early GuC crash dump notification!\n"); + + if (msg & XE_GUC_RECV_MSG_EXCEPTION) + drm_err(&guc_to_xe(guc)->drm, + "Received early GuC exception notification!\n"); +} + +static void guc_enable_irq(struct xe_guc *guc) +{ + struct xe_gt *gt = guc_to_gt(guc); + u32 events = xe_gt_is_media_type(gt) ? + REG_FIELD_PREP(ENGINE0_MASK, GUC_INTR_GUC2HOST) : + REG_FIELD_PREP(ENGINE1_MASK, GUC_INTR_GUC2HOST); + + /* Primary GuC and media GuC share a single enable bit */ + xe_mmio_write32(gt, GUC_SG_INTR_ENABLE, + REG_FIELD_PREP(ENGINE1_MASK, GUC_INTR_GUC2HOST)); + + /* + * There are separate mask bits for primary and media GuCs, so use + * a RMW operation to avoid clobbering the other GuC's setting. + */ + xe_mmio_rmw32(gt, GUC_SG_INTR_MASK, events, 0); +} + +int xe_guc_enable_communication(struct xe_guc *guc) +{ + int err; + + guc_enable_irq(guc); + + xe_mmio_rmw32(guc_to_gt(guc), PMINTRMSK, + ARAT_EXPIRED_INTRMSK, 0); + + err = xe_guc_ct_enable(&guc->ct); + if (err) + return err; + + guc_handle_mmio_msg(guc); + + return 0; +} + +int xe_guc_suspend(struct xe_guc *guc) +{ + int ret; + u32 action[] = { + XE_GUC_ACTION_CLIENT_SOFT_RESET, + }; + + ret = xe_guc_mmio_send(guc, action, ARRAY_SIZE(action)); + if (ret) { + drm_err(&guc_to_xe(guc)->drm, + "GuC suspend: CLIENT_SOFT_RESET fail: %d!\n", ret); + return ret; + } + + xe_guc_sanitize(guc); + return 0; +} + +void xe_guc_notify(struct xe_guc *guc) +{ + struct xe_gt *gt = guc_to_gt(guc); + const u32 default_notify_data = 0; + + /* + * Both GUC_HOST_INTERRUPT and MED_GUC_HOST_INTERRUPT can pass + * additional payload data to the GuC but this capability is not + * used by the firmware yet. Use default value in the meantime. 
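+	 * A single dword write of default_notify_data (0) to guc->notify_reg,
+	 * which xe_guc_init() picks per GT type, is all that is needed to
+	 * notify the firmware.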
+ */ + xe_mmio_write32(gt, guc->notify_reg, default_notify_data); +} + +int xe_guc_auth_huc(struct xe_guc *guc, u32 rsa_addr) +{ + u32 action[] = { + XE_GUC_ACTION_AUTHENTICATE_HUC, + rsa_addr + }; + + return xe_guc_ct_send_block(&guc->ct, action, ARRAY_SIZE(action)); +} + +int xe_guc_mmio_send_recv(struct xe_guc *guc, const u32 *request, + u32 len, u32 *response_buf) +{ + struct xe_device *xe = guc_to_xe(guc); + struct xe_gt *gt = guc_to_gt(guc); + u32 header, reply; + struct xe_reg reply_reg = xe_gt_is_media_type(gt) ? + MED_VF_SW_FLAG(0) : VF_SW_FLAG(0); + const u32 LAST_INDEX = VF_SW_FLAG_COUNT - 1; + int ret; + int i; + + BUILD_BUG_ON(VF_SW_FLAG_COUNT != MED_VF_SW_FLAG_COUNT); + + xe_assert(xe, !guc->ct.enabled); + xe_assert(xe, len); + xe_assert(xe, len <= VF_SW_FLAG_COUNT); + xe_assert(xe, len <= MED_VF_SW_FLAG_COUNT); + xe_assert(xe, FIELD_GET(GUC_HXG_MSG_0_ORIGIN, request[0]) == + GUC_HXG_ORIGIN_HOST); + xe_assert(xe, FIELD_GET(GUC_HXG_MSG_0_TYPE, request[0]) == + GUC_HXG_TYPE_REQUEST); + +retry: + /* Not in critical data-path, just do if else for GT type */ + if (xe_gt_is_media_type(gt)) { + for (i = 0; i < len; ++i) + xe_mmio_write32(gt, MED_VF_SW_FLAG(i), + request[i]); + xe_mmio_read32(gt, MED_VF_SW_FLAG(LAST_INDEX)); + } else { + for (i = 0; i < len; ++i) + xe_mmio_write32(gt, VF_SW_FLAG(i), + request[i]); + xe_mmio_read32(gt, VF_SW_FLAG(LAST_INDEX)); + } + + xe_guc_notify(guc); + + ret = xe_mmio_wait32(gt, reply_reg, GUC_HXG_MSG_0_ORIGIN, + FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_GUC), + 50000, &reply, false); + if (ret) { +timeout: + drm_err(&xe->drm, "mmio request %#x: no reply %#x\n", + request[0], reply); + return ret; + } + + header = xe_mmio_read32(gt, reply_reg); + if (FIELD_GET(GUC_HXG_MSG_0_TYPE, header) == + GUC_HXG_TYPE_NO_RESPONSE_BUSY) { + /* + * Once we got a BUSY reply we must wait again for the final + * response but this time we can't use ORIGIN mask anymore. + * To spot a right change in the reply, we take advantage that + * response SUCCESS and FAILURE differ only by the single bit + * and all other bits are set and can be used as a new mask. 
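+		 * Concretely: the BUILD_BUG_ONs below pin RESPONSE_SUCCESS to
+		 * the all-ones TYPE encoding and RESPONSE_FAILURE to the same
+		 * value with bit 0 cleared, so resp_bits keeps every TYPE bit
+		 * except bit 0 and waiting for those bits to become set matches
+		 * exactly the two final reply types.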
+ */ + u32 resp_bits = GUC_HXG_TYPE_RESPONSE_SUCCESS & GUC_HXG_TYPE_RESPONSE_FAILURE; + u32 resp_mask = FIELD_PREP(GUC_HXG_MSG_0_TYPE, resp_bits); + + BUILD_BUG_ON(FIELD_MAX(GUC_HXG_MSG_0_TYPE) != GUC_HXG_TYPE_RESPONSE_SUCCESS); + BUILD_BUG_ON((GUC_HXG_TYPE_RESPONSE_SUCCESS ^ GUC_HXG_TYPE_RESPONSE_FAILURE) != 1); + + ret = xe_mmio_wait32(gt, reply_reg, resp_mask, resp_mask, + 1000000, &header, false); + + if (unlikely(FIELD_GET(GUC_HXG_MSG_0_ORIGIN, header) != + GUC_HXG_ORIGIN_GUC)) + goto proto; + if (unlikely(ret)) + goto timeout; + } + + if (FIELD_GET(GUC_HXG_MSG_0_TYPE, header) == + GUC_HXG_TYPE_NO_RESPONSE_RETRY) { + u32 reason = FIELD_GET(GUC_HXG_RETRY_MSG_0_REASON, header); + + drm_dbg(&xe->drm, "mmio request %#x: retrying, reason %#x\n", + request[0], reason); + goto retry; + } + + if (FIELD_GET(GUC_HXG_MSG_0_TYPE, header) == + GUC_HXG_TYPE_RESPONSE_FAILURE) { + u32 hint = FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, header); + u32 error = FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, header); + + drm_err(&xe->drm, "mmio request %#x: failure %#x/%#x\n", + request[0], error, hint); + return -ENXIO; + } + + if (FIELD_GET(GUC_HXG_MSG_0_TYPE, header) != + GUC_HXG_TYPE_RESPONSE_SUCCESS) { +proto: + drm_err(&xe->drm, "mmio request %#x: unexpected reply %#x\n", + request[0], header); + return -EPROTO; + } + + /* Just copy entire possible message response */ + if (response_buf) { + response_buf[0] = header; + + for (i = 1; i < VF_SW_FLAG_COUNT; i++) { + reply_reg.addr += sizeof(u32); + response_buf[i] = xe_mmio_read32(gt, reply_reg); + } + } + + /* Use data from the GuC response as our return value */ + return FIELD_GET(GUC_HXG_RESPONSE_MSG_0_DATA0, header); +} + +int xe_guc_mmio_send(struct xe_guc *guc, const u32 *request, u32 len) +{ + return xe_guc_mmio_send_recv(guc, request, len, NULL); +} + +static int guc_self_cfg(struct xe_guc *guc, u16 key, u16 len, u64 val) +{ + struct xe_device *xe = guc_to_xe(guc); + u32 request[HOST2GUC_SELF_CFG_REQUEST_MSG_LEN] = { + FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) | + FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) | + FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, + GUC_ACTION_HOST2GUC_SELF_CFG), + FIELD_PREP(HOST2GUC_SELF_CFG_REQUEST_MSG_1_KLV_KEY, key) | + FIELD_PREP(HOST2GUC_SELF_CFG_REQUEST_MSG_1_KLV_LEN, len), + FIELD_PREP(HOST2GUC_SELF_CFG_REQUEST_MSG_2_VALUE32, + lower_32_bits(val)), + FIELD_PREP(HOST2GUC_SELF_CFG_REQUEST_MSG_3_VALUE64, + upper_32_bits(val)), + }; + int ret; + + xe_assert(xe, len <= 2); + xe_assert(xe, len != 1 || !upper_32_bits(val)); + + /* Self config must go over MMIO */ + ret = xe_guc_mmio_send(guc, request, ARRAY_SIZE(request)); + + if (unlikely(ret < 0)) + return ret; + if (unlikely(ret > 1)) + return -EPROTO; + if (unlikely(!ret)) + return -ENOKEY; + + return 0; +} + +int xe_guc_self_cfg32(struct xe_guc *guc, u16 key, u32 val) +{ + return guc_self_cfg(guc, key, 1, val); +} + +int xe_guc_self_cfg64(struct xe_guc *guc, u16 key, u64 val) +{ + return guc_self_cfg(guc, key, 2, val); +} + +void xe_guc_irq_handler(struct xe_guc *guc, const u16 iir) +{ + if (iir & GUC_INTR_GUC2HOST) + xe_guc_ct_irq_handler(&guc->ct); +} + +void xe_guc_sanitize(struct xe_guc *guc) +{ + xe_uc_fw_change_status(&guc->fw, XE_UC_FIRMWARE_LOADABLE); + xe_guc_ct_disable(&guc->ct); + guc->submission_state.enabled = false; +} + +int xe_guc_reset_prepare(struct xe_guc *guc) +{ + return xe_guc_submit_reset_prepare(guc); +} + +void xe_guc_reset_wait(struct xe_guc *guc) +{ + xe_guc_submit_reset_wait(guc); +} + +void xe_guc_stop_prepare(struct xe_guc *guc) 
+{ + XE_WARN_ON(xe_guc_pc_stop(&guc->pc)); +} + +int xe_guc_stop(struct xe_guc *guc) +{ + int ret; + + xe_guc_ct_disable(&guc->ct); + + ret = xe_guc_submit_stop(guc); + if (ret) + return ret; + + return 0; +} + +int xe_guc_start(struct xe_guc *guc) +{ + int ret; + + ret = xe_guc_pc_start(&guc->pc); + XE_WARN_ON(ret); + + return xe_guc_submit_start(guc); +} + +void xe_guc_print_info(struct xe_guc *guc, struct drm_printer *p) +{ + struct xe_gt *gt = guc_to_gt(guc); + u32 status; + int err; + int i; + + xe_uc_fw_print(&guc->fw, p); + + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + if (err) + return; + + status = xe_mmio_read32(gt, GUC_STATUS); + + drm_printf(p, "\nGuC status 0x%08x:\n", status); + drm_printf(p, "\tBootrom status = 0x%x\n", + REG_FIELD_GET(GS_BOOTROM_MASK, status)); + drm_printf(p, "\tuKernel status = 0x%x\n", + REG_FIELD_GET(GS_UKERNEL_MASK, status)); + drm_printf(p, "\tMIA Core status = 0x%x\n", + REG_FIELD_GET(GS_MIA_MASK, status)); + drm_printf(p, "\tLog level = %d\n", + xe_guc_log_get_level(&guc->log)); + + drm_puts(p, "\nScratch registers:\n"); + for (i = 0; i < SOFT_SCRATCH_COUNT; i++) { + drm_printf(p, "\t%2d: \t0x%x\n", + i, xe_mmio_read32(gt, SOFT_SCRATCH(i))); + } + + xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); + + xe_guc_ct_print(&guc->ct, p, false); + xe_guc_submit_print(guc, p); +} + +/** + * xe_guc_in_reset() - Detect if GuC MIA is in reset. + * @guc: The GuC object + * + * This function detects runtime resume from d3cold by leveraging + * GUC_STATUS, GUC doesn't get reset during d3hot, + * it strictly to be called from RPM resume handler. + * + * Return: true if failed to get forcewake or GuC MIA is in Reset, + * otherwise false. + */ +bool xe_guc_in_reset(struct xe_guc *guc) +{ + struct xe_gt *gt = guc_to_gt(guc); + u32 status; + int err; + + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + if (err) + return true; + + status = xe_mmio_read32(gt, GUC_STATUS); + xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); + + return status & GS_MIA_IN_RESET; +} diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h new file mode 100644 index 000000000000..d3e49e7fd7c3 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GUC_H_ +#define _XE_GUC_H_ + +#include "xe_gt.h" +#include "xe_guc_types.h" +#include "xe_hw_engine_types.h" +#include "xe_macros.h" + +struct drm_printer; + +int xe_guc_init(struct xe_guc *guc); +int xe_guc_init_post_hwconfig(struct xe_guc *guc); +int xe_guc_post_load_init(struct xe_guc *guc); +int xe_guc_reset(struct xe_guc *guc); +int xe_guc_upload(struct xe_guc *guc); +int xe_guc_min_load_for_hwconfig(struct xe_guc *guc); +int xe_guc_enable_communication(struct xe_guc *guc); +int xe_guc_suspend(struct xe_guc *guc); +void xe_guc_notify(struct xe_guc *guc); +int xe_guc_auth_huc(struct xe_guc *guc, u32 rsa_addr); +int xe_guc_mmio_send(struct xe_guc *guc, const u32 *request, u32 len); +int xe_guc_mmio_send_recv(struct xe_guc *guc, const u32 *request, u32 len, + u32 *response_buf); +int xe_guc_self_cfg32(struct xe_guc *guc, u16 key, u32 val); +int xe_guc_self_cfg64(struct xe_guc *guc, u16 key, u64 val); +void xe_guc_irq_handler(struct xe_guc *guc, const u16 iir); +void xe_guc_sanitize(struct xe_guc *guc); +void xe_guc_print_info(struct xe_guc *guc, struct drm_printer *p); +int xe_guc_reset_prepare(struct xe_guc *guc); +void xe_guc_reset_wait(struct xe_guc *guc); +void xe_guc_stop_prepare(struct xe_guc *guc); +int xe_guc_stop(struct 
xe_guc *guc); +int xe_guc_start(struct xe_guc *guc); +bool xe_guc_in_reset(struct xe_guc *guc); + +static inline u16 xe_engine_class_to_guc_class(enum xe_engine_class class) +{ + switch (class) { + case XE_ENGINE_CLASS_RENDER: + return GUC_RENDER_CLASS; + case XE_ENGINE_CLASS_VIDEO_DECODE: + return GUC_VIDEO_CLASS; + case XE_ENGINE_CLASS_VIDEO_ENHANCE: + return GUC_VIDEOENHANCE_CLASS; + case XE_ENGINE_CLASS_COPY: + return GUC_BLITTER_CLASS; + case XE_ENGINE_CLASS_COMPUTE: + return GUC_COMPUTE_CLASS; + case XE_ENGINE_CLASS_OTHER: + return GUC_GSC_OTHER_CLASS; + default: + XE_WARN_ON(class); + return -1; + } +} + +static inline struct xe_gt *guc_to_gt(struct xe_guc *guc) +{ + return container_of(guc, struct xe_gt, uc.guc); +} + +static inline struct xe_device *guc_to_xe(struct xe_guc *guc) +{ + return gt_to_xe(guc_to_gt(guc)); +} + +#endif diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c new file mode 100644 index 000000000000..390e6f1bf4e1 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_ads.c @@ -0,0 +1,672 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_guc_ads.h" + +#include <drm/drm_managed.h> + +#include "regs/xe_engine_regs.h" +#include "regs/xe_gt_regs.h" +#include "regs/xe_guc_regs.h" +#include "xe_bo.h" +#include "xe_gt.h" +#include "xe_gt_ccs_mode.h" +#include "xe_guc.h" +#include "xe_hw_engine.h" +#include "xe_lrc.h" +#include "xe_map.h" +#include "xe_mmio.h" +#include "xe_platform_types.h" + +/* Slack of a few additional entries per engine */ +#define ADS_REGSET_EXTRA_MAX 8 + +static struct xe_guc * +ads_to_guc(struct xe_guc_ads *ads) +{ + return container_of(ads, struct xe_guc, ads); +} + +static struct xe_gt * +ads_to_gt(struct xe_guc_ads *ads) +{ + return container_of(ads, struct xe_gt, uc.guc.ads); +} + +static struct xe_device * +ads_to_xe(struct xe_guc_ads *ads) +{ + return gt_to_xe(ads_to_gt(ads)); +} + +static struct iosys_map * +ads_to_map(struct xe_guc_ads *ads) +{ + return &ads->bo->vmap; +} + +/* UM Queue parameters: */ +#define GUC_UM_QUEUE_SIZE (SZ_64K) +#define GUC_PAGE_RES_TIMEOUT_US (-1) + +/* + * The Additional Data Struct (ADS) has pointers for different buffers used by + * the GuC. One single gem object contains the ADS struct itself (guc_ads) and + * all the extra buffers indirectly linked via the ADS struct's entries. + * + * Layout of the ADS blob allocated for the GuC: + * + * +---------------------------------------+ <== base + * | guc_ads | + * +---------------------------------------+ + * | guc_policies | + * +---------------------------------------+ + * | guc_gt_system_info | + * +---------------------------------------+ + * | guc_engine_usage | + * +---------------------------------------+ + * | guc_um_init_params | + * +---------------------------------------+ <== static + * | guc_mmio_reg[countA] (engine 0.0) | + * | guc_mmio_reg[countB] (engine 0.1) | + * | guc_mmio_reg[countC] (engine 1.0) | + * | ... 
| + * +---------------------------------------+ <== dynamic + * | padding | + * +---------------------------------------+ <== 4K aligned + * | golden contexts | + * +---------------------------------------+ + * | padding | + * +---------------------------------------+ <== 4K aligned + * | capture lists | + * +---------------------------------------+ + * | padding | + * +---------------------------------------+ <== 4K aligned + * | UM queues | + * +---------------------------------------+ + * | padding | + * +---------------------------------------+ <== 4K aligned + * | private data | + * +---------------------------------------+ + * | padding | + * +---------------------------------------+ <== 4K aligned + */ +struct __guc_ads_blob { + struct guc_ads ads; + struct guc_policies policies; + struct guc_gt_system_info system_info; + struct guc_engine_usage engine_usage; + struct guc_um_init_params um_init_params; + /* From here on, location is dynamic! Refer to above diagram. */ + struct guc_mmio_reg regset[0]; +} __packed; + +#define ads_blob_read(ads_, field_) \ + xe_map_rd_field(ads_to_xe(ads_), ads_to_map(ads_), 0, \ + struct __guc_ads_blob, field_) + +#define ads_blob_write(ads_, field_, val_) \ + xe_map_wr_field(ads_to_xe(ads_), ads_to_map(ads_), 0, \ + struct __guc_ads_blob, field_, val_) + +#define info_map_write(xe_, map_, field_, val_) \ + xe_map_wr_field(xe_, map_, 0, struct guc_gt_system_info, field_, val_) + +#define info_map_read(xe_, map_, field_) \ + xe_map_rd_field(xe_, map_, 0, struct guc_gt_system_info, field_) + +static size_t guc_ads_regset_size(struct xe_guc_ads *ads) +{ + struct xe_device *xe = ads_to_xe(ads); + + xe_assert(xe, ads->regset_size); + + return ads->regset_size; +} + +static size_t guc_ads_golden_lrc_size(struct xe_guc_ads *ads) +{ + return PAGE_ALIGN(ads->golden_lrc_size); +} + +static size_t guc_ads_capture_size(struct xe_guc_ads *ads) +{ + /* FIXME: Allocate a proper capture list */ + return PAGE_ALIGN(PAGE_SIZE); +} + +static size_t guc_ads_um_queues_size(struct xe_guc_ads *ads) +{ + struct xe_device *xe = ads_to_xe(ads); + + if (!xe->info.has_usm) + return 0; + + return GUC_UM_QUEUE_SIZE * GUC_UM_HW_QUEUE_MAX; +} + +static size_t guc_ads_private_data_size(struct xe_guc_ads *ads) +{ + return PAGE_ALIGN(ads_to_guc(ads)->fw.private_data_size); +} + +static size_t guc_ads_regset_offset(struct xe_guc_ads *ads) +{ + return offsetof(struct __guc_ads_blob, regset); +} + +static size_t guc_ads_golden_lrc_offset(struct xe_guc_ads *ads) +{ + size_t offset; + + offset = guc_ads_regset_offset(ads) + + guc_ads_regset_size(ads); + + return PAGE_ALIGN(offset); +} + +static size_t guc_ads_capture_offset(struct xe_guc_ads *ads) +{ + size_t offset; + + offset = guc_ads_golden_lrc_offset(ads) + + guc_ads_golden_lrc_size(ads); + + return PAGE_ALIGN(offset); +} + +static size_t guc_ads_um_queues_offset(struct xe_guc_ads *ads) +{ + u32 offset; + + offset = guc_ads_capture_offset(ads) + + guc_ads_capture_size(ads); + + return PAGE_ALIGN(offset); +} + +static size_t guc_ads_private_data_offset(struct xe_guc_ads *ads) +{ + size_t offset; + + offset = guc_ads_um_queues_offset(ads) + + guc_ads_um_queues_size(ads); + + return PAGE_ALIGN(offset); +} + +static size_t guc_ads_size(struct xe_guc_ads *ads) +{ + return guc_ads_private_data_offset(ads) + + guc_ads_private_data_size(ads); +} + +static bool needs_wa_1607983814(struct xe_device *xe) +{ + return GRAPHICS_VERx100(xe) < 1250; +} + +static size_t calculate_regset_size(struct xe_gt *gt) +{ + struct xe_reg_sr_entry *sr_entry; + 
unsigned long sr_idx; + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + unsigned int count = 0; + + for_each_hw_engine(hwe, gt, id) + xa_for_each(&hwe->reg_sr.xa, sr_idx, sr_entry) + count++; + + count += ADS_REGSET_EXTRA_MAX * XE_NUM_HW_ENGINES; + + if (needs_wa_1607983814(gt_to_xe(gt))) + count += LNCFCMOCS_REG_COUNT; + + return count * sizeof(struct guc_mmio_reg); +} + +static u32 engine_enable_mask(struct xe_gt *gt, enum xe_engine_class class) +{ + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + u32 mask = 0; + + for_each_hw_engine(hwe, gt, id) + if (hwe->class == class) + mask |= BIT(hwe->instance); + + return mask; +} + +static size_t calculate_golden_lrc_size(struct xe_guc_ads *ads) +{ + struct xe_device *xe = ads_to_xe(ads); + struct xe_gt *gt = ads_to_gt(ads); + size_t total_size = 0, alloc_size, real_size; + int class; + + for (class = 0; class < XE_ENGINE_CLASS_MAX; ++class) { + if (!engine_enable_mask(gt, class)) + continue; + + real_size = xe_lrc_size(xe, class); + alloc_size = PAGE_ALIGN(real_size); + total_size += alloc_size; + } + + return total_size; +} + +#define MAX_GOLDEN_LRC_SIZE (SZ_4K * 64) + +int xe_guc_ads_init(struct xe_guc_ads *ads) +{ + struct xe_device *xe = ads_to_xe(ads); + struct xe_gt *gt = ads_to_gt(ads); + struct xe_tile *tile = gt_to_tile(gt); + struct xe_bo *bo; + + ads->golden_lrc_size = calculate_golden_lrc_size(ads); + ads->regset_size = calculate_regset_size(gt); + + bo = xe_managed_bo_create_pin_map(xe, tile, guc_ads_size(ads) + MAX_GOLDEN_LRC_SIZE, + XE_BO_CREATE_VRAM_IF_DGFX(tile) | + XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(bo)) + return PTR_ERR(bo); + + ads->bo = bo; + + return 0; +} + +/** + * xe_guc_ads_init_post_hwconfig - initialize ADS post hwconfig load + * @ads: Additional data structures object + * + * Recalcuate golden_lrc_size & regset_size as the number hardware engines may + * have changed after the hwconfig was loaded. Also verify the new sizes fit in + * the already allocated ADS buffer object. + * + * Return: 0 on success, negative error code on error. 
+ */ +int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads) +{ + struct xe_gt *gt = ads_to_gt(ads); + u32 prev_regset_size = ads->regset_size; + + xe_gt_assert(gt, ads->bo); + + ads->golden_lrc_size = calculate_golden_lrc_size(ads); + ads->regset_size = calculate_regset_size(gt); + + xe_gt_assert(gt, ads->golden_lrc_size + + (ads->regset_size - prev_regset_size) <= + MAX_GOLDEN_LRC_SIZE); + + return 0; +} + +static void guc_policies_init(struct xe_guc_ads *ads) +{ + ads_blob_write(ads, policies.dpc_promote_time, + GLOBAL_POLICY_DEFAULT_DPC_PROMOTE_TIME_US); + ads_blob_write(ads, policies.max_num_work_items, + GLOBAL_POLICY_MAX_NUM_WI); + ads_blob_write(ads, policies.global_flags, 0); + ads_blob_write(ads, policies.is_valid, 1); +} + +static void fill_engine_enable_masks(struct xe_gt *gt, + struct iosys_map *info_map) +{ + struct xe_device *xe = gt_to_xe(gt); + + info_map_write(xe, info_map, engine_enabled_masks[GUC_RENDER_CLASS], + engine_enable_mask(gt, XE_ENGINE_CLASS_RENDER)); + info_map_write(xe, info_map, engine_enabled_masks[GUC_BLITTER_CLASS], + engine_enable_mask(gt, XE_ENGINE_CLASS_COPY)); + info_map_write(xe, info_map, engine_enabled_masks[GUC_VIDEO_CLASS], + engine_enable_mask(gt, XE_ENGINE_CLASS_VIDEO_DECODE)); + info_map_write(xe, info_map, + engine_enabled_masks[GUC_VIDEOENHANCE_CLASS], + engine_enable_mask(gt, XE_ENGINE_CLASS_VIDEO_ENHANCE)); + info_map_write(xe, info_map, engine_enabled_masks[GUC_COMPUTE_CLASS], + engine_enable_mask(gt, XE_ENGINE_CLASS_COMPUTE)); + info_map_write(xe, info_map, engine_enabled_masks[GUC_GSC_OTHER_CLASS], + engine_enable_mask(gt, XE_ENGINE_CLASS_OTHER)); +} + +static void guc_prep_golden_lrc_null(struct xe_guc_ads *ads) +{ + struct xe_device *xe = ads_to_xe(ads); + struct iosys_map info_map = IOSYS_MAP_INIT_OFFSET(ads_to_map(ads), + offsetof(struct __guc_ads_blob, system_info)); + u8 guc_class; + + for (guc_class = 0; guc_class <= GUC_MAX_ENGINE_CLASSES; ++guc_class) { + if (!info_map_read(xe, &info_map, + engine_enabled_masks[guc_class])) + continue; + + ads_blob_write(ads, ads.eng_state_size[guc_class], + guc_ads_golden_lrc_size(ads) - + xe_lrc_skip_size(xe)); + ads_blob_write(ads, ads.golden_context_lrca[guc_class], + xe_bo_ggtt_addr(ads->bo) + + guc_ads_golden_lrc_offset(ads)); + } +} + +static void guc_mapping_table_init_invalid(struct xe_gt *gt, + struct iosys_map *info_map) +{ + struct xe_device *xe = gt_to_xe(gt); + unsigned int i, j; + + /* Table must be set to invalid values for entries not used */ + for (i = 0; i < GUC_MAX_ENGINE_CLASSES; ++i) + for (j = 0; j < GUC_MAX_INSTANCES_PER_CLASS; ++j) + info_map_write(xe, info_map, mapping_table[i][j], + GUC_MAX_INSTANCES_PER_CLASS); +} + +static void guc_mapping_table_init(struct xe_gt *gt, + struct iosys_map *info_map) +{ + struct xe_device *xe = gt_to_xe(gt); + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + + guc_mapping_table_init_invalid(gt, info_map); + + for_each_hw_engine(hwe, gt, id) { + u8 guc_class; + + guc_class = xe_engine_class_to_guc_class(hwe->class); + info_map_write(xe, info_map, + mapping_table[guc_class][hwe->logical_instance], + hwe->instance); + } +} + +static void guc_capture_list_init(struct xe_guc_ads *ads) +{ + int i, j; + u32 addr = xe_bo_ggtt_addr(ads->bo) + guc_ads_capture_offset(ads); + + /* FIXME: Populate a proper capture list */ + for (i = 0; i < GUC_CAPTURE_LIST_INDEX_MAX; i++) { + for (j = 0; j < GUC_MAX_ENGINE_CLASSES; j++) { + ads_blob_write(ads, ads.capture_instance[i][j], addr); + ads_blob_write(ads, ads.capture_class[i][j], addr); + } + + 
ads_blob_write(ads, ads.capture_global[i], addr); + } +} + +static void guc_mmio_regset_write_one(struct xe_guc_ads *ads, + struct iosys_map *regset_map, + struct xe_reg reg, + unsigned int n_entry) +{ + struct guc_mmio_reg entry = { + .offset = reg.addr, + .flags = reg.masked ? GUC_REGSET_MASKED : 0, + }; + + xe_map_memcpy_to(ads_to_xe(ads), regset_map, n_entry * sizeof(entry), + &entry, sizeof(entry)); +} + +static unsigned int guc_mmio_regset_write(struct xe_guc_ads *ads, + struct iosys_map *regset_map, + struct xe_hw_engine *hwe) +{ + struct xe_device *xe = ads_to_xe(ads); + struct xe_hw_engine *hwe_rcs_reset_domain = + xe_gt_any_hw_engine_by_reset_domain(hwe->gt, XE_ENGINE_CLASS_RENDER); + struct xe_reg_sr_entry *entry; + unsigned long idx; + unsigned int count = 0; + const struct { + struct xe_reg reg; + bool skip; + } *e, extra_regs[] = { + { .reg = RING_MODE(hwe->mmio_base), }, + { .reg = RING_HWS_PGA(hwe->mmio_base), }, + { .reg = RING_IMR(hwe->mmio_base), }, + { .reg = RCU_MODE, .skip = hwe != hwe_rcs_reset_domain }, + { .reg = CCS_MODE, + .skip = hwe != hwe_rcs_reset_domain || !xe_gt_ccs_mode_enabled(hwe->gt) }, + }; + u32 i; + + BUILD_BUG_ON(ARRAY_SIZE(extra_regs) > ADS_REGSET_EXTRA_MAX); + + xa_for_each(&hwe->reg_sr.xa, idx, entry) + guc_mmio_regset_write_one(ads, regset_map, entry->reg, count++); + + for (e = extra_regs; e < extra_regs + ARRAY_SIZE(extra_regs); e++) { + if (e->skip) + continue; + + guc_mmio_regset_write_one(ads, regset_map, e->reg, count++); + } + + /* Wa_1607983814 */ + if (needs_wa_1607983814(xe) && hwe->class == XE_ENGINE_CLASS_RENDER) { + for (i = 0; i < LNCFCMOCS_REG_COUNT; i++) { + guc_mmio_regset_write_one(ads, regset_map, + XELP_LNCFCMOCS(i), count++); + } + } + + return count; +} + +static void guc_mmio_reg_state_init(struct xe_guc_ads *ads) +{ + size_t regset_offset = guc_ads_regset_offset(ads); + struct xe_gt *gt = ads_to_gt(ads); + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + u32 addr = xe_bo_ggtt_addr(ads->bo) + regset_offset; + struct iosys_map regset_map = IOSYS_MAP_INIT_OFFSET(ads_to_map(ads), + regset_offset); + unsigned int regset_used = 0; + + for_each_hw_engine(hwe, gt, id) { + unsigned int count; + u8 gc; + + /* + * 1. Write all MMIO entries for this exec queue to the table. No + * need to worry about fused-off engines and when there are + * entries in the regset: the reg_state_list has been zero'ed + * by xe_guc_ads_populate() + */ + count = guc_mmio_regset_write(ads, ®set_map, hwe); + if (!count) + continue; + + /* + * 2. 
Record in the header (ads.reg_state_list) the address + * location and number of entries + */ + gc = xe_engine_class_to_guc_class(hwe->class); + ads_blob_write(ads, ads.reg_state_list[gc][hwe->instance].address, addr); + ads_blob_write(ads, ads.reg_state_list[gc][hwe->instance].count, count); + + addr += count * sizeof(struct guc_mmio_reg); + iosys_map_incr(®set_map, count * sizeof(struct guc_mmio_reg)); + + regset_used += count * sizeof(struct guc_mmio_reg); + } + + xe_gt_assert(gt, regset_used <= ads->regset_size); +} + +static void guc_um_init_params(struct xe_guc_ads *ads) +{ + u32 um_queue_offset = guc_ads_um_queues_offset(ads); + u64 base_dpa; + u32 base_ggtt; + int i; + + base_ggtt = xe_bo_ggtt_addr(ads->bo) + um_queue_offset; + base_dpa = xe_bo_main_addr(ads->bo, PAGE_SIZE) + um_queue_offset; + + for (i = 0; i < GUC_UM_HW_QUEUE_MAX; ++i) { + ads_blob_write(ads, um_init_params.queue_params[i].base_dpa, + base_dpa + (i * GUC_UM_QUEUE_SIZE)); + ads_blob_write(ads, um_init_params.queue_params[i].base_ggtt_address, + base_ggtt + (i * GUC_UM_QUEUE_SIZE)); + ads_blob_write(ads, um_init_params.queue_params[i].size_in_bytes, + GUC_UM_QUEUE_SIZE); + } + + ads_blob_write(ads, um_init_params.page_response_timeout_in_us, + GUC_PAGE_RES_TIMEOUT_US); +} + +static void guc_doorbell_init(struct xe_guc_ads *ads) +{ + struct xe_device *xe = ads_to_xe(ads); + struct xe_gt *gt = ads_to_gt(ads); + + if (GRAPHICS_VER(xe) >= 12 && !IS_DGFX(xe)) { + u32 distdbreg = + xe_mmio_read32(gt, DIST_DBS_POPULATED); + + ads_blob_write(ads, + system_info.generic_gt_sysinfo[GUC_GENERIC_GT_SYSINFO_DOORBELL_COUNT_PER_SQIDI], + REG_FIELD_GET(DOORBELLS_PER_SQIDI_MASK, distdbreg) + 1); + } +} + +/** + * xe_guc_ads_populate_minimal - populate minimal ADS + * @ads: Additional data structures object + * + * This function populates a minimal ADS that does not support submissions but + * enough so the GuC can load and the hwconfig table can be read. 
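+ * Compared to xe_guc_ads_populate(), this skips the engine enable masks,
+ * the register save/restore list, the capture lists and the UM queue
+ * parameters, and leaves the engine mapping table marked invalid.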
+ */ +void xe_guc_ads_populate_minimal(struct xe_guc_ads *ads) +{ + struct xe_gt *gt = ads_to_gt(ads); + struct iosys_map info_map = IOSYS_MAP_INIT_OFFSET(ads_to_map(ads), + offsetof(struct __guc_ads_blob, system_info)); + u32 base = xe_bo_ggtt_addr(ads->bo); + + xe_gt_assert(gt, ads->bo); + + xe_map_memset(ads_to_xe(ads), ads_to_map(ads), 0, 0, ads->bo->size); + guc_policies_init(ads); + guc_prep_golden_lrc_null(ads); + guc_mapping_table_init_invalid(gt, &info_map); + guc_doorbell_init(ads); + + ads_blob_write(ads, ads.scheduler_policies, base + + offsetof(struct __guc_ads_blob, policies)); + ads_blob_write(ads, ads.gt_system_info, base + + offsetof(struct __guc_ads_blob, system_info)); + ads_blob_write(ads, ads.private_data, base + + guc_ads_private_data_offset(ads)); +} + +void xe_guc_ads_populate(struct xe_guc_ads *ads) +{ + struct xe_device *xe = ads_to_xe(ads); + struct xe_gt *gt = ads_to_gt(ads); + struct iosys_map info_map = IOSYS_MAP_INIT_OFFSET(ads_to_map(ads), + offsetof(struct __guc_ads_blob, system_info)); + u32 base = xe_bo_ggtt_addr(ads->bo); + + xe_gt_assert(gt, ads->bo); + + xe_map_memset(ads_to_xe(ads), ads_to_map(ads), 0, 0, ads->bo->size); + guc_policies_init(ads); + fill_engine_enable_masks(gt, &info_map); + guc_mmio_reg_state_init(ads); + guc_prep_golden_lrc_null(ads); + guc_mapping_table_init(gt, &info_map); + guc_capture_list_init(ads); + guc_doorbell_init(ads); + + if (xe->info.has_usm) { + guc_um_init_params(ads); + ads_blob_write(ads, ads.um_init_data, base + + offsetof(struct __guc_ads_blob, um_init_params)); + } + + ads_blob_write(ads, ads.scheduler_policies, base + + offsetof(struct __guc_ads_blob, policies)); + ads_blob_write(ads, ads.gt_system_info, base + + offsetof(struct __guc_ads_blob, system_info)); + ads_blob_write(ads, ads.private_data, base + + guc_ads_private_data_offset(ads)); +} + +static void guc_populate_golden_lrc(struct xe_guc_ads *ads) +{ + struct xe_device *xe = ads_to_xe(ads); + struct xe_gt *gt = ads_to_gt(ads); + struct iosys_map info_map = IOSYS_MAP_INIT_OFFSET(ads_to_map(ads), + offsetof(struct __guc_ads_blob, system_info)); + size_t total_size = 0, alloc_size, real_size; + u32 addr_ggtt, offset; + int class; + + offset = guc_ads_golden_lrc_offset(ads); + addr_ggtt = xe_bo_ggtt_addr(ads->bo) + offset; + + for (class = 0; class < XE_ENGINE_CLASS_MAX; ++class) { + u8 guc_class; + + guc_class = xe_engine_class_to_guc_class(class); + + if (!info_map_read(xe, &info_map, + engine_enabled_masks[guc_class])) + continue; + + xe_gt_assert(gt, gt->default_lrc[class]); + + real_size = xe_lrc_size(xe, class); + alloc_size = PAGE_ALIGN(real_size); + total_size += alloc_size; + + /* + * This interface is slightly confusing. We need to pass the + * base address of the full golden context and the size of just + * the engine state, which is the section of the context image + * that starts after the execlists LRC registers. This is + * required to allow the GuC to restore just the engine state + * when a watchdog reset occurs. + * We calculate the engine state size by removing the size of + * what comes before it in the context image (which is identical + * on all engines). 
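+		 * In other words, eng_state_size = xe_lrc_size() -
+		 * xe_lrc_skip_size(), exactly as written below.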
+ */ + ads_blob_write(ads, ads.eng_state_size[guc_class], + real_size - xe_lrc_skip_size(xe)); + ads_blob_write(ads, ads.golden_context_lrca[guc_class], + addr_ggtt); + + xe_map_memcpy_to(xe, ads_to_map(ads), offset, + gt->default_lrc[class], real_size); + + addr_ggtt += alloc_size; + offset += alloc_size; + } + + xe_gt_assert(gt, total_size == ads->golden_lrc_size); +} + +void xe_guc_ads_populate_post_load(struct xe_guc_ads *ads) +{ + guc_populate_golden_lrc(ads); +} diff --git a/drivers/gpu/drm/xe/xe_guc_ads.h b/drivers/gpu/drm/xe/xe_guc_ads.h new file mode 100644 index 000000000000..138ef6267671 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_ads.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GUC_ADS_H_ +#define _XE_GUC_ADS_H_ + +#include "xe_guc_ads_types.h" + +int xe_guc_ads_init(struct xe_guc_ads *ads); +int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads); +void xe_guc_ads_populate(struct xe_guc_ads *ads); +void xe_guc_ads_populate_minimal(struct xe_guc_ads *ads); +void xe_guc_ads_populate_post_load(struct xe_guc_ads *ads); + +#endif diff --git a/drivers/gpu/drm/xe/xe_guc_ads_types.h b/drivers/gpu/drm/xe/xe_guc_ads_types.h new file mode 100644 index 000000000000..4afe44bece4b --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_ads_types.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GUC_ADS_TYPES_H_ +#define _XE_GUC_ADS_TYPES_H_ + +#include <linux/types.h> + +struct xe_bo; + +/** + * struct xe_guc_ads - GuC additional data structures (ADS) + */ +struct xe_guc_ads { + /** @bo: XE BO for GuC ads blob */ + struct xe_bo *bo; + /** @golden_lrc_size: golden LRC size */ + size_t golden_lrc_size; + /** @regset_size: size of register set passed to GuC for save/restore */ + u32 regset_size; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c new file mode 100644 index 000000000000..24a33fa36496 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_ct.c @@ -0,0 +1,1320 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_guc_ct.h" + +#include <linux/bitfield.h> +#include <linux/circ_buf.h> +#include <linux/delay.h> + +#include <drm/drm_managed.h> + +#include "abi/guc_actions_abi.h" +#include "abi/guc_klvs_abi.h" +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_gt_pagefault.h" +#include "xe_gt_tlb_invalidation.h" +#include "xe_guc.h" +#include "xe_guc_submit.h" +#include "xe_map.h" +#include "xe_pm.h" +#include "xe_trace.h" + +/* Used when a CT send wants to block and / or receive data */ +struct g2h_fence { + u32 *response_buffer; + u32 seqno; + u16 response_len; + u16 error; + u16 hint; + u16 reason; + bool retry; + bool fail; + bool done; +}; + +static void g2h_fence_init(struct g2h_fence *g2h_fence, u32 *response_buffer) +{ + g2h_fence->response_buffer = response_buffer; + g2h_fence->response_len = 0; + g2h_fence->fail = false; + g2h_fence->retry = false; + g2h_fence->done = false; + g2h_fence->seqno = ~0x0; +} + +static bool g2h_fence_needs_alloc(struct g2h_fence *g2h_fence) +{ + return g2h_fence->seqno == ~0x0; +} + +static struct xe_guc * +ct_to_guc(struct xe_guc_ct *ct) +{ + return container_of(ct, struct xe_guc, ct); +} + +static struct xe_gt * +ct_to_gt(struct xe_guc_ct *ct) +{ + return container_of(ct, struct xe_gt, uc.guc.ct); +} + +static struct xe_device * +ct_to_xe(struct xe_guc_ct *ct) +{ + return gt_to_xe(ct_to_gt(ct)); +} + +/** + * 
DOC: GuC CTB Blob + * + * We allocate single blob to hold both CTB descriptors and buffers: + * + * +--------+-----------------------------------------------+------+ + * | offset | contents | size | + * +========+===============================================+======+ + * | 0x0000 | H2G CTB Descriptor (send) | | + * +--------+-----------------------------------------------+ 4K | + * | 0x0800 | G2H CTB Descriptor (g2h) | | + * +--------+-----------------------------------------------+------+ + * | 0x1000 | H2G CT Buffer (send) | n*4K | + * | | | | + * +--------+-----------------------------------------------+------+ + * | 0x1000 | G2H CT Buffer (g2h) | m*4K | + * | + n*4K | | | + * +--------+-----------------------------------------------+------+ + * + * Size of each ``CT Buffer`` must be multiple of 4K. + * We don't expect too many messages in flight at any time, unless we are + * using the GuC submission. In that case each request requires a minimum + * 2 dwords which gives us a maximum 256 queue'd requests. Hopefully this + * enough space to avoid backpressure on the driver. We increase the size + * of the receive buffer (relative to the send) to ensure a G2H response + * CTB has a landing spot. + */ + +#define CTB_DESC_SIZE ALIGN(sizeof(struct guc_ct_buffer_desc), SZ_2K) +#define CTB_H2G_BUFFER_SIZE (SZ_4K) +#define CTB_G2H_BUFFER_SIZE (4 * CTB_H2G_BUFFER_SIZE) +#define G2H_ROOM_BUFFER_SIZE (CTB_G2H_BUFFER_SIZE / 4) + +static size_t guc_ct_size(void) +{ + return 2 * CTB_DESC_SIZE + CTB_H2G_BUFFER_SIZE + + CTB_G2H_BUFFER_SIZE; +} + +static void guc_ct_fini(struct drm_device *drm, void *arg) +{ + struct xe_guc_ct *ct = arg; + + xa_destroy(&ct->fence_lookup); +} + +static void g2h_worker_func(struct work_struct *w); + +static void primelockdep(struct xe_guc_ct *ct) +{ + if (!IS_ENABLED(CONFIG_LOCKDEP)) + return; + + fs_reclaim_acquire(GFP_KERNEL); + might_lock(&ct->lock); + fs_reclaim_release(GFP_KERNEL); +} + +int xe_guc_ct_init(struct xe_guc_ct *ct) +{ + struct xe_device *xe = ct_to_xe(ct); + struct xe_gt *gt = ct_to_gt(ct); + struct xe_tile *tile = gt_to_tile(gt); + struct xe_bo *bo; + int err; + + xe_assert(xe, !(guc_ct_size() % PAGE_SIZE)); + + drmm_mutex_init(&xe->drm, &ct->lock); + spin_lock_init(&ct->fast_lock); + xa_init(&ct->fence_lookup); + INIT_WORK(&ct->g2h_worker, g2h_worker_func); + init_waitqueue_head(&ct->wq); + init_waitqueue_head(&ct->g2h_fence_wq); + + primelockdep(ct); + + bo = xe_managed_bo_create_pin_map(xe, tile, guc_ct_size(), + XE_BO_CREATE_VRAM_IF_DGFX(tile) | + XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(bo)) + return PTR_ERR(bo); + + ct->bo = bo; + + err = drmm_add_action_or_reset(&xe->drm, guc_ct_fini, ct); + if (err) + return err; + + return 0; +} + +#define desc_read(xe_, guc_ctb__, field_) \ + xe_map_rd_field(xe_, &guc_ctb__->desc, 0, \ + struct guc_ct_buffer_desc, field_) + +#define desc_write(xe_, guc_ctb__, field_, val_) \ + xe_map_wr_field(xe_, &guc_ctb__->desc, 0, \ + struct guc_ct_buffer_desc, field_, val_) + +static void guc_ct_ctb_h2g_init(struct xe_device *xe, struct guc_ctb *h2g, + struct iosys_map *map) +{ + h2g->info.size = CTB_H2G_BUFFER_SIZE / sizeof(u32); + h2g->info.resv_space = 0; + h2g->info.tail = 0; + h2g->info.head = 0; + h2g->info.space = CIRC_SPACE(h2g->info.tail, h2g->info.head, + h2g->info.size) - + h2g->info.resv_space; + h2g->info.broken = false; + + h2g->desc = *map; + xe_map_memset(xe, &h2g->desc, 0, 0, sizeof(struct guc_ct_buffer_desc)); + + h2g->cmds = IOSYS_MAP_INIT_OFFSET(map, CTB_DESC_SIZE * 2); +} + +static void 
guc_ct_ctb_g2h_init(struct xe_device *xe, struct guc_ctb *g2h, + struct iosys_map *map) +{ + g2h->info.size = CTB_G2H_BUFFER_SIZE / sizeof(u32); + g2h->info.resv_space = G2H_ROOM_BUFFER_SIZE / sizeof(u32); + g2h->info.head = 0; + g2h->info.tail = 0; + g2h->info.space = CIRC_SPACE(g2h->info.tail, g2h->info.head, + g2h->info.size) - + g2h->info.resv_space; + g2h->info.broken = false; + + g2h->desc = IOSYS_MAP_INIT_OFFSET(map, CTB_DESC_SIZE); + xe_map_memset(xe, &g2h->desc, 0, 0, sizeof(struct guc_ct_buffer_desc)); + + g2h->cmds = IOSYS_MAP_INIT_OFFSET(map, CTB_DESC_SIZE * 2 + + CTB_H2G_BUFFER_SIZE); +} + +static int guc_ct_ctb_h2g_register(struct xe_guc_ct *ct) +{ + struct xe_guc *guc = ct_to_guc(ct); + u32 desc_addr, ctb_addr, size; + int err; + + desc_addr = xe_bo_ggtt_addr(ct->bo); + ctb_addr = xe_bo_ggtt_addr(ct->bo) + CTB_DESC_SIZE * 2; + size = ct->ctbs.h2g.info.size * sizeof(u32); + + err = xe_guc_self_cfg64(guc, + GUC_KLV_SELF_CFG_H2G_CTB_DESCRIPTOR_ADDR_KEY, + desc_addr); + if (err) + return err; + + err = xe_guc_self_cfg64(guc, + GUC_KLV_SELF_CFG_H2G_CTB_ADDR_KEY, + ctb_addr); + if (err) + return err; + + return xe_guc_self_cfg32(guc, + GUC_KLV_SELF_CFG_H2G_CTB_SIZE_KEY, + size); +} + +static int guc_ct_ctb_g2h_register(struct xe_guc_ct *ct) +{ + struct xe_guc *guc = ct_to_guc(ct); + u32 desc_addr, ctb_addr, size; + int err; + + desc_addr = xe_bo_ggtt_addr(ct->bo) + CTB_DESC_SIZE; + ctb_addr = xe_bo_ggtt_addr(ct->bo) + CTB_DESC_SIZE * 2 + + CTB_H2G_BUFFER_SIZE; + size = ct->ctbs.g2h.info.size * sizeof(u32); + + err = xe_guc_self_cfg64(guc, + GUC_KLV_SELF_CFG_G2H_CTB_DESCRIPTOR_ADDR_KEY, + desc_addr); + if (err) + return err; + + err = xe_guc_self_cfg64(guc, + GUC_KLV_SELF_CFG_G2H_CTB_ADDR_KEY, + ctb_addr); + if (err) + return err; + + return xe_guc_self_cfg32(guc, + GUC_KLV_SELF_CFG_G2H_CTB_SIZE_KEY, + size); +} + +static int guc_ct_control_toggle(struct xe_guc_ct *ct, bool enable) +{ + u32 request[HOST2GUC_CONTROL_CTB_REQUEST_MSG_LEN] = { + FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) | + FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) | + FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, + GUC_ACTION_HOST2GUC_CONTROL_CTB), + FIELD_PREP(HOST2GUC_CONTROL_CTB_REQUEST_MSG_1_CONTROL, + enable ? GUC_CTB_CONTROL_ENABLE : + GUC_CTB_CONTROL_DISABLE), + }; + int ret = xe_guc_mmio_send(ct_to_guc(ct), request, ARRAY_SIZE(request)); + + return ret > 0 ? 
-EPROTO : ret; +} + +int xe_guc_ct_enable(struct xe_guc_ct *ct) +{ + struct xe_device *xe = ct_to_xe(ct); + int err; + + xe_assert(xe, !ct->enabled); + + guc_ct_ctb_h2g_init(xe, &ct->ctbs.h2g, &ct->bo->vmap); + guc_ct_ctb_g2h_init(xe, &ct->ctbs.g2h, &ct->bo->vmap); + + err = guc_ct_ctb_h2g_register(ct); + if (err) + goto err_out; + + err = guc_ct_ctb_g2h_register(ct); + if (err) + goto err_out; + + err = guc_ct_control_toggle(ct, true); + if (err) + goto err_out; + + mutex_lock(&ct->lock); + spin_lock_irq(&ct->fast_lock); + ct->g2h_outstanding = 0; + ct->enabled = true; + spin_unlock_irq(&ct->fast_lock); + mutex_unlock(&ct->lock); + + smp_mb(); + wake_up_all(&ct->wq); + drm_dbg(&xe->drm, "GuC CT communication channel enabled\n"); + + return 0; + +err_out: + drm_err(&xe->drm, "Failed to enable CT (%d)\n", err); + + return err; +} + +void xe_guc_ct_disable(struct xe_guc_ct *ct) +{ + mutex_lock(&ct->lock); /* Serialise dequeue_one_g2h() */ + spin_lock_irq(&ct->fast_lock); /* Serialise CT fast-path */ + ct->enabled = false; /* Finally disable CT communication */ + spin_unlock_irq(&ct->fast_lock); + mutex_unlock(&ct->lock); + + xa_destroy(&ct->fence_lookup); +} + +static bool h2g_has_room(struct xe_guc_ct *ct, u32 cmd_len) +{ + struct guc_ctb *h2g = &ct->ctbs.h2g; + + lockdep_assert_held(&ct->lock); + + if (cmd_len > h2g->info.space) { + h2g->info.head = desc_read(ct_to_xe(ct), h2g, head); + h2g->info.space = CIRC_SPACE(h2g->info.tail, h2g->info.head, + h2g->info.size) - + h2g->info.resv_space; + if (cmd_len > h2g->info.space) + return false; + } + + return true; +} + +static bool g2h_has_room(struct xe_guc_ct *ct, u32 g2h_len) +{ + if (!g2h_len) + return true; + + lockdep_assert_held(&ct->fast_lock); + + return ct->ctbs.g2h.info.space > g2h_len; +} + +static int has_room(struct xe_guc_ct *ct, u32 cmd_len, u32 g2h_len) +{ + lockdep_assert_held(&ct->lock); + + if (!g2h_has_room(ct, g2h_len) || !h2g_has_room(ct, cmd_len)) + return -EBUSY; + + return 0; +} + +static void h2g_reserve_space(struct xe_guc_ct *ct, u32 cmd_len) +{ + lockdep_assert_held(&ct->lock); + ct->ctbs.h2g.info.space -= cmd_len; +} + +static void __g2h_reserve_space(struct xe_guc_ct *ct, u32 g2h_len, u32 num_g2h) +{ + xe_assert(ct_to_xe(ct), g2h_len <= ct->ctbs.g2h.info.space); + + if (g2h_len) { + lockdep_assert_held(&ct->fast_lock); + + ct->ctbs.g2h.info.space -= g2h_len; + ct->g2h_outstanding += num_g2h; + } +} + +static void __g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len) +{ + lockdep_assert_held(&ct->fast_lock); + xe_assert(ct_to_xe(ct), ct->ctbs.g2h.info.space + g2h_len <= + ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space); + + ct->ctbs.g2h.info.space += g2h_len; + --ct->g2h_outstanding; +} + +static void g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len) +{ + spin_lock_irq(&ct->fast_lock); + __g2h_release_space(ct, g2h_len); + spin_unlock_irq(&ct->fast_lock); +} + +#define H2G_CT_HEADERS (GUC_CTB_HDR_LEN + 1) /* one DW CTB header and one DW HxG header */ + +static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len, + u32 ct_fence_value, bool want_response) +{ + struct xe_device *xe = ct_to_xe(ct); + struct guc_ctb *h2g = &ct->ctbs.h2g; + u32 cmd[H2G_CT_HEADERS]; + u32 tail = h2g->info.tail; + u32 full_len; + struct iosys_map map = IOSYS_MAP_INIT_OFFSET(&h2g->cmds, + tail * sizeof(u32)); + + full_len = len + GUC_CTB_HDR_LEN; + + lockdep_assert_held(&ct->lock); + xe_assert(xe, full_len <= GUC_CTB_MSG_MAX_LEN); + xe_assert(xe, tail <= h2g->info.size); + + /* Command will wrap, zero fill (NOPs), 
return and check credits again */ + if (tail + full_len > h2g->info.size) { + xe_map_memset(xe, &map, 0, 0, + (h2g->info.size - tail) * sizeof(u32)); + h2g_reserve_space(ct, (h2g->info.size - tail)); + h2g->info.tail = 0; + desc_write(xe, h2g, tail, h2g->info.tail); + + return -EAGAIN; + } + + /* + * dw0: CT header (including fence) + * dw1: HXG header (including action code) + * dw2+: action data + */ + cmd[0] = FIELD_PREP(GUC_CTB_MSG_0_FORMAT, GUC_CTB_FORMAT_HXG) | + FIELD_PREP(GUC_CTB_MSG_0_NUM_DWORDS, len) | + FIELD_PREP(GUC_CTB_MSG_0_FENCE, ct_fence_value); + if (want_response) { + cmd[1] = + FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) | + FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION | + GUC_HXG_EVENT_MSG_0_DATA0, action[0]); + } else { + cmd[1] = + FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_EVENT) | + FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION | + GUC_HXG_EVENT_MSG_0_DATA0, action[0]); + } + + /* H2G header in cmd[1] replaces action[0] so: */ + --len; + ++action; + + /* Write H2G ensuring visable before descriptor update */ + xe_map_memcpy_to(xe, &map, 0, cmd, H2G_CT_HEADERS * sizeof(u32)); + xe_map_memcpy_to(xe, &map, H2G_CT_HEADERS * sizeof(u32), action, len * sizeof(u32)); + xe_device_wmb(xe); + + /* Update local copies */ + h2g->info.tail = (tail + full_len) % h2g->info.size; + h2g_reserve_space(ct, full_len); + + /* Update descriptor */ + desc_write(xe, h2g, tail, h2g->info.tail); + + trace_xe_guc_ctb_h2g(ct_to_gt(ct)->info.id, *(action - 1), full_len, + desc_read(xe, h2g, head), h2g->info.tail); + + return 0; +} + +static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, + u32 len, u32 g2h_len, u32 num_g2h, + struct g2h_fence *g2h_fence) +{ + struct xe_device *xe = ct_to_xe(ct); + int ret; + + xe_assert(xe, !g2h_len || !g2h_fence); + xe_assert(xe, !num_g2h || !g2h_fence); + xe_assert(xe, !g2h_len || num_g2h); + xe_assert(xe, g2h_len || !num_g2h); + lockdep_assert_held(&ct->lock); + + if (unlikely(ct->ctbs.h2g.info.broken)) { + ret = -EPIPE; + goto out; + } + + if (unlikely(!ct->enabled)) { + ret = -ENODEV; + goto out; + } + + if (g2h_fence) { + g2h_len = GUC_CTB_HXG_MSG_MAX_LEN; + num_g2h = 1; + + if (g2h_fence_needs_alloc(g2h_fence)) { + void *ptr; + + g2h_fence->seqno = (ct->fence_seqno++ & 0xffff); + ptr = xa_store(&ct->fence_lookup, + g2h_fence->seqno, + g2h_fence, GFP_ATOMIC); + if (IS_ERR(ptr)) { + ret = PTR_ERR(ptr); + goto out; + } + } + } + + if (g2h_len) + spin_lock_irq(&ct->fast_lock); +retry: + ret = has_room(ct, len + GUC_CTB_HDR_LEN, g2h_len); + if (unlikely(ret)) + goto out_unlock; + + ret = h2g_write(ct, action, len, g2h_fence ? 
g2h_fence->seqno : 0, + !!g2h_fence); + if (unlikely(ret)) { + if (ret == -EAGAIN) + goto retry; + goto out_unlock; + } + + __g2h_reserve_space(ct, g2h_len, num_g2h); + xe_guc_notify(ct_to_guc(ct)); +out_unlock: + if (g2h_len) + spin_unlock_irq(&ct->fast_lock); +out: + return ret; +} + +static void kick_reset(struct xe_guc_ct *ct) +{ + xe_gt_reset_async(ct_to_gt(ct)); +} + +static int dequeue_one_g2h(struct xe_guc_ct *ct); + +static int guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len, + u32 g2h_len, u32 num_g2h, + struct g2h_fence *g2h_fence) +{ + struct drm_device *drm = &ct_to_xe(ct)->drm; + struct drm_printer p = drm_info_printer(drm->dev); + unsigned int sleep_period_ms = 1; + int ret; + + xe_assert(ct_to_xe(ct), !g2h_len || !g2h_fence); + lockdep_assert_held(&ct->lock); + xe_device_assert_mem_access(ct_to_xe(ct)); + +try_again: + ret = __guc_ct_send_locked(ct, action, len, g2h_len, num_g2h, + g2h_fence); + + /* + * We wait to try to restore credits for about 1 second before bailing. + * In the case of H2G credits we have no choice but just to wait for the + * GuC to consume H2Gs in the channel so we use a wait / sleep loop. In + * the case of G2H we process any G2H in the channel, hopefully freeing + * credits as we consume the G2H messages. + */ + if (unlikely(ret == -EBUSY && + !h2g_has_room(ct, len + GUC_CTB_HDR_LEN))) { + struct guc_ctb *h2g = &ct->ctbs.h2g; + + if (sleep_period_ms == 1024) + goto broken; + + trace_xe_guc_ct_h2g_flow_control(h2g->info.head, h2g->info.tail, + h2g->info.size, + h2g->info.space, + len + GUC_CTB_HDR_LEN); + msleep(sleep_period_ms); + sleep_period_ms <<= 1; + + goto try_again; + } else if (unlikely(ret == -EBUSY)) { + struct xe_device *xe = ct_to_xe(ct); + struct guc_ctb *g2h = &ct->ctbs.g2h; + + trace_xe_guc_ct_g2h_flow_control(g2h->info.head, + desc_read(xe, g2h, tail), + g2h->info.size, + g2h->info.space, + g2h_fence ? 
+ GUC_CTB_HXG_MSG_MAX_LEN : + g2h_len); + +#define g2h_avail(ct) \ + (desc_read(ct_to_xe(ct), (&ct->ctbs.g2h), tail) != ct->ctbs.g2h.info.head) + if (!wait_event_timeout(ct->wq, !ct->g2h_outstanding || + g2h_avail(ct), HZ)) + goto broken; +#undef g2h_avail + + if (dequeue_one_g2h(ct) < 0) + goto broken; + + goto try_again; + } + + return ret; + +broken: + drm_err(drm, "No forward process on H2G, reset required"); + xe_guc_ct_print(ct, &p, true); + ct->ctbs.h2g.info.broken = true; + + return -EDEADLK; +} + +static int guc_ct_send(struct xe_guc_ct *ct, const u32 *action, u32 len, + u32 g2h_len, u32 num_g2h, struct g2h_fence *g2h_fence) +{ + int ret; + + xe_assert(ct_to_xe(ct), !g2h_len || !g2h_fence); + + mutex_lock(&ct->lock); + ret = guc_ct_send_locked(ct, action, len, g2h_len, num_g2h, g2h_fence); + mutex_unlock(&ct->lock); + + return ret; +} + +int xe_guc_ct_send(struct xe_guc_ct *ct, const u32 *action, u32 len, + u32 g2h_len, u32 num_g2h) +{ + int ret; + + ret = guc_ct_send(ct, action, len, g2h_len, num_g2h, NULL); + if (ret == -EDEADLK) + kick_reset(ct); + + return ret; +} + +int xe_guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len, + u32 g2h_len, u32 num_g2h) +{ + int ret; + + ret = guc_ct_send_locked(ct, action, len, g2h_len, num_g2h, NULL); + if (ret == -EDEADLK) + kick_reset(ct); + + return ret; +} + +int xe_guc_ct_send_g2h_handler(struct xe_guc_ct *ct, const u32 *action, u32 len) +{ + int ret; + + lockdep_assert_held(&ct->lock); + + ret = guc_ct_send_locked(ct, action, len, 0, 0, NULL); + if (ret == -EDEADLK) + kick_reset(ct); + + return ret; +} + +/* + * Check if a GT reset is in progress or will occur and if GT reset brought the + * CT back up. Randomly picking 5 seconds for an upper limit to do a GT a reset. + */ +static bool retry_failure(struct xe_guc_ct *ct, int ret) +{ + if (!(ret == -EDEADLK || ret == -EPIPE || ret == -ENODEV)) + return false; + +#define ct_alive(ct) \ + (ct->enabled && !ct->ctbs.h2g.info.broken && !ct->ctbs.g2h.info.broken) + if (!wait_event_interruptible_timeout(ct->wq, ct_alive(ct), HZ * 5)) + return false; +#undef ct_alive + + return true; +} + +static int guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len, + u32 *response_buffer, bool no_fail) +{ + struct xe_device *xe = ct_to_xe(ct); + struct g2h_fence g2h_fence; + int ret = 0; + + /* + * We use a fence to implement blocking sends / receiving response data. + * The seqno of the fence is sent in the H2G, returned in the G2H, and + * an xarray is used as storage media with the seqno being to key. + * Fields in the fence hold success, failure, retry status and the + * response data. Safe to allocate on the stack as the xarray is the + * only reference and it cannot be present after this function exits. 
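+	 *
+	 * The seqno is assigned and the fence stored in ct->fence_lookup by
+	 * __guc_ct_send_locked(); parse_g2h_response() later erases that
+	 * entry, fills in the result and wakes ct->g2h_fence_wq, which is
+	 * what the wait below blocks on.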
+ */ +retry: + g2h_fence_init(&g2h_fence, response_buffer); +retry_same_fence: + ret = guc_ct_send(ct, action, len, 0, 0, &g2h_fence); + if (unlikely(ret == -ENOMEM)) { + void *ptr; + + /* Retry allocation /w GFP_KERNEL */ + ptr = xa_store(&ct->fence_lookup, + g2h_fence.seqno, + &g2h_fence, GFP_KERNEL); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + goto retry_same_fence; + } else if (unlikely(ret)) { + if (ret == -EDEADLK) + kick_reset(ct); + + if (no_fail && retry_failure(ct, ret)) + goto retry_same_fence; + + if (!g2h_fence_needs_alloc(&g2h_fence)) + xa_erase_irq(&ct->fence_lookup, g2h_fence.seqno); + + return ret; + } + + ret = wait_event_timeout(ct->g2h_fence_wq, g2h_fence.done, HZ); + if (!ret) { + drm_err(&xe->drm, "Timed out wait for G2H, fence %u, action %04x", + g2h_fence.seqno, action[0]); + xa_erase_irq(&ct->fence_lookup, g2h_fence.seqno); + return -ETIME; + } + + if (g2h_fence.retry) { + drm_warn(&xe->drm, "Send retry, action 0x%04x, reason %d", + action[0], g2h_fence.reason); + goto retry; + } + if (g2h_fence.fail) { + drm_err(&xe->drm, "Send failed, action 0x%04x, error %d, hint %d", + action[0], g2h_fence.error, g2h_fence.hint); + ret = -EIO; + } + + return ret > 0 ? 0 : ret; +} + +int xe_guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len, + u32 *response_buffer) +{ + return guc_ct_send_recv(ct, action, len, response_buffer, false); +} + +int xe_guc_ct_send_recv_no_fail(struct xe_guc_ct *ct, const u32 *action, + u32 len, u32 *response_buffer) +{ + return guc_ct_send_recv(ct, action, len, response_buffer, true); +} + +static int parse_g2h_event(struct xe_guc_ct *ct, u32 *msg, u32 len) +{ + u32 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, msg[1]); + + lockdep_assert_held(&ct->lock); + + switch (action) { + case XE_GUC_ACTION_SCHED_CONTEXT_MODE_DONE: + case XE_GUC_ACTION_DEREGISTER_CONTEXT_DONE: + case XE_GUC_ACTION_SCHED_ENGINE_MODE_DONE: + case XE_GUC_ACTION_TLB_INVALIDATION_DONE: + g2h_release_space(ct, len); + } + + return 0; +} + +static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len) +{ + struct xe_device *xe = ct_to_xe(ct); + u32 response_len = len - GUC_CTB_MSG_MIN_LEN; + u32 fence = FIELD_GET(GUC_CTB_MSG_0_FENCE, msg[0]); + u32 type = FIELD_GET(GUC_HXG_MSG_0_TYPE, msg[1]); + struct g2h_fence *g2h_fence; + + lockdep_assert_held(&ct->lock); + + g2h_fence = xa_erase(&ct->fence_lookup, fence); + if (unlikely(!g2h_fence)) { + /* Don't tear down channel, as send could've timed out */ + drm_warn(&xe->drm, "G2H fence (%u) not found!\n", fence); + g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN); + return 0; + } + + xe_assert(xe, fence == g2h_fence->seqno); + + if (type == GUC_HXG_TYPE_RESPONSE_FAILURE) { + g2h_fence->fail = true; + g2h_fence->error = + FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, msg[1]); + g2h_fence->hint = + FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, msg[1]); + } else if (type == GUC_HXG_TYPE_NO_RESPONSE_RETRY) { + g2h_fence->retry = true; + g2h_fence->reason = + FIELD_GET(GUC_HXG_RETRY_MSG_0_REASON, msg[1]); + } else if (g2h_fence->response_buffer) { + g2h_fence->response_len = response_len; + memcpy(g2h_fence->response_buffer, msg + GUC_CTB_MSG_MIN_LEN, + response_len * sizeof(u32)); + } + + g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN); + + g2h_fence->done = true; + smp_mb(); + + wake_up_all(&ct->g2h_fence_wq); + + return 0; +} + +static int parse_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len) +{ + struct xe_device *xe = ct_to_xe(ct); + u32 hxg, origin, type; + int ret; + + lockdep_assert_held(&ct->lock); + + hxg = msg[1]; + + 
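+	/*
+	 * Editor's note, a minimal sketch of the G2H framing this parser
+	 * relies on (not part of the original patch; the field names are the
+	 * ones already used elsewhere in this file): msg[0] is the CTB
+	 * header carrying the fence and the payload length, msg[1] is the
+	 * HXG header carrying origin, type and action, e.g.
+	 *
+	 *	u32 fence  = FIELD_GET(GUC_CTB_MSG_0_FENCE, msg[0]);
+	 *	u32 origin = FIELD_GET(GUC_HXG_MSG_0_ORIGIN, msg[1]);
+	 *	u32 type   = FIELD_GET(GUC_HXG_MSG_0_TYPE, msg[1]);
+	 *
+	 * Events are dispatched to parse_g2h_event(), responses / failures /
+	 * retries to parse_g2h_response(); any other type marks the G2H
+	 * channel broken.
+	 */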
origin = FIELD_GET(GUC_HXG_MSG_0_ORIGIN, hxg); + if (unlikely(origin != GUC_HXG_ORIGIN_GUC)) { + drm_err(&xe->drm, + "G2H channel broken on read, origin=%d, reset required\n", + origin); + ct->ctbs.g2h.info.broken = true; + + return -EPROTO; + } + + type = FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg); + switch (type) { + case GUC_HXG_TYPE_EVENT: + ret = parse_g2h_event(ct, msg, len); + break; + case GUC_HXG_TYPE_RESPONSE_SUCCESS: + case GUC_HXG_TYPE_RESPONSE_FAILURE: + case GUC_HXG_TYPE_NO_RESPONSE_RETRY: + ret = parse_g2h_response(ct, msg, len); + break; + default: + drm_err(&xe->drm, + "G2H channel broken on read, type=%d, reset required\n", + type); + ct->ctbs.g2h.info.broken = true; + + ret = -EOPNOTSUPP; + } + + return ret; +} + +static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len) +{ + struct xe_device *xe = ct_to_xe(ct); + struct xe_guc *guc = ct_to_guc(ct); + u32 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, msg[1]); + u32 *payload = msg + GUC_CTB_HXG_MSG_MIN_LEN; + u32 adj_len = len - GUC_CTB_HXG_MSG_MIN_LEN; + int ret = 0; + + if (FIELD_GET(GUC_HXG_MSG_0_TYPE, msg[1]) != GUC_HXG_TYPE_EVENT) + return 0; + + switch (action) { + case XE_GUC_ACTION_SCHED_CONTEXT_MODE_DONE: + ret = xe_guc_sched_done_handler(guc, payload, adj_len); + break; + case XE_GUC_ACTION_DEREGISTER_CONTEXT_DONE: + ret = xe_guc_deregister_done_handler(guc, payload, adj_len); + break; + case XE_GUC_ACTION_CONTEXT_RESET_NOTIFICATION: + ret = xe_guc_exec_queue_reset_handler(guc, payload, adj_len); + break; + case XE_GUC_ACTION_ENGINE_FAILURE_NOTIFICATION: + ret = xe_guc_exec_queue_reset_failure_handler(guc, payload, + adj_len); + break; + case XE_GUC_ACTION_SCHED_ENGINE_MODE_DONE: + /* Selftest only at the moment */ + break; + case XE_GUC_ACTION_STATE_CAPTURE_NOTIFICATION: + case XE_GUC_ACTION_NOTIFY_FLUSH_LOG_BUFFER_TO_FILE: + /* FIXME: Handle this */ + break; + case XE_GUC_ACTION_NOTIFY_MEMORY_CAT_ERROR: + ret = xe_guc_exec_queue_memory_cat_error_handler(guc, payload, + adj_len); + break; + case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC: + ret = xe_guc_pagefault_handler(guc, payload, adj_len); + break; + case XE_GUC_ACTION_TLB_INVALIDATION_DONE: + ret = xe_guc_tlb_invalidation_done_handler(guc, payload, + adj_len); + break; + case XE_GUC_ACTION_ACCESS_COUNTER_NOTIFY: + ret = xe_guc_access_counter_notify_handler(guc, payload, + adj_len); + break; + default: + drm_err(&xe->drm, "unexpected action 0x%04x\n", action); + } + + if (ret) + drm_err(&xe->drm, "action 0x%04x failed processing, ret=%d\n", + action, ret); + + return 0; +} + +static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path) +{ + struct xe_device *xe = ct_to_xe(ct); + struct guc_ctb *g2h = &ct->ctbs.g2h; + u32 tail, head, len; + s32 avail; + u32 action; + + lockdep_assert_held(&ct->fast_lock); + + if (!ct->enabled) + return -ENODEV; + + if (g2h->info.broken) + return -EPIPE; + + /* Calculate DW available to read */ + tail = desc_read(xe, g2h, tail); + avail = tail - g2h->info.head; + if (unlikely(avail == 0)) + return 0; + + if (avail < 0) + avail += g2h->info.size; + + /* Read header */ + xe_map_memcpy_from(xe, msg, &g2h->cmds, sizeof(u32) * g2h->info.head, + sizeof(u32)); + len = FIELD_GET(GUC_CTB_MSG_0_NUM_DWORDS, msg[0]) + GUC_CTB_MSG_MIN_LEN; + if (len > avail) { + drm_err(&xe->drm, + "G2H channel broken on read, avail=%d, len=%d, reset required\n", + avail, len); + g2h->info.broken = true; + + return -EPROTO; + } + + head = (g2h->info.head + 1) % g2h->info.size; + avail = len - 1; + + /* Read G2H message */ + if (avail + head > 
g2h->info.size) { + u32 avail_til_wrap = g2h->info.size - head; + + xe_map_memcpy_from(xe, msg + 1, + &g2h->cmds, sizeof(u32) * head, + avail_til_wrap * sizeof(u32)); + xe_map_memcpy_from(xe, msg + 1 + avail_til_wrap, + &g2h->cmds, 0, + (avail - avail_til_wrap) * sizeof(u32)); + } else { + xe_map_memcpy_from(xe, msg + 1, + &g2h->cmds, sizeof(u32) * head, + avail * sizeof(u32)); + } + + action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, msg[1]); + + if (fast_path) { + if (FIELD_GET(GUC_HXG_MSG_0_TYPE, msg[1]) != GUC_HXG_TYPE_EVENT) + return 0; + + switch (action) { + case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC: + case XE_GUC_ACTION_TLB_INVALIDATION_DONE: + break; /* Process these in fast-path */ + default: + return 0; + } + } + + /* Update local / descriptor header */ + g2h->info.head = (head + avail) % g2h->info.size; + desc_write(xe, g2h, head, g2h->info.head); + + trace_xe_guc_ctb_g2h(ct_to_gt(ct)->info.id, action, len, + g2h->info.head, tail); + + return len; +} + +static void g2h_fast_path(struct xe_guc_ct *ct, u32 *msg, u32 len) +{ + struct xe_device *xe = ct_to_xe(ct); + struct xe_guc *guc = ct_to_guc(ct); + u32 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, msg[1]); + u32 *payload = msg + GUC_CTB_HXG_MSG_MIN_LEN; + u32 adj_len = len - GUC_CTB_HXG_MSG_MIN_LEN; + int ret = 0; + + switch (action) { + case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC: + ret = xe_guc_pagefault_handler(guc, payload, adj_len); + break; + case XE_GUC_ACTION_TLB_INVALIDATION_DONE: + __g2h_release_space(ct, len); + ret = xe_guc_tlb_invalidation_done_handler(guc, payload, + adj_len); + break; + default: + drm_warn(&xe->drm, "NOT_POSSIBLE"); + } + + if (ret) + drm_err(&xe->drm, "action 0x%04x failed processing, ret=%d\n", + action, ret); +} + +/** + * xe_guc_ct_fast_path - process critical G2H in the IRQ handler + * @ct: GuC CT object + * + * Anything related to page faults is critical for performance, process these + * critical G2H in the IRQ. This is safe as these handlers either just wake up + * waiters or queue another worker. + */ +void xe_guc_ct_fast_path(struct xe_guc_ct *ct) +{ + struct xe_device *xe = ct_to_xe(ct); + bool ongoing; + int len; + + ongoing = xe_device_mem_access_get_if_ongoing(ct_to_xe(ct)); + if (!ongoing && xe_pm_read_callback_task(ct_to_xe(ct)) == NULL) + return; + + spin_lock(&ct->fast_lock); + do { + len = g2h_read(ct, ct->fast_msg, true); + if (len > 0) + g2h_fast_path(ct, ct->fast_msg, len); + } while (len > 0); + spin_unlock(&ct->fast_lock); + + if (ongoing) + xe_device_mem_access_put(xe); +} + +/* Returns less than zero on error, 0 on done, 1 on more available */ +static int dequeue_one_g2h(struct xe_guc_ct *ct) +{ + int len; + int ret; + + lockdep_assert_held(&ct->lock); + + spin_lock_irq(&ct->fast_lock); + len = g2h_read(ct, ct->msg, false); + spin_unlock_irq(&ct->fast_lock); + if (len <= 0) + return len; + + ret = parse_g2h_msg(ct, ct->msg, len); + if (unlikely(ret < 0)) + return ret; + + ret = process_g2h_msg(ct, ct->msg, len); + if (unlikely(ret < 0)) + return ret; + + return 1; +} + +static void g2h_worker_func(struct work_struct *w) +{ + struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, g2h_worker); + bool ongoing; + int ret; + + /* + * Normal users must always hold mem_access.ref around CT calls. However + * during the runtime pm callbacks we rely on CT to talk to the GuC, but + * at this stage we can't rely on mem_access.ref and even the + * callback_task will be different than current. 
For such cases we just + * need to ensure we always process the responses from any blocking + * ct_send requests or where we otherwise expect some response when + * initiated from those callbacks (which will need to wait for the below + * dequeue_one_g2h()). The dequeue_one_g2h() will gracefully fail if + * the device has suspended to the point that the CT communication has + * been disabled. + * + * If we are inside the runtime pm callback, we can be the only task + * still issuing CT requests (since that requires having the + * mem_access.ref). It seems like it might in theory be possible to + * receive unsolicited events from the GuC just as we are + * suspending-resuming, but those will currently anyway be lost when + * eventually exiting from suspend, hence no need to wake up the device + * here. If we ever need something stronger than get_if_ongoing() then + * we need to be careful with blocking the pm callbacks from getting CT + * responses, if the worker here is blocked on those callbacks + * completing, creating a deadlock. + */ + ongoing = xe_device_mem_access_get_if_ongoing(ct_to_xe(ct)); + if (!ongoing && xe_pm_read_callback_task(ct_to_xe(ct)) == NULL) + return; + + do { + mutex_lock(&ct->lock); + ret = dequeue_one_g2h(ct); + mutex_unlock(&ct->lock); + + if (unlikely(ret == -EPROTO || ret == -EOPNOTSUPP)) { + struct drm_device *drm = &ct_to_xe(ct)->drm; + struct drm_printer p = drm_info_printer(drm->dev); + + xe_guc_ct_print(ct, &p, false); + kick_reset(ct); + } + } while (ret == 1); + + if (ongoing) + xe_device_mem_access_put(ct_to_xe(ct)); +} + +static void guc_ctb_snapshot_capture(struct xe_device *xe, struct guc_ctb *ctb, + struct guc_ctb_snapshot *snapshot, + bool atomic) +{ + u32 head, tail; + + xe_map_memcpy_from(xe, &snapshot->desc, &ctb->desc, 0, + sizeof(struct guc_ct_buffer_desc)); + memcpy(&snapshot->info, &ctb->info, sizeof(struct guc_ctb_info)); + + snapshot->cmds = kmalloc_array(ctb->info.size, sizeof(u32), + atomic ? GFP_ATOMIC : GFP_KERNEL); + + if (!snapshot->cmds) { + drm_err(&xe->drm, "Skipping CTB commands snapshot. 
Only CTB info will be available.\n"); + return; + } + + head = snapshot->desc.head; + tail = snapshot->desc.tail; + + if (head != tail) { + struct iosys_map map = + IOSYS_MAP_INIT_OFFSET(&ctb->cmds, head * sizeof(u32)); + + while (head != tail) { + snapshot->cmds[head] = xe_map_rd(xe, &map, 0, u32); + ++head; + if (head == ctb->info.size) { + head = 0; + map = ctb->cmds; + } else { + iosys_map_incr(&map, sizeof(u32)); + } + } + } +} + +static void guc_ctb_snapshot_print(struct guc_ctb_snapshot *snapshot, + struct drm_printer *p) +{ + u32 head, tail; + + drm_printf(p, "\tsize: %d\n", snapshot->info.size); + drm_printf(p, "\tresv_space: %d\n", snapshot->info.resv_space); + drm_printf(p, "\thead: %d\n", snapshot->info.head); + drm_printf(p, "\ttail: %d\n", snapshot->info.tail); + drm_printf(p, "\tspace: %d\n", snapshot->info.space); + drm_printf(p, "\tbroken: %d\n", snapshot->info.broken); + drm_printf(p, "\thead (memory): %d\n", snapshot->desc.head); + drm_printf(p, "\ttail (memory): %d\n", snapshot->desc.tail); + drm_printf(p, "\tstatus (memory): 0x%x\n", snapshot->desc.status); + + if (!snapshot->cmds) + return; + + head = snapshot->desc.head; + tail = snapshot->desc.tail; + + while (head != tail) { + drm_printf(p, "\tcmd[%d]: 0x%08x\n", head, + snapshot->cmds[head]); + ++head; + if (head == snapshot->info.size) + head = 0; + } +} + +static void guc_ctb_snapshot_free(struct guc_ctb_snapshot *snapshot) +{ + kfree(snapshot->cmds); +} + +/** + * xe_guc_ct_snapshot_capture - Take a quick snapshot of the CT state. + * @ct: GuC CT object. + * @atomic: Boolean to indicate if this is called from atomic context like + * reset or CTB handler or from some regular path like debugfs. + * + * This can be printed out in a later stage like during dev_coredump + * analysis. + * + * Returns: a GuC CT snapshot object that must be freed by the caller + * by using `xe_guc_ct_snapshot_free`. + */ +struct xe_guc_ct_snapshot *xe_guc_ct_snapshot_capture(struct xe_guc_ct *ct, + bool atomic) +{ + struct xe_device *xe = ct_to_xe(ct); + struct xe_guc_ct_snapshot *snapshot; + + snapshot = kzalloc(sizeof(*snapshot), + atomic ? GFP_ATOMIC : GFP_KERNEL); + + if (!snapshot) { + drm_err(&xe->drm, "Skipping CTB snapshot entirely.\n"); + return NULL; + } + + if (ct->enabled) { + snapshot->ct_enabled = true; + snapshot->g2h_outstanding = READ_ONCE(ct->g2h_outstanding); + guc_ctb_snapshot_capture(xe, &ct->ctbs.h2g, + &snapshot->h2g, atomic); + guc_ctb_snapshot_capture(xe, &ct->ctbs.g2h, + &snapshot->g2h, atomic); + } + + return snapshot; +} + +/** + * xe_guc_ct_snapshot_print - Print out a given GuC CT snapshot. + * @snapshot: GuC CT snapshot object. + * @p: drm_printer where it will be printed out. + * + * This function prints out a given GuC CT snapshot object. + */ +void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot, + struct drm_printer *p) +{ + if (!snapshot) + return; + + if (snapshot->ct_enabled) { + drm_puts(p, "\nH2G CTB (all sizes in DW):\n"); + guc_ctb_snapshot_print(&snapshot->h2g, p); + + drm_puts(p, "\nG2H CTB (all sizes in DW):\n"); + guc_ctb_snapshot_print(&snapshot->g2h, p); + + drm_printf(p, "\tg2h outstanding: %d\n", + snapshot->g2h_outstanding); + } else { + drm_puts(p, "\nCT disabled\n"); + } +} + +/** + * xe_guc_ct_snapshot_free - Free all allocated objects for a given snapshot. + * @snapshot: GuC CT snapshot object. + * + * This function free all the memory that needed to be allocated at capture + * time. 
+ */ +void xe_guc_ct_snapshot_free(struct xe_guc_ct_snapshot *snapshot) +{ + if (!snapshot) + return; + + guc_ctb_snapshot_free(&snapshot->h2g); + guc_ctb_snapshot_free(&snapshot->g2h); + kfree(snapshot); +} + +/** + * xe_guc_ct_print - GuC CT Print. + * @ct: GuC CT. + * @p: drm_printer where it will be printed out. + * @atomic: Boolean to indicate if this is called from atomic context like + * reset or CTB handler or from some regular path like debugfs. + * + * This function quickly capture a snapshot and immediately print it out. + */ +void xe_guc_ct_print(struct xe_guc_ct *ct, struct drm_printer *p, bool atomic) +{ + struct xe_guc_ct_snapshot *snapshot; + + snapshot = xe_guc_ct_snapshot_capture(ct, atomic); + xe_guc_ct_snapshot_print(snapshot, p); + xe_guc_ct_snapshot_free(snapshot); +} diff --git a/drivers/gpu/drm/xe/xe_guc_ct.h b/drivers/gpu/drm/xe/xe_guc_ct.h new file mode 100644 index 000000000000..f15f8a4857e0 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_ct.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GUC_CT_H_ +#define _XE_GUC_CT_H_ + +#include "xe_guc_ct_types.h" + +struct drm_printer; + +int xe_guc_ct_init(struct xe_guc_ct *ct); +int xe_guc_ct_enable(struct xe_guc_ct *ct); +void xe_guc_ct_disable(struct xe_guc_ct *ct); +void xe_guc_ct_fast_path(struct xe_guc_ct *ct); + +struct xe_guc_ct_snapshot * +xe_guc_ct_snapshot_capture(struct xe_guc_ct *ct, bool atomic); +void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot, + struct drm_printer *p); +void xe_guc_ct_snapshot_free(struct xe_guc_ct_snapshot *snapshot); +void xe_guc_ct_print(struct xe_guc_ct *ct, struct drm_printer *p, bool atomic); + +static inline void xe_guc_ct_irq_handler(struct xe_guc_ct *ct) +{ + wake_up_all(&ct->wq); + if (ct->enabled) + queue_work(system_unbound_wq, &ct->g2h_worker); + xe_guc_ct_fast_path(ct); +} + +/* Basic CT send / receives */ +int xe_guc_ct_send(struct xe_guc_ct *ct, const u32 *action, u32 len, + u32 g2h_len, u32 num_g2h); +int xe_guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len, + u32 g2h_len, u32 num_g2h); +int xe_guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len, + u32 *response_buffer); +static inline int +xe_guc_ct_send_block(struct xe_guc_ct *ct, const u32 *action, u32 len) +{ + return xe_guc_ct_send_recv(ct, action, len, NULL); +} + +/* This is only version of the send CT you can call from a G2H handler */ +int xe_guc_ct_send_g2h_handler(struct xe_guc_ct *ct, const u32 *action, + u32 len); + +/* Can't fail because a GT reset is in progress */ +int xe_guc_ct_send_recv_no_fail(struct xe_guc_ct *ct, const u32 *action, + u32 len, u32 *response_buffer); +static inline int +xe_guc_ct_send_block_no_fail(struct xe_guc_ct *ct, const u32 *action, u32 len) +{ + return xe_guc_ct_send_recv_no_fail(ct, action, len, NULL); +} + +#endif diff --git a/drivers/gpu/drm/xe/xe_guc_ct_types.h b/drivers/gpu/drm/xe/xe_guc_ct_types.h new file mode 100644 index 000000000000..d814d4ee3fc6 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_ct_types.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GUC_CT_TYPES_H_ +#define _XE_GUC_CT_TYPES_H_ + +#include <linux/interrupt.h> +#include <linux/iosys-map.h> +#include <linux/spinlock_types.h> +#include <linux/wait.h> +#include <linux/xarray.h> + +#include "abi/guc_communication_ctb_abi.h" + +struct xe_bo; + +/** + * struct guc_ctb_info - GuC command transport buffer (CTB) info + */ +struct 
guc_ctb_info { + /** @size: size of CTB commands (DW) */ + u32 size; + /** @resv_space: reserved space of CTB commands (DW) */ + u32 resv_space; + /** @head: head of CTB commands (DW) */ + u32 head; + /** @tail: tail of CTB commands (DW) */ + u32 tail; + /** @space: space in CTB commands (DW) */ + u32 space; + /** @broken: channel broken */ + bool broken; +}; + +/** + * struct guc_ctb - GuC command transport buffer (CTB) + */ +struct guc_ctb { + /** @desc: dma buffer map for CTB descriptor */ + struct iosys_map desc; + /** @cmds: dma buffer map for CTB commands */ + struct iosys_map cmds; + /** @info: CTB info */ + struct guc_ctb_info info; +}; + +/** + * struct guc_ctb_snapshot - GuC command transport buffer (CTB) snapshot + */ +struct guc_ctb_snapshot { + /** @desc: snapshot of the CTB descriptor */ + struct guc_ct_buffer_desc desc; + /** @cmds: snapshot of the CTB commands */ + u32 *cmds; + /** @info: snapshot of the CTB info */ + struct guc_ctb_info info; +}; + +/** + * struct xe_guc_ct_snapshot - GuC command transport (CT) snapshot + */ +struct xe_guc_ct_snapshot { + /** @ct_enabled: CT enabled info at capture time. */ + bool ct_enabled; + /** @g2h_outstanding: G2H outstanding info at the capture time */ + u32 g2h_outstanding; + /** @g2h: G2H CTB snapshot */ + struct guc_ctb_snapshot g2h; + /** @h2g: H2G CTB snapshot */ + struct guc_ctb_snapshot h2g; +}; + +/** + * struct xe_guc_ct - GuC command transport (CT) layer + * + * Includes a pair of CT buffers for bi-directional communication and tracking + * for the H2G and G2H requests sent and received through the buffers. + */ +struct xe_guc_ct { + /** @bo: XE BO for CT */ + struct xe_bo *bo; + /** @lock: protects everything in CT layer */ + struct mutex lock; + /** @fast_lock: protects G2H channel and credits */ + spinlock_t fast_lock; + /** @ctbs: buffers for sending and receiving commands */ + struct { + /** @send: Host to GuC (H2G, send) channel */ + struct guc_ctb h2g; + /** @recv: GuC to Host (G2H, receive) channel */ + struct guc_ctb g2h; + } ctbs; + /** @g2h_outstanding: number of outstanding G2H */ + u32 g2h_outstanding; + /** @g2h_worker: worker to process G2H messages */ + struct work_struct g2h_worker; + /** @enabled: CT enabled */ + bool enabled; + /** @fence_seqno: G2H fence seqno - 16 bits used by CT */ + u32 fence_seqno; + /** @fence_lookup: G2H fence lookup */ + struct xarray fence_lookup; + /** @wq: wait queue used for reliable CT sends and freeing G2H credits */ + wait_queue_head_t wq; + /** @g2h_fence_wq: wait queue used for G2H fencing */ + wait_queue_head_t g2h_fence_wq; + /** @msg: Message buffer */ + u32 msg[GUC_CTB_MSG_MAX_LEN]; + /** @fast_msg: Message buffer */ + u32 fast_msg[GUC_CTB_MSG_MAX_LEN]; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_guc_debugfs.c b/drivers/gpu/drm/xe/xe_guc_debugfs.c new file mode 100644 index 000000000000..ffd7d53bcc42 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_debugfs.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_guc_debugfs.h" + +#include <drm/drm_debugfs.h> +#include <drm/drm_managed.h> + +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_guc.h" +#include "xe_guc_ct.h" +#include "xe_guc_log.h" +#include "xe_macros.h" + +static struct xe_guc *node_to_guc(struct drm_info_node *node) +{ + return node->info_ent->data; +} + +static int guc_info(struct seq_file *m, void *data) +{ + struct xe_guc *guc = node_to_guc(m->private); + struct xe_device *xe = guc_to_xe(guc); + struct drm_printer p = 
drm_seq_file_printer(m); + + xe_device_mem_access_get(xe); + xe_guc_print_info(guc, &p); + xe_device_mem_access_put(xe); + + return 0; +} + +static int guc_log(struct seq_file *m, void *data) +{ + struct xe_guc *guc = node_to_guc(m->private); + struct xe_device *xe = guc_to_xe(guc); + struct drm_printer p = drm_seq_file_printer(m); + + xe_device_mem_access_get(xe); + xe_guc_log_print(&guc->log, &p); + xe_device_mem_access_put(xe); + + return 0; +} + +static const struct drm_info_list debugfs_list[] = { + {"guc_info", guc_info, 0}, + {"guc_log", guc_log, 0}, +}; + +void xe_guc_debugfs_register(struct xe_guc *guc, struct dentry *parent) +{ + struct drm_minor *minor = guc_to_xe(guc)->drm.primary; + struct drm_info_list *local; + int i; + +#define DEBUGFS_SIZE (ARRAY_SIZE(debugfs_list) * sizeof(struct drm_info_list)) + local = drmm_kmalloc(&guc_to_xe(guc)->drm, DEBUGFS_SIZE, GFP_KERNEL); + if (!local) + return; + + memcpy(local, debugfs_list, DEBUGFS_SIZE); +#undef DEBUGFS_SIZE + + for (i = 0; i < ARRAY_SIZE(debugfs_list); ++i) + local[i].data = guc; + + drm_debugfs_create_files(local, + ARRAY_SIZE(debugfs_list), + parent, minor); +} diff --git a/drivers/gpu/drm/xe/xe_guc_debugfs.h b/drivers/gpu/drm/xe/xe_guc_debugfs.h new file mode 100644 index 000000000000..4756dff26fca --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_debugfs.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GUC_DEBUGFS_H_ +#define _XE_GUC_DEBUGFS_H_ + +struct dentry; +struct xe_guc; + +void xe_guc_debugfs_register(struct xe_guc *guc, struct dentry *parent); + +#endif diff --git a/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h b/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h new file mode 100644 index 000000000000..4c39f01e4f52 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_exec_queue_types.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GUC_ENGINE_TYPES_H_ +#define _XE_GUC_ENGINE_TYPES_H_ + +#include <linux/spinlock.h> +#include <linux/workqueue.h> + +#include "xe_gpu_scheduler_types.h" + +struct dma_fence; +struct xe_exec_queue; + +/** + * struct xe_guc_exec_queue - GuC specific state for an xe_exec_queue + */ +struct xe_guc_exec_queue { + /** @q: Backpointer to parent xe_exec_queue */ + struct xe_exec_queue *q; + /** @sched: GPU scheduler for this xe_exec_queue */ + struct xe_gpu_scheduler sched; + /** @entity: Scheduler entity for this xe_exec_queue */ + struct xe_sched_entity entity; + /** + * @static_msgs: Static messages for this xe_exec_queue, used when + * a message needs to sent through the GPU scheduler but memory + * allocations are not allowed. 
+ */ +#define MAX_STATIC_MSG_TYPE 3 + struct xe_sched_msg static_msgs[MAX_STATIC_MSG_TYPE]; + /** @lr_tdr: long running TDR worker */ + struct work_struct lr_tdr; + /** @fini_async: do final fini async from this worker */ + struct work_struct fini_async; + /** @resume_time: time of last resume */ + u64 resume_time; + /** @state: GuC specific state for this xe_exec_queue */ + atomic_t state; + /** @wqi_head: work queue item tail */ + u32 wqi_head; + /** @wqi_tail: work queue item tail */ + u32 wqi_tail; + /** @id: GuC id for this exec_queue */ + u16 id; + /** @suspend_wait: wait queue used to wait on pending suspends */ + wait_queue_head_t suspend_wait; + /** @suspend_pending: a suspend of the exec_queue is pending */ + bool suspend_pending; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_guc_fwif.h b/drivers/gpu/drm/xe/xe_guc_fwif.h new file mode 100644 index 000000000000..4dd5a88a7826 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_fwif.h @@ -0,0 +1,361 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GUC_FWIF_H +#define _XE_GUC_FWIF_H + +#include <linux/bits.h> + +#include "abi/guc_klvs_abi.h" + +#define G2H_LEN_DW_SCHED_CONTEXT_MODE_SET 4 +#define G2H_LEN_DW_DEREGISTER_CONTEXT 3 +#define G2H_LEN_DW_TLB_INVALIDATE 3 + +#define GUC_CONTEXT_DISABLE 0 +#define GUC_CONTEXT_ENABLE 1 + +#define GUC_CLIENT_PRIORITY_KMD_HIGH 0 +#define GUC_CLIENT_PRIORITY_HIGH 1 +#define GUC_CLIENT_PRIORITY_KMD_NORMAL 2 +#define GUC_CLIENT_PRIORITY_NORMAL 3 +#define GUC_CLIENT_PRIORITY_NUM 4 + +#define GUC_RENDER_ENGINE 0 +#define GUC_VIDEO_ENGINE 1 +#define GUC_BLITTER_ENGINE 2 +#define GUC_VIDEOENHANCE_ENGINE 3 +#define GUC_VIDEO_ENGINE2 4 +#define GUC_MAX_ENGINES_NUM (GUC_VIDEO_ENGINE2 + 1) + +#define GUC_RENDER_CLASS 0 +#define GUC_VIDEO_CLASS 1 +#define GUC_VIDEOENHANCE_CLASS 2 +#define GUC_BLITTER_CLASS 3 +#define GUC_COMPUTE_CLASS 4 +#define GUC_GSC_OTHER_CLASS 5 +#define GUC_LAST_ENGINE_CLASS GUC_GSC_OTHER_CLASS +#define GUC_MAX_ENGINE_CLASSES 16 +#define GUC_MAX_INSTANCES_PER_CLASS 32 + +/* Helper for context registration H2G */ +struct guc_ctxt_registration_info { + u32 flags; + u32 context_idx; + u32 engine_class; + u32 engine_submit_mask; + u32 wq_desc_lo; + u32 wq_desc_hi; + u32 wq_base_lo; + u32 wq_base_hi; + u32 wq_size; + u32 hwlrca_lo; + u32 hwlrca_hi; +}; +#define CONTEXT_REGISTRATION_FLAG_KMD BIT(0) + +/* 32-bit KLV structure as used by policy updates and others */ +struct guc_klv_generic_dw_t { + u32 kl; + u32 value; +} __packed; + +/* Format of the UPDATE_CONTEXT_POLICIES H2G data packet */ +struct guc_update_exec_queue_policy_header { + u32 action; + u32 guc_id; +} __packed; + +struct guc_update_exec_queue_policy { + struct guc_update_exec_queue_policy_header header; + struct guc_klv_generic_dw_t klv[GUC_CONTEXT_POLICIES_KLV_NUM_IDS]; +} __packed; + +/* GUC_CTL_* - Parameters for loading the GuC */ +#define GUC_CTL_LOG_PARAMS 0 +#define GUC_LOG_VALID BIT(0) +#define GUC_LOG_NOTIFY_ON_HALF_FULL BIT(1) +#define GUC_LOG_CAPTURE_ALLOC_UNITS BIT(2) +#define GUC_LOG_LOG_ALLOC_UNITS BIT(3) +#define GUC_LOG_CRASH_SHIFT 4 +#define GUC_LOG_CRASH_MASK (0x3 << GUC_LOG_CRASH_SHIFT) +#define GUC_LOG_DEBUG_SHIFT 6 +#define GUC_LOG_DEBUG_MASK (0xF << GUC_LOG_DEBUG_SHIFT) +#define GUC_LOG_CAPTURE_SHIFT 10 +#define GUC_LOG_CAPTURE_MASK (0x3 << GUC_LOG_CAPTURE_SHIFT) +#define GUC_LOG_BUF_ADDR_SHIFT 12 + +#define GUC_CTL_WA 1 +#define GUC_WA_GAM_CREDITS BIT(10) +#define GUC_WA_DUAL_QUEUE BIT(11) +#define GUC_WA_RCS_RESET_BEFORE_RC6 BIT(13) +#define 
GUC_WA_CONTEXT_ISOLATION BIT(15) +#define GUC_WA_PRE_PARSER BIT(14) +#define GUC_WA_HOLD_CCS_SWITCHOUT BIT(17) +#define GUC_WA_POLLCS BIT(18) +#define GUC_WA_RENDER_RST_RC6_EXIT BIT(19) +#define GUC_WA_RCS_REGS_IN_CCS_REGS_LIST BIT(21) + +#define GUC_CTL_FEATURE 2 +#define GUC_CTL_ENABLE_SLPC BIT(2) +#define GUC_CTL_DISABLE_SCHEDULER BIT(14) + +#define GUC_CTL_DEBUG 3 +#define GUC_LOG_VERBOSITY_SHIFT 0 +#define GUC_LOG_VERBOSITY_LOW (0 << GUC_LOG_VERBOSITY_SHIFT) +#define GUC_LOG_VERBOSITY_MED (1 << GUC_LOG_VERBOSITY_SHIFT) +#define GUC_LOG_VERBOSITY_HIGH (2 << GUC_LOG_VERBOSITY_SHIFT) +#define GUC_LOG_VERBOSITY_ULTRA (3 << GUC_LOG_VERBOSITY_SHIFT) +#define GUC_LOG_VERBOSITY_MIN 0 +#define GUC_LOG_VERBOSITY_MAX 3 +#define GUC_LOG_VERBOSITY_MASK 0x0000000f +#define GUC_LOG_DESTINATION_MASK (3 << 4) +#define GUC_LOG_DISABLED (1 << 6) +#define GUC_PROFILE_ENABLED (1 << 7) + +#define GUC_CTL_ADS 4 +#define GUC_ADS_ADDR_SHIFT 1 +#define GUC_ADS_ADDR_MASK (0xFFFFF << GUC_ADS_ADDR_SHIFT) + +#define GUC_CTL_DEVID 5 + +#define GUC_CTL_MAX_DWORDS 14 + +/* Scheduling policy settings */ + +#define GLOBAL_POLICY_MAX_NUM_WI 15 + +/* Don't reset an engine upon preemption failure */ +#define GLOBAL_POLICY_DISABLE_ENGINE_RESET BIT(0) + +#define GLOBAL_POLICY_DEFAULT_DPC_PROMOTE_TIME_US 500000 + +struct guc_policies { + u32 submission_queue_depth[GUC_MAX_ENGINE_CLASSES]; + /* + * In micro seconds. How much time to allow before DPC processing is + * called back via interrupt (to prevent DPC queue drain starving). + * Typically 1000s of micro seconds (example only, not granularity). + */ + u32 dpc_promote_time; + + /* Must be set to take these new values. */ + u32 is_valid; + + /* + * Max number of WIs to process per call. A large value may keep CS + * idle. + */ + u32 max_num_work_items; + + u32 global_flags; + u32 reserved[4]; +} __packed; + +/* GuC MMIO reg state struct */ +struct guc_mmio_reg { + u32 offset; + u32 value; + u32 flags; + u32 mask; +#define GUC_REGSET_MASKED BIT(0) +#define GUC_REGSET_MASKED_WITH_VALUE BIT(2) +#define GUC_REGSET_RESTORE_ONLY BIT(3) +} __packed; + +/* GuC register sets */ +struct guc_mmio_reg_set { + u32 address; + u16 count; + u16 reserved; +} __packed; + +/* Generic GT SysInfo data types */ +#define GUC_GENERIC_GT_SYSINFO_SLICE_ENABLED 0 +#define GUC_GENERIC_GT_SYSINFO_VDBOX_SFC_SUPPORT_MASK 1 +#define GUC_GENERIC_GT_SYSINFO_DOORBELL_COUNT_PER_SQIDI 2 +#define GUC_GENERIC_GT_SYSINFO_MAX 16 + +/* HW info */ +struct guc_gt_system_info { + u8 mapping_table[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS]; + u32 engine_enabled_masks[GUC_MAX_ENGINE_CLASSES]; + u32 generic_gt_sysinfo[GUC_GENERIC_GT_SYSINFO_MAX]; +} __packed; + +enum { + GUC_CAPTURE_LIST_INDEX_PF = 0, + GUC_CAPTURE_LIST_INDEX_VF = 1, + GUC_CAPTURE_LIST_INDEX_MAX = 2, +}; + +/* GuC Additional Data Struct */ +struct guc_ads { + struct guc_mmio_reg_set reg_state_list[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS]; + u32 reserved0; + u32 scheduler_policies; + u32 gt_system_info; + u32 reserved1; + u32 control_data; + u32 golden_context_lrca[GUC_MAX_ENGINE_CLASSES]; + u32 eng_state_size[GUC_MAX_ENGINE_CLASSES]; + u32 private_data; + u32 um_init_data; + u32 capture_instance[GUC_CAPTURE_LIST_INDEX_MAX][GUC_MAX_ENGINE_CLASSES]; + u32 capture_class[GUC_CAPTURE_LIST_INDEX_MAX][GUC_MAX_ENGINE_CLASSES]; + u32 capture_global[GUC_CAPTURE_LIST_INDEX_MAX]; + u32 reserved[14]; +} __packed; + +/* Engine usage stats */ +struct guc_engine_usage_record { + u32 current_context_index; + u32 last_switch_in_stamp; + u32 
reserved0; + u32 total_runtime; + u32 reserved1[4]; +} __packed; + +struct guc_engine_usage { + struct guc_engine_usage_record engines[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS]; +} __packed; + +/* This action will be programmed in C1BC - SOFT_SCRATCH_15_REG */ +enum xe_guc_recv_message { + XE_GUC_RECV_MSG_CRASH_DUMP_POSTED = BIT(1), + XE_GUC_RECV_MSG_EXCEPTION = BIT(30), +}; + +/* Page fault structures */ +struct access_counter_desc { + u32 dw0; +#define ACCESS_COUNTER_TYPE BIT(0) +#define ACCESS_COUNTER_SUBG_LO GENMASK(31, 1) + + u32 dw1; +#define ACCESS_COUNTER_SUBG_HI BIT(0) +#define ACCESS_COUNTER_RSVD0 GENMASK(2, 1) +#define ACCESS_COUNTER_ENG_INSTANCE GENMASK(8, 3) +#define ACCESS_COUNTER_ENG_CLASS GENMASK(11, 9) +#define ACCESS_COUNTER_ASID GENMASK(31, 12) + + u32 dw2; +#define ACCESS_COUNTER_VFID GENMASK(5, 0) +#define ACCESS_COUNTER_RSVD1 GENMASK(7, 6) +#define ACCESS_COUNTER_GRANULARITY GENMASK(10, 8) +#define ACCESS_COUNTER_RSVD2 GENMASK(16, 11) +#define ACCESS_COUNTER_VIRTUAL_ADDR_RANGE_LO GENMASK(31, 17) + + u32 dw3; +#define ACCESS_COUNTER_VIRTUAL_ADDR_RANGE_HI GENMASK(31, 0) +} __packed; + +enum guc_um_queue_type { + GUC_UM_HW_QUEUE_PAGE_FAULT = 0, + GUC_UM_HW_QUEUE_PAGE_FAULT_RESPONSE, + GUC_UM_HW_QUEUE_ACCESS_COUNTER, + GUC_UM_HW_QUEUE_MAX +}; + +struct guc_um_queue_params { + u64 base_dpa; + u32 base_ggtt_address; + u32 size_in_bytes; + u32 rsvd[4]; +} __packed; + +struct guc_um_init_params { + u64 page_response_timeout_in_us; + u32 rsvd[6]; + struct guc_um_queue_params queue_params[GUC_UM_HW_QUEUE_MAX]; +} __packed; + +enum xe_guc_fault_reply_type { + PFR_ACCESS = 0, + PFR_ENGINE, + PFR_VFID, + PFR_ALL, + PFR_INVALID +}; + +enum xe_guc_response_desc_type { + TLB_INVALIDATION_DESC = 0, + FAULT_RESPONSE_DESC +}; + +struct xe_guc_pagefault_desc { + u32 dw0; +#define PFD_FAULT_LEVEL GENMASK(2, 0) +#define PFD_SRC_ID GENMASK(10, 3) +#define PFD_RSVD_0 GENMASK(17, 11) +#define XE2_PFD_TRVA_FAULT BIT(18) +#define PFD_ENG_INSTANCE GENMASK(24, 19) +#define PFD_ENG_CLASS GENMASK(27, 25) +#define PFD_PDATA_LO GENMASK(31, 28) + + u32 dw1; +#define PFD_PDATA_HI GENMASK(11, 0) +#define PFD_PDATA_HI_SHIFT 4 +#define PFD_ASID GENMASK(31, 12) + + u32 dw2; +#define PFD_ACCESS_TYPE GENMASK(1, 0) +#define PFD_FAULT_TYPE GENMASK(3, 2) +#define PFD_VFID GENMASK(9, 4) +#define PFD_RSVD_1 GENMASK(11, 10) +#define PFD_VIRTUAL_ADDR_LO GENMASK(31, 12) +#define PFD_VIRTUAL_ADDR_LO_SHIFT 12 + + u32 dw3; +#define PFD_VIRTUAL_ADDR_HI GENMASK(31, 0) +#define PFD_VIRTUAL_ADDR_HI_SHIFT 32 +} __packed; + +struct xe_guc_pagefault_reply { + u32 dw0; +#define PFR_VALID BIT(0) +#define PFR_SUCCESS BIT(1) +#define PFR_REPLY GENMASK(4, 2) +#define PFR_RSVD_0 GENMASK(9, 5) +#define PFR_DESC_TYPE GENMASK(11, 10) +#define PFR_ASID GENMASK(31, 12) + + u32 dw1; +#define PFR_VFID GENMASK(5, 0) +#define PFR_RSVD_1 BIT(6) +#define PFR_ENG_INSTANCE GENMASK(12, 7) +#define PFR_ENG_CLASS GENMASK(15, 13) +#define PFR_PDATA GENMASK(31, 16) + + u32 dw2; +#define PFR_RSVD_2 GENMASK(31, 0) +} __packed; + +struct xe_guc_acc_desc { + u32 dw0; +#define ACC_TYPE BIT(0) +#define ACC_TRIGGER 0 +#define ACC_NOTIFY 1 +#define ACC_SUBG_LO GENMASK(31, 1) + + u32 dw1; +#define ACC_SUBG_HI BIT(0) +#define ACC_RSVD0 GENMASK(2, 1) +#define ACC_ENG_INSTANCE GENMASK(8, 3) +#define ACC_ENG_CLASS GENMASK(11, 9) +#define ACC_ASID GENMASK(31, 12) + + u32 dw2; +#define ACC_VFID GENMASK(5, 0) +#define ACC_RSVD1 GENMASK(7, 6) +#define ACC_GRANULARITY GENMASK(10, 8) +#define ACC_RSVD2 GENMASK(16, 11) +#define ACC_VIRTUAL_ADDR_RANGE_LO 
GENMASK(31, 17) + + u32 dw3; +#define ACC_VIRTUAL_ADDR_RANGE_HI GENMASK(31, 0) +} __packed; + +#endif diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.c b/drivers/gpu/drm/xe/xe_guc_hwconfig.c new file mode 100644 index 000000000000..2a13a00917f8 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_guc_hwconfig.h" + +#include <drm/drm_managed.h> + +#include "abi/guc_actions_abi.h" +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_guc.h" +#include "xe_map.h" + +static int send_get_hwconfig(struct xe_guc *guc, u32 ggtt_addr, u32 size) +{ + u32 action[] = { + XE_GUC_ACTION_GET_HWCONFIG, + lower_32_bits(ggtt_addr), + upper_32_bits(ggtt_addr), + size, + }; + + return xe_guc_mmio_send(guc, action, ARRAY_SIZE(action)); +} + +static int guc_hwconfig_size(struct xe_guc *guc, u32 *size) +{ + int ret = send_get_hwconfig(guc, 0, 0); + + if (ret < 0) + return ret; + + *size = ret; + return 0; +} + +static int guc_hwconfig_copy(struct xe_guc *guc) +{ + int ret = send_get_hwconfig(guc, xe_bo_ggtt_addr(guc->hwconfig.bo), + guc->hwconfig.size); + + if (ret < 0) + return ret; + + return 0; +} + +int xe_guc_hwconfig_init(struct xe_guc *guc) +{ + struct xe_device *xe = guc_to_xe(guc); + struct xe_gt *gt = guc_to_gt(guc); + struct xe_tile *tile = gt_to_tile(gt); + struct xe_bo *bo; + u32 size; + int err; + + /* Initialization already done */ + if (guc->hwconfig.bo) + return 0; + + /* + * All hwconfig the same across GTs so only GT0 needs to be configured + */ + if (gt->info.id != XE_GT0) + return 0; + + /* ADL_P, DG2+ supports hwconfig table */ + if (GRAPHICS_VERx100(xe) < 1255 && xe->info.platform != XE_ALDERLAKE_P) + return 0; + + err = guc_hwconfig_size(guc, &size); + if (err) + return err; + if (!size) + return -EINVAL; + + bo = xe_managed_bo_create_pin_map(xe, tile, PAGE_ALIGN(size), + XE_BO_CREATE_VRAM_IF_DGFX(tile) | + XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(bo)) + return PTR_ERR(bo); + guc->hwconfig.bo = bo; + guc->hwconfig.size = size; + + return guc_hwconfig_copy(guc); +} + +u32 xe_guc_hwconfig_size(struct xe_guc *guc) +{ + return !guc->hwconfig.bo ? 
0 : guc->hwconfig.size; +} + +void xe_guc_hwconfig_copy(struct xe_guc *guc, void *dst) +{ + struct xe_device *xe = guc_to_xe(guc); + + XE_WARN_ON(!guc->hwconfig.bo); + + xe_map_memcpy_from(xe, dst, &guc->hwconfig.bo->vmap, 0, + guc->hwconfig.size); +} diff --git a/drivers/gpu/drm/xe/xe_guc_hwconfig.h b/drivers/gpu/drm/xe/xe_guc_hwconfig.h new file mode 100644 index 000000000000..b5794d641900 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_hwconfig.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GUC_HWCONFIG_H_ +#define _XE_GUC_HWCONFIG_H_ + +#include <linux/types.h> + +struct xe_guc; + +int xe_guc_hwconfig_init(struct xe_guc *guc); +u32 xe_guc_hwconfig_size(struct xe_guc *guc); +void xe_guc_hwconfig_copy(struct xe_guc *guc, void *dst); + +#endif diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c new file mode 100644 index 000000000000..bcd2f4d34081 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_log.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_guc_log.h" + +#include <drm/drm_managed.h> + +#include "xe_bo.h" +#include "xe_gt.h" +#include "xe_map.h" +#include "xe_module.h" + +static struct xe_gt * +log_to_gt(struct xe_guc_log *log) +{ + return container_of(log, struct xe_gt, uc.guc.log); +} + +static struct xe_device * +log_to_xe(struct xe_guc_log *log) +{ + return gt_to_xe(log_to_gt(log)); +} + +static size_t guc_log_size(void) +{ + /* + * GuC Log buffer Layout + * + * +===============================+ 00B + * | Crash dump state header | + * +-------------------------------+ 32B + * | Debug state header | + * +-------------------------------+ 64B + * | Capture state header | + * +-------------------------------+ 96B + * | | + * +===============================+ PAGE_SIZE (4KB) + * | Crash Dump logs | + * +===============================+ + CRASH_SIZE + * | Debug logs | + * +===============================+ + DEBUG_SIZE + * | Capture logs | + * +===============================+ + CAPTURE_SIZE + */ + return PAGE_SIZE + CRASH_BUFFER_SIZE + DEBUG_BUFFER_SIZE + + CAPTURE_BUFFER_SIZE; +} + +void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p) +{ + struct xe_device *xe = log_to_xe(log); + size_t size; + int i, j; + + xe_assert(xe, log->bo); + + size = log->bo->size; + +#define DW_PER_READ 128 + xe_assert(xe, !(size % (DW_PER_READ * sizeof(u32)))); + for (i = 0; i < size / sizeof(u32); i += DW_PER_READ) { + u32 read[DW_PER_READ]; + + xe_map_memcpy_from(xe, read, &log->bo->vmap, i * sizeof(u32), + DW_PER_READ * sizeof(u32)); +#define DW_PER_PRINT 4 + for (j = 0; j < DW_PER_READ / DW_PER_PRINT; ++j) { + u32 *print = read + j * DW_PER_PRINT; + + drm_printf(p, "0x%08x 0x%08x 0x%08x 0x%08x\n", + *(print + 0), *(print + 1), + *(print + 2), *(print + 3)); + } + } +} + +int xe_guc_log_init(struct xe_guc_log *log) +{ + struct xe_device *xe = log_to_xe(log); + struct xe_tile *tile = gt_to_tile(log_to_gt(log)); + struct xe_bo *bo; + + bo = xe_managed_bo_create_pin_map(xe, tile, guc_log_size(), + XE_BO_CREATE_VRAM_IF_DGFX(tile) | + XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(bo)) + return PTR_ERR(bo); + + xe_map_memset(xe, &bo->vmap, 0, 0, guc_log_size()); + log->bo = bo; + log->level = xe_modparam.guc_log_level; + + return 0; +} diff --git a/drivers/gpu/drm/xe/xe_guc_log.h b/drivers/gpu/drm/xe/xe_guc_log.h new file mode 100644 index 000000000000..2d25ab28b4b3 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_log.h @@ -0,0 +1,48 @@ +/* 
SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GUC_LOG_H_ +#define _XE_GUC_LOG_H_ + +#include "xe_guc_log_types.h" + +struct drm_printer; + +#if IS_ENABLED(CONFIG_DRM_XE_LARGE_GUC_BUFFER) +#define CRASH_BUFFER_SIZE SZ_1M +#define DEBUG_BUFFER_SIZE SZ_8M +#define CAPTURE_BUFFER_SIZE SZ_2M +#else +#define CRASH_BUFFER_SIZE SZ_8K +#define DEBUG_BUFFER_SIZE SZ_64K +#define CAPTURE_BUFFER_SIZE SZ_16K +#endif +/* + * While we're using plain log level in i915, GuC controls are much more... + * "elaborate"? We have a couple of bits for verbosity, separate bit for actual + * log enabling, and separate bit for default logging - which "conveniently" + * ignores the enable bit. + */ +#define GUC_LOG_LEVEL_DISABLED 0 +#define GUC_LOG_LEVEL_NON_VERBOSE 1 +#define GUC_LOG_LEVEL_IS_ENABLED(x) ((x) > GUC_LOG_LEVEL_DISABLED) +#define GUC_LOG_LEVEL_IS_VERBOSE(x) ((x) > GUC_LOG_LEVEL_NON_VERBOSE) +#define GUC_LOG_LEVEL_TO_VERBOSITY(x) ({ \ + typeof(x) _x = (x); \ + GUC_LOG_LEVEL_IS_VERBOSE(_x) ? _x - 2 : 0; \ +}) +#define GUC_VERBOSITY_TO_LOG_LEVEL(x) ((x) + 2) +#define GUC_LOG_LEVEL_MAX GUC_VERBOSITY_TO_LOG_LEVEL(GUC_LOG_VERBOSITY_MAX) + +int xe_guc_log_init(struct xe_guc_log *log); +void xe_guc_log_print(struct xe_guc_log *log, struct drm_printer *p); + +static inline u32 +xe_guc_log_get_level(struct xe_guc_log *log) +{ + return log->level; +} + +#endif diff --git a/drivers/gpu/drm/xe/xe_guc_log_types.h b/drivers/gpu/drm/xe/xe_guc_log_types.h new file mode 100644 index 000000000000..125080d138a7 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_log_types.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GUC_LOG_TYPES_H_ +#define _XE_GUC_LOG_TYPES_H_ + +#include <linux/types.h> + +struct xe_bo; + +/** + * struct xe_guc_log - GuC log + */ +struct xe_guc_log { + /** @level: GuC log level */ + u32 level; + /** @bo: XE BO for GuC log */ + struct xe_bo *bo; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c b/drivers/gpu/drm/xe/xe_guc_pc.c new file mode 100644 index 000000000000..f71085228cb3 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_pc.c @@ -0,0 +1,1000 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_guc_pc.h" + +#include <linux/delay.h> + +#include <drm/drm_managed.h> + +#include "abi/guc_actions_abi.h" +#include "abi/guc_actions_slpc_abi.h" +#include "regs/xe_gt_regs.h" +#include "regs/xe_regs.h" +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_gt_idle.h" +#include "xe_gt_sysfs.h" +#include "xe_gt_types.h" +#include "xe_guc_ct.h" +#include "xe_map.h" +#include "xe_mmio.h" +#include "xe_pcode.h" + +#define MCHBAR_MIRROR_BASE_SNB 0x140000 + +#define RP_STATE_CAP XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x5998) +#define RP0_MASK REG_GENMASK(7, 0) +#define RP1_MASK REG_GENMASK(15, 8) +#define RPN_MASK REG_GENMASK(23, 16) + +#define FREQ_INFO_REC XE_REG(MCHBAR_MIRROR_BASE_SNB + 0x5ef0) +#define RPE_MASK REG_GENMASK(15, 8) + +#define GT_PERF_STATUS XE_REG(0x1381b4) +#define CAGF_MASK REG_GENMASK(19, 11) + +#define GT_FREQUENCY_MULTIPLIER 50 +#define GT_FREQUENCY_SCALER 3 + +/** + * DOC: GuC Power Conservation (PC) + * + * GuC Power Conservation (PC) supports multiple features for the most + * efficient and performing use of the GT when GuC submission is enabled, + * including frequency management, Render-C states management, and various + * algorithms for power balancing. 
+ * + * Single Loop Power Conservation (SLPC) is the name given to the suite of + * connected power conservation features in the GuC firmware. The firmware + * exposes a programming interface to the host for the control of SLPC. + * + * Frequency management: + * ===================== + * + * Xe driver enables SLPC with all of its defaults features and frequency + * selection, which varies per platform. + * + * Render-C States: + * ================ + * + * Render-C states is also a GuC PC feature that is now enabled in Xe for + * all platforms. + * + */ + +static struct xe_guc * +pc_to_guc(struct xe_guc_pc *pc) +{ + return container_of(pc, struct xe_guc, pc); +} + +static struct xe_device * +pc_to_xe(struct xe_guc_pc *pc) +{ + struct xe_guc *guc = pc_to_guc(pc); + struct xe_gt *gt = container_of(guc, struct xe_gt, uc.guc); + + return gt_to_xe(gt); +} + +static struct xe_gt * +pc_to_gt(struct xe_guc_pc *pc) +{ + return container_of(pc, struct xe_gt, uc.guc.pc); +} + +static struct iosys_map * +pc_to_maps(struct xe_guc_pc *pc) +{ + return &pc->bo->vmap; +} + +#define slpc_shared_data_read(pc_, field_) \ + xe_map_rd_field(pc_to_xe(pc_), pc_to_maps(pc_), 0, \ + struct slpc_shared_data, field_) + +#define slpc_shared_data_write(pc_, field_, val_) \ + xe_map_wr_field(pc_to_xe(pc_), pc_to_maps(pc_), 0, \ + struct slpc_shared_data, field_, val_) + +#define SLPC_EVENT(id, count) \ + (FIELD_PREP(HOST2GUC_PC_SLPC_REQUEST_MSG_1_EVENT_ID, id) | \ + FIELD_PREP(HOST2GUC_PC_SLPC_REQUEST_MSG_1_EVENT_ARGC, count)) + +static int wait_for_pc_state(struct xe_guc_pc *pc, + enum slpc_global_state state) +{ + int timeout_us = 5000; /* rought 5ms, but no need for precision */ + int slept, wait = 10; + + xe_device_assert_mem_access(pc_to_xe(pc)); + + for (slept = 0; slept < timeout_us;) { + if (slpc_shared_data_read(pc, header.global_state) == state) + return 0; + + usleep_range(wait, wait << 1); + slept += wait; + wait <<= 1; + if (slept + wait > timeout_us) + wait = timeout_us - slept; + } + + return -ETIMEDOUT; +} + +static int pc_action_reset(struct xe_guc_pc *pc) +{ + struct xe_guc_ct *ct = &pc_to_guc(pc)->ct; + int ret; + u32 action[] = { + GUC_ACTION_HOST2GUC_PC_SLPC_REQUEST, + SLPC_EVENT(SLPC_EVENT_RESET, 2), + xe_bo_ggtt_addr(pc->bo), + 0, + }; + + ret = xe_guc_ct_send(ct, action, ARRAY_SIZE(action), 0, 0); + if (ret) + drm_err(&pc_to_xe(pc)->drm, "GuC PC reset: %pe", ERR_PTR(ret)); + + return ret; +} + +static int pc_action_shutdown(struct xe_guc_pc *pc) +{ + struct xe_guc_ct *ct = &pc_to_guc(pc)->ct; + int ret; + u32 action[] = { + GUC_ACTION_HOST2GUC_PC_SLPC_REQUEST, + SLPC_EVENT(SLPC_EVENT_SHUTDOWN, 2), + xe_bo_ggtt_addr(pc->bo), + 0, + }; + + ret = xe_guc_ct_send(ct, action, ARRAY_SIZE(action), 0, 0); + if (ret) + drm_err(&pc_to_xe(pc)->drm, "GuC PC shutdown %pe", + ERR_PTR(ret)); + + return ret; +} + +static int pc_action_query_task_state(struct xe_guc_pc *pc) +{ + struct xe_guc_ct *ct = &pc_to_guc(pc)->ct; + int ret; + u32 action[] = { + GUC_ACTION_HOST2GUC_PC_SLPC_REQUEST, + SLPC_EVENT(SLPC_EVENT_QUERY_TASK_STATE, 2), + xe_bo_ggtt_addr(pc->bo), + 0, + }; + + if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING)) + return -EAGAIN; + + /* Blocking here to ensure the results are ready before reading them */ + ret = xe_guc_ct_send_block(ct, action, ARRAY_SIZE(action)); + if (ret) + drm_err(&pc_to_xe(pc)->drm, + "GuC PC query task state failed: %pe", ERR_PTR(ret)); + + return ret; +} + +static int pc_action_set_param(struct xe_guc_pc *pc, u8 id, u32 value) +{ + struct xe_guc_ct *ct = &pc_to_guc(pc)->ct; + 
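+	/*
+	 * Editor's note, a hedged sketch of the H2G layout used by the SLPC
+	 * helpers in this file (not part of the original patch): dword 0 is
+	 * GUC_ACTION_HOST2GUC_PC_SLPC_REQUEST, dword 1 packs the event id
+	 * and argument count via SLPC_EVENT(), and the remaining dwords are
+	 * the arguments themselves, e.g. for a parameter update:
+	 *
+	 *	u32 action[] = {
+	 *		GUC_ACTION_HOST2GUC_PC_SLPC_REQUEST,
+	 *		SLPC_EVENT(SLPC_EVENT_PARAMETER_SET, 2),
+	 *		id,	// which SLPC parameter to change
+	 *		value,	// new value for that parameter
+	 *	};
+	 *
+	 * pc_action_reset(), pc_action_shutdown() and
+	 * pc_action_query_task_state() above follow the same pattern, with
+	 * the shared-data GGTT address in the payload instead.
+	 */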
int ret; + u32 action[] = { + GUC_ACTION_HOST2GUC_PC_SLPC_REQUEST, + SLPC_EVENT(SLPC_EVENT_PARAMETER_SET, 2), + id, + value, + }; + + if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING)) + return -EAGAIN; + + ret = xe_guc_ct_send(ct, action, ARRAY_SIZE(action), 0, 0); + if (ret) + drm_err(&pc_to_xe(pc)->drm, "GuC PC set param failed: %pe", + ERR_PTR(ret)); + + return ret; +} + +static int pc_action_setup_gucrc(struct xe_guc_pc *pc, u32 mode) +{ + struct xe_guc_ct *ct = &pc_to_guc(pc)->ct; + u32 action[] = { + XE_GUC_ACTION_SETUP_PC_GUCRC, + mode, + }; + int ret; + + ret = xe_guc_ct_send(ct, action, ARRAY_SIZE(action), 0, 0); + if (ret) + drm_err(&pc_to_xe(pc)->drm, "GuC RC enable failed: %pe", + ERR_PTR(ret)); + return ret; +} + +static u32 decode_freq(u32 raw) +{ + return DIV_ROUND_CLOSEST(raw * GT_FREQUENCY_MULTIPLIER, + GT_FREQUENCY_SCALER); +} + +static u32 encode_freq(u32 freq) +{ + return DIV_ROUND_CLOSEST(freq * GT_FREQUENCY_SCALER, + GT_FREQUENCY_MULTIPLIER); +} + +static u32 pc_get_min_freq(struct xe_guc_pc *pc) +{ + u32 freq; + + freq = FIELD_GET(SLPC_MIN_UNSLICE_FREQ_MASK, + slpc_shared_data_read(pc, task_state_data.freq)); + + return decode_freq(freq); +} + +static void pc_set_manual_rp_ctrl(struct xe_guc_pc *pc, bool enable) +{ + struct xe_gt *gt = pc_to_gt(pc); + u32 state = enable ? RPSWCTL_ENABLE : RPSWCTL_DISABLE; + + /* Allow/Disallow punit to process software freq requests */ + xe_mmio_write32(gt, RP_CONTROL, state); +} + +static void pc_set_cur_freq(struct xe_guc_pc *pc, u32 freq) +{ + struct xe_gt *gt = pc_to_gt(pc); + u32 rpnswreq; + + pc_set_manual_rp_ctrl(pc, true); + + /* Req freq is in units of 16.66 Mhz */ + rpnswreq = REG_FIELD_PREP(REQ_RATIO_MASK, encode_freq(freq)); + xe_mmio_write32(gt, RPNSWREQ, rpnswreq); + + /* Sleep for a small time to allow pcode to respond */ + usleep_range(100, 300); + + pc_set_manual_rp_ctrl(pc, false); +} + +static int pc_set_min_freq(struct xe_guc_pc *pc, u32 freq) +{ + /* + * Let's only check for the rpn-rp0 range. If max < min, + * min becomes a fixed request. + */ + if (freq < pc->rpn_freq || freq > pc->rp0_freq) + return -EINVAL; + + /* + * GuC policy is to elevate minimum frequency to the efficient levels + * Our goal is to have the admin choices respected. + */ + pc_action_set_param(pc, SLPC_PARAM_IGNORE_EFFICIENT_FREQUENCY, + freq < pc->rpe_freq); + + return pc_action_set_param(pc, + SLPC_PARAM_GLOBAL_MIN_GT_UNSLICE_FREQ_MHZ, + freq); +} + +static int pc_get_max_freq(struct xe_guc_pc *pc) +{ + u32 freq; + + freq = FIELD_GET(SLPC_MAX_UNSLICE_FREQ_MASK, + slpc_shared_data_read(pc, task_state_data.freq)); + + return decode_freq(freq); +} + +static int pc_set_max_freq(struct xe_guc_pc *pc, u32 freq) +{ + /* + * Let's only check for the rpn-rp0 range. If max < min, + * min becomes a fixed request. + * Also, overclocking is not supported. 
+ */ + if (freq < pc->rpn_freq || freq > pc->rp0_freq) + return -EINVAL; + + return pc_action_set_param(pc, + SLPC_PARAM_GLOBAL_MAX_GT_UNSLICE_FREQ_MHZ, + freq); +} + +static void mtl_update_rpe_value(struct xe_guc_pc *pc) +{ + struct xe_gt *gt = pc_to_gt(pc); + u32 reg; + + if (xe_gt_is_media_type(gt)) + reg = xe_mmio_read32(gt, MTL_MPE_FREQUENCY); + else + reg = xe_mmio_read32(gt, MTL_GT_RPE_FREQUENCY); + + pc->rpe_freq = decode_freq(REG_FIELD_GET(MTL_RPE_MASK, reg)); +} + +static void tgl_update_rpe_value(struct xe_guc_pc *pc) +{ + struct xe_gt *gt = pc_to_gt(pc); + struct xe_device *xe = gt_to_xe(gt); + u32 reg; + + /* + * For PVC we still need to use fused RP1 as the approximation for RPe + * For other platforms than PVC we get the resolved RPe directly from + * PCODE at a different register + */ + if (xe->info.platform == XE_PVC) + reg = xe_mmio_read32(gt, PVC_RP_STATE_CAP); + else + reg = xe_mmio_read32(gt, FREQ_INFO_REC); + + pc->rpe_freq = REG_FIELD_GET(RPE_MASK, reg) * GT_FREQUENCY_MULTIPLIER; +} + +static void pc_update_rp_values(struct xe_guc_pc *pc) +{ + struct xe_gt *gt = pc_to_gt(pc); + struct xe_device *xe = gt_to_xe(gt); + + if (GRAPHICS_VERx100(xe) >= 1270) + mtl_update_rpe_value(pc); + else + tgl_update_rpe_value(pc); + + /* + * RPe is decided at runtime by PCODE. In the rare case where that's + * smaller than the fused min, we will trust the PCODE and use that + * as our minimum one. + */ + pc->rpn_freq = min(pc->rpn_freq, pc->rpe_freq); +} + +/** + * xe_guc_pc_get_act_freq - Get Actual running frequency + * @pc: The GuC PC + * + * Returns: The Actual running frequency. Which might be 0 if GT is in Render-C sleep state (RC6). + */ +u32 xe_guc_pc_get_act_freq(struct xe_guc_pc *pc) +{ + struct xe_gt *gt = pc_to_gt(pc); + struct xe_device *xe = gt_to_xe(gt); + u32 freq; + + xe_device_mem_access_get(gt_to_xe(gt)); + + /* When in RC6, actual frequency reported will be 0. */ + if (GRAPHICS_VERx100(xe) >= 1270) { + freq = xe_mmio_read32(gt, MTL_MIRROR_TARGET_WP1); + freq = REG_FIELD_GET(MTL_CAGF_MASK, freq); + } else { + freq = xe_mmio_read32(gt, GT_PERF_STATUS); + freq = REG_FIELD_GET(CAGF_MASK, freq); + } + + freq = decode_freq(freq); + + xe_device_mem_access_put(gt_to_xe(gt)); + + return freq; +} + +/** + * xe_guc_pc_get_cur_freq - Get Current requested frequency + * @pc: The GuC PC + * @freq: A pointer to a u32 where the freq value will be returned + * + * Returns: 0 on success, + * -EAGAIN if GuC PC not ready (likely in middle of a reset). + */ +int xe_guc_pc_get_cur_freq(struct xe_guc_pc *pc, u32 *freq) +{ + struct xe_gt *gt = pc_to_gt(pc); + int ret; + + xe_device_mem_access_get(gt_to_xe(gt)); + /* + * GuC SLPC plays with cur freq request when GuCRC is enabled + * Block RC6 for a more reliable read. + */ + ret = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL); + if (ret) + goto out; + + *freq = xe_mmio_read32(gt, RPNSWREQ); + + *freq = REG_FIELD_GET(REQ_RATIO_MASK, *freq); + *freq = decode_freq(*freq); + + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); +out: + xe_device_mem_access_put(gt_to_xe(gt)); + return ret; +} + +/** + * xe_guc_pc_get_rp0_freq - Get the RP0 freq + * @pc: The GuC PC + * + * Returns: RP0 freq. + */ +u32 xe_guc_pc_get_rp0_freq(struct xe_guc_pc *pc) +{ + return pc->rp0_freq; +} + +/** + * xe_guc_pc_get_rpe_freq - Get the RPe freq + * @pc: The GuC PC + * + * Returns: RPe freq. 
+ */ +u32 xe_guc_pc_get_rpe_freq(struct xe_guc_pc *pc) +{ + struct xe_gt *gt = pc_to_gt(pc); + struct xe_device *xe = gt_to_xe(gt); + + xe_device_mem_access_get(xe); + pc_update_rp_values(pc); + xe_device_mem_access_put(xe); + + return pc->rpe_freq; +} + +/** + * xe_guc_pc_get_rpn_freq - Get the RPn freq + * @pc: The GuC PC + * + * Returns: RPn freq. + */ +u32 xe_guc_pc_get_rpn_freq(struct xe_guc_pc *pc) +{ + return pc->rpn_freq; +} + +/** + * xe_guc_pc_get_min_freq - Get the min operational frequency + * @pc: The GuC PC + * @freq: A pointer to a u32 where the freq value will be returned + * + * Returns: 0 on success, + * -EAGAIN if GuC PC not ready (likely in middle of a reset). + */ +int xe_guc_pc_get_min_freq(struct xe_guc_pc *pc, u32 *freq) +{ + struct xe_gt *gt = pc_to_gt(pc); + int ret; + + xe_device_mem_access_get(pc_to_xe(pc)); + mutex_lock(&pc->freq_lock); + if (!pc->freq_ready) { + /* Might be in the middle of a gt reset */ + ret = -EAGAIN; + goto out; + } + + /* + * GuC SLPC plays with min freq request when GuCRC is enabled + * Block RC6 for a more reliable read. + */ + ret = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL); + if (ret) + goto out; + + ret = pc_action_query_task_state(pc); + if (ret) + goto fw; + + *freq = pc_get_min_freq(pc); + +fw: + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); +out: + mutex_unlock(&pc->freq_lock); + xe_device_mem_access_put(pc_to_xe(pc)); + return ret; +} + +/** + * xe_guc_pc_set_min_freq - Set the minimal operational frequency + * @pc: The GuC PC + * @freq: The selected minimal frequency + * + * Returns: 0 on success, + * -EAGAIN if GuC PC not ready (likely in middle of a reset), + * -EINVAL if value out of bounds. + */ +int xe_guc_pc_set_min_freq(struct xe_guc_pc *pc, u32 freq) +{ + int ret; + + xe_device_mem_access_get(pc_to_xe(pc)); + mutex_lock(&pc->freq_lock); + if (!pc->freq_ready) { + /* Might be in the middle of a gt reset */ + ret = -EAGAIN; + goto out; + } + + ret = pc_set_min_freq(pc, freq); + if (ret) + goto out; + + pc->user_requested_min = freq; + +out: + mutex_unlock(&pc->freq_lock); + xe_device_mem_access_put(pc_to_xe(pc)); + + return ret; +} + +/** + * xe_guc_pc_get_max_freq - Get Maximum operational frequency + * @pc: The GuC PC + * @freq: A pointer to a u32 where the freq value will be returned + * + * Returns: 0 on success, + * -EAGAIN if GuC PC not ready (likely in middle of a reset). + */ +int xe_guc_pc_get_max_freq(struct xe_guc_pc *pc, u32 *freq) +{ + int ret; + + xe_device_mem_access_get(pc_to_xe(pc)); + mutex_lock(&pc->freq_lock); + if (!pc->freq_ready) { + /* Might be in the middle of a gt reset */ + ret = -EAGAIN; + goto out; + } + + ret = pc_action_query_task_state(pc); + if (ret) + goto out; + + *freq = pc_get_max_freq(pc); + +out: + mutex_unlock(&pc->freq_lock); + xe_device_mem_access_put(pc_to_xe(pc)); + return ret; +} + +/** + * xe_guc_pc_set_max_freq - Set the maximum operational frequency + * @pc: The GuC PC + * @freq: The selected maximum frequency value + * + * Returns: 0 on success, + * -EAGAIN if GuC PC not ready (likely in middle of a reset), + * -EINVAL if value out of bounds. 
+ */ +int xe_guc_pc_set_max_freq(struct xe_guc_pc *pc, u32 freq) +{ + int ret; + + xe_device_mem_access_get(pc_to_xe(pc)); + mutex_lock(&pc->freq_lock); + if (!pc->freq_ready) { + /* Might be in the middle of a gt reset */ + ret = -EAGAIN; + goto out; + } + + ret = pc_set_max_freq(pc, freq); + if (ret) + goto out; + + pc->user_requested_max = freq; + +out: + mutex_unlock(&pc->freq_lock); + xe_device_mem_access_put(pc_to_xe(pc)); + return ret; +} + +/** + * xe_guc_pc_c_status - get the current GT C state + * @pc: XE_GuC_PC instance + */ +enum xe_gt_idle_state xe_guc_pc_c_status(struct xe_guc_pc *pc) +{ + struct xe_gt *gt = pc_to_gt(pc); + u32 reg, gt_c_state; + + xe_device_mem_access_get(gt_to_xe(gt)); + + if (GRAPHICS_VERx100(gt_to_xe(gt)) >= 1270) { + reg = xe_mmio_read32(gt, MTL_MIRROR_TARGET_WP1); + gt_c_state = REG_FIELD_GET(MTL_CC_MASK, reg); + } else { + reg = xe_mmio_read32(gt, GT_CORE_STATUS); + gt_c_state = REG_FIELD_GET(RCN_MASK, reg); + } + + xe_device_mem_access_put(gt_to_xe(gt)); + + switch (gt_c_state) { + case GT_C6: + return GT_IDLE_C6; + case GT_C0: + return GT_IDLE_C0; + default: + return GT_IDLE_UNKNOWN; + } +} + +/** + * xe_guc_pc_rc6_residency - rc6 residency counter + * @pc: Xe_GuC_PC instance + */ +u64 xe_guc_pc_rc6_residency(struct xe_guc_pc *pc) +{ + struct xe_gt *gt = pc_to_gt(pc); + u32 reg; + + xe_device_mem_access_get(gt_to_xe(gt)); + reg = xe_mmio_read32(gt, GT_GFX_RC6); + xe_device_mem_access_put(gt_to_xe(gt)); + + return reg; +} + +/** + * xe_guc_pc_mc6_residency - mc6 residency counter + * @pc: Xe_GuC_PC instance + */ +u64 xe_guc_pc_mc6_residency(struct xe_guc_pc *pc) +{ + struct xe_gt *gt = pc_to_gt(pc); + u64 reg; + + xe_device_mem_access_get(gt_to_xe(gt)); + reg = xe_mmio_read32(gt, MTL_MEDIA_MC6); + xe_device_mem_access_put(gt_to_xe(gt)); + + return reg; +} + +static void mtl_init_fused_rp_values(struct xe_guc_pc *pc) +{ + struct xe_gt *gt = pc_to_gt(pc); + u32 reg; + + xe_device_assert_mem_access(pc_to_xe(pc)); + + if (xe_gt_is_media_type(gt)) + reg = xe_mmio_read32(gt, MTL_MEDIAP_STATE_CAP); + else + reg = xe_mmio_read32(gt, MTL_RP_STATE_CAP); + + pc->rp0_freq = decode_freq(REG_FIELD_GET(MTL_RP0_CAP_MASK, reg)); + + pc->rpn_freq = decode_freq(REG_FIELD_GET(MTL_RPN_CAP_MASK, reg)); +} + +static void tgl_init_fused_rp_values(struct xe_guc_pc *pc) +{ + struct xe_gt *gt = pc_to_gt(pc); + struct xe_device *xe = gt_to_xe(gt); + u32 reg; + + xe_device_assert_mem_access(pc_to_xe(pc)); + + if (xe->info.platform == XE_PVC) + reg = xe_mmio_read32(gt, PVC_RP_STATE_CAP); + else + reg = xe_mmio_read32(gt, RP_STATE_CAP); + pc->rp0_freq = REG_FIELD_GET(RP0_MASK, reg) * GT_FREQUENCY_MULTIPLIER; + pc->rpn_freq = REG_FIELD_GET(RPN_MASK, reg) * GT_FREQUENCY_MULTIPLIER; +} + +static void pc_init_fused_rp_values(struct xe_guc_pc *pc) +{ + struct xe_gt *gt = pc_to_gt(pc); + struct xe_device *xe = gt_to_xe(gt); + + if (GRAPHICS_VERx100(xe) >= 1270) + mtl_init_fused_rp_values(pc); + else + tgl_init_fused_rp_values(pc); +} + +/** + * xe_guc_pc_init_early - Initialize RPx values and request a higher GT + * frequency to allow faster GuC load times + * @pc: Xe_GuC_PC instance + */ +void xe_guc_pc_init_early(struct xe_guc_pc *pc) +{ + struct xe_gt *gt = pc_to_gt(pc); + + xe_force_wake_assert_held(gt_to_fw(gt), XE_FW_GT); + pc_init_fused_rp_values(pc); + pc_set_cur_freq(pc, pc->rp0_freq); +} + +static int pc_adjust_freq_bounds(struct xe_guc_pc *pc) +{ + int ret; + + lockdep_assert_held(&pc->freq_lock); + + ret = pc_action_query_task_state(pc); + if (ret) + return ret; + + /* + * 
GuC defaults to some RPmax that is not actually achievable without + * overclocking. Let's adjust it to the Hardware RP0, which is the + * regular maximum + */ + if (pc_get_max_freq(pc) > pc->rp0_freq) + pc_set_max_freq(pc, pc->rp0_freq); + + /* + * Same thing happens for Server platforms where min is listed as + * RPMax + */ + if (pc_get_min_freq(pc) > pc->rp0_freq) + pc_set_min_freq(pc, pc->rp0_freq); + + return 0; +} + +static int pc_adjust_requested_freq(struct xe_guc_pc *pc) +{ + int ret = 0; + + lockdep_assert_held(&pc->freq_lock); + + if (pc->user_requested_min != 0) { + ret = pc_set_min_freq(pc, pc->user_requested_min); + if (ret) + return ret; + } + + if (pc->user_requested_max != 0) { + ret = pc_set_max_freq(pc, pc->user_requested_max); + if (ret) + return ret; + } + + return ret; +} + +/** + * xe_guc_pc_gucrc_disable - Disable GuC RC + * @pc: Xe_GuC_PC instance + * + * Disables GuC RC by taking control of RC6 back from GuC. + * + * Return: 0 on success, negative error code on error. + */ +int xe_guc_pc_gucrc_disable(struct xe_guc_pc *pc) +{ + struct xe_device *xe = pc_to_xe(pc); + struct xe_gt *gt = pc_to_gt(pc); + int ret = 0; + + if (xe->info.skip_guc_pc) + return 0; + + xe_device_mem_access_get(pc_to_xe(pc)); + + ret = pc_action_setup_gucrc(pc, XE_GUCRC_HOST_CONTROL); + if (ret) + goto out; + + ret = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL); + if (ret) + goto out; + + xe_gt_idle_disable_c6(gt); + + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + +out: + xe_device_mem_access_put(pc_to_xe(pc)); + return ret; +} + +static void pc_init_pcode_freq(struct xe_guc_pc *pc) +{ + u32 min = DIV_ROUND_CLOSEST(pc->rpn_freq, GT_FREQUENCY_MULTIPLIER); + u32 max = DIV_ROUND_CLOSEST(pc->rp0_freq, GT_FREQUENCY_MULTIPLIER); + + XE_WARN_ON(xe_pcode_init_min_freq_table(pc_to_gt(pc), min, max)); +} + +static int pc_init_freqs(struct xe_guc_pc *pc) +{ + int ret; + + mutex_lock(&pc->freq_lock); + + ret = pc_adjust_freq_bounds(pc); + if (ret) + goto out; + + ret = pc_adjust_requested_freq(pc); + if (ret) + goto out; + + pc_update_rp_values(pc); + + pc_init_pcode_freq(pc); + + /* + * The frequencies are really ready for use only after the user + * requested ones got restored. 
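	 * Until this point the xe_guc_pc_get_min_freq()/xe_guc_pc_set_min_freq()
	 * and xe_guc_pc_get_max_freq()/xe_guc_pc_set_max_freq() entry points
	 * keep returning -EAGAIN: xe_guc_pc_stop() clears freq_ready under
	 * freq_lock, and it is only set again below once the bounds and any
	 * stashed user requests have been re-applied.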
+ */ + pc->freq_ready = true; + +out: + mutex_unlock(&pc->freq_lock); + return ret; +} + +/** + * xe_guc_pc_start - Start GuC's Power Conservation component + * @pc: Xe_GuC_PC instance + */ +int xe_guc_pc_start(struct xe_guc_pc *pc) +{ + struct xe_device *xe = pc_to_xe(pc); + struct xe_gt *gt = pc_to_gt(pc); + u32 size = PAGE_ALIGN(sizeof(struct slpc_shared_data)); + int ret; + + xe_gt_assert(gt, xe_device_uc_enabled(xe)); + + xe_device_mem_access_get(pc_to_xe(pc)); + + ret = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL); + if (ret) + goto out_fail_force_wake; + + if (xe->info.skip_guc_pc) { + if (xe->info.platform != XE_PVC) + xe_gt_idle_enable_c6(gt); + + /* Request max possible since dynamic freq mgmt is not enabled */ + pc_set_cur_freq(pc, UINT_MAX); + + ret = 0; + goto out; + } + + memset(pc->bo->vmap.vaddr, 0, size); + slpc_shared_data_write(pc, header.size, size); + + ret = pc_action_reset(pc); + if (ret) + goto out; + + if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_RUNNING)) { + drm_err(&pc_to_xe(pc)->drm, "GuC PC Start failed\n"); + ret = -EIO; + goto out; + } + + ret = pc_init_freqs(pc); + if (ret) + goto out; + + if (xe->info.platform == XE_PVC) { + xe_guc_pc_gucrc_disable(pc); + ret = 0; + goto out; + } + + ret = pc_action_setup_gucrc(pc, XE_GUCRC_FIRMWARE_CONTROL); + +out: + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); +out_fail_force_wake: + xe_device_mem_access_put(pc_to_xe(pc)); + return ret; +} + +/** + * xe_guc_pc_stop - Stop GuC's Power Conservation component + * @pc: Xe_GuC_PC instance + */ +int xe_guc_pc_stop(struct xe_guc_pc *pc) +{ + struct xe_device *xe = pc_to_xe(pc); + int ret; + + xe_device_mem_access_get(pc_to_xe(pc)); + + if (xe->info.skip_guc_pc) { + xe_gt_idle_disable_c6(pc_to_gt(pc)); + ret = 0; + goto out; + } + + mutex_lock(&pc->freq_lock); + pc->freq_ready = false; + mutex_unlock(&pc->freq_lock); + + ret = pc_action_shutdown(pc); + if (ret) + goto out; + + if (wait_for_pc_state(pc, SLPC_GLOBAL_STATE_NOT_RUNNING)) { + drm_err(&pc_to_xe(pc)->drm, "GuC PC Shutdown failed\n"); + ret = -EIO; + } + +out: + xe_device_mem_access_put(pc_to_xe(pc)); + return ret; +} + +/** + * xe_guc_pc_fini - Finalize GuC's Power Conservation component + * @pc: Xe_GuC_PC instance + */ +void xe_guc_pc_fini(struct xe_guc_pc *pc) +{ + struct xe_device *xe = pc_to_xe(pc); + + if (xe->info.skip_guc_pc) { + xe_gt_idle_disable_c6(pc_to_gt(pc)); + return; + } + + XE_WARN_ON(xe_guc_pc_gucrc_disable(pc)); + XE_WARN_ON(xe_guc_pc_stop(pc)); + mutex_destroy(&pc->freq_lock); +} + +/** + * xe_guc_pc_init - Initialize GuC's Power Conservation component + * @pc: Xe_GuC_PC instance + */ +int xe_guc_pc_init(struct xe_guc_pc *pc) +{ + struct xe_gt *gt = pc_to_gt(pc); + struct xe_tile *tile = gt_to_tile(gt); + struct xe_device *xe = gt_to_xe(gt); + struct xe_bo *bo; + u32 size = PAGE_ALIGN(sizeof(struct slpc_shared_data)); + + if (xe->info.skip_guc_pc) + return 0; + + mutex_init(&pc->freq_lock); + + bo = xe_managed_bo_create_pin_map(xe, tile, size, + XE_BO_CREATE_VRAM_IF_DGFX(tile) | + XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(bo)) + return PTR_ERR(bo); + + pc->bo = bo; + return 0; +} diff --git a/drivers/gpu/drm/xe/xe_guc_pc.h b/drivers/gpu/drm/xe/xe_guc_pc.h new file mode 100644 index 000000000000..cecad8e9300b --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_pc.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GUC_PC_H_ +#define _XE_GUC_PC_H_ + +#include "xe_guc_pc_types.h" + +int xe_guc_pc_init(struct xe_guc_pc *pc); 
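
/*
 * Rough lifecycle of the GuC PC component, inferred from xe_guc_pc.c above
 * (editorial sketch, not part of the original header):
 *
 *	xe_guc_pc_init(pc);	   allocate the SLPC shared buffer, init locks
 *	xe_guc_pc_init_early(pc);  read fused RP0/RPn, bump freq for GuC load
 *				   (caller holds XE_FW_GT forcewake)
 *	xe_guc_pc_start(pc);	   reset SLPC, restore bounds + user requests
 *	...			   runtime: getters/setters declared below
 *	xe_guc_pc_fini(pc);	   disable GuC RC, stop SLPC, free resources
 */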
+void xe_guc_pc_fini(struct xe_guc_pc *pc); +int xe_guc_pc_start(struct xe_guc_pc *pc); +int xe_guc_pc_stop(struct xe_guc_pc *pc); +int xe_guc_pc_gucrc_disable(struct xe_guc_pc *pc); + +u32 xe_guc_pc_get_act_freq(struct xe_guc_pc *pc); +int xe_guc_pc_get_cur_freq(struct xe_guc_pc *pc, u32 *freq); +u32 xe_guc_pc_get_rp0_freq(struct xe_guc_pc *pc); +u32 xe_guc_pc_get_rpe_freq(struct xe_guc_pc *pc); +u32 xe_guc_pc_get_rpn_freq(struct xe_guc_pc *pc); +int xe_guc_pc_get_min_freq(struct xe_guc_pc *pc, u32 *freq); +int xe_guc_pc_set_min_freq(struct xe_guc_pc *pc, u32 freq); +int xe_guc_pc_get_max_freq(struct xe_guc_pc *pc, u32 *freq); +int xe_guc_pc_set_max_freq(struct xe_guc_pc *pc, u32 freq); + +enum xe_gt_idle_state xe_guc_pc_c_status(struct xe_guc_pc *pc); +u64 xe_guc_pc_rc6_residency(struct xe_guc_pc *pc); +u64 xe_guc_pc_mc6_residency(struct xe_guc_pc *pc); +void xe_guc_pc_init_early(struct xe_guc_pc *pc); +#endif /* _XE_GUC_PC_H_ */ diff --git a/drivers/gpu/drm/xe/xe_guc_pc_types.h b/drivers/gpu/drm/xe/xe_guc_pc_types.h new file mode 100644 index 000000000000..2afd0dbc3542 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_pc_types.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GUC_PC_TYPES_H_ +#define _XE_GUC_PC_TYPES_H_ + +#include <linux/mutex.h> +#include <linux/types.h> + +/** + * struct xe_guc_pc - GuC Power Conservation (PC) + */ +struct xe_guc_pc { + /** @bo: GGTT buffer object that is shared with GuC PC */ + struct xe_bo *bo; + /** @rp0_freq: HW RP0 frequency - The Maximum one */ + u32 rp0_freq; + /** @rpe_freq: HW RPe frequency - The Efficient one */ + u32 rpe_freq; + /** @rpn_freq: HW RPN frequency - The Minimum one */ + u32 rpn_freq; + /** @user_requested_min: Stash the minimum requested freq by user */ + u32 user_requested_min; + /** @user_requested_max: Stash the maximum requested freq by user */ + u32 user_requested_max; + /** @freq_lock: Let's protect the frequencies */ + struct mutex freq_lock; + /** @freq_ready: Only handle freq changes, if they are really ready */ + bool freq_ready; +}; + +#endif /* _XE_GUC_PC_TYPES_H_ */ diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c new file mode 100644 index 000000000000..21ac68e3246f --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -0,0 +1,1990 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_guc_submit.h" + +#include <linux/bitfield.h> +#include <linux/bitmap.h> +#include <linux/circ_buf.h> +#include <linux/delay.h> +#include <linux/dma-fence-array.h> + +#include <drm/drm_managed.h> + +#include "abi/guc_actions_abi.h" +#include "abi/guc_klvs_abi.h" +#include "regs/xe_lrc_layout.h" +#include "xe_assert.h" +#include "xe_devcoredump.h" +#include "xe_device.h" +#include "xe_exec_queue.h" +#include "xe_force_wake.h" +#include "xe_gpu_scheduler.h" +#include "xe_gt.h" +#include "xe_guc.h" +#include "xe_guc_ct.h" +#include "xe_guc_exec_queue_types.h" +#include "xe_guc_submit_types.h" +#include "xe_hw_engine.h" +#include "xe_hw_fence.h" +#include "xe_lrc.h" +#include "xe_macros.h" +#include "xe_map.h" +#include "xe_mocs.h" +#include "xe_ring_ops_types.h" +#include "xe_sched_job.h" +#include "xe_trace.h" +#include "xe_vm.h" + +static struct xe_guc * +exec_queue_to_guc(struct xe_exec_queue *q) +{ + return &q->gt->uc.guc; +} + +/* + * Helpers for engine state, using an atomic as some of the bits can transition + * as the same time (e.g. 
a suspend can be happning at the same time as schedule + * engine done being processed). + */ +#define EXEC_QUEUE_STATE_REGISTERED (1 << 0) +#define ENGINE_STATE_ENABLED (1 << 1) +#define EXEC_QUEUE_STATE_PENDING_ENABLE (1 << 2) +#define EXEC_QUEUE_STATE_PENDING_DISABLE (1 << 3) +#define EXEC_QUEUE_STATE_DESTROYED (1 << 4) +#define ENGINE_STATE_SUSPENDED (1 << 5) +#define EXEC_QUEUE_STATE_RESET (1 << 6) +#define ENGINE_STATE_KILLED (1 << 7) + +static bool exec_queue_registered(struct xe_exec_queue *q) +{ + return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_REGISTERED; +} + +static void set_exec_queue_registered(struct xe_exec_queue *q) +{ + atomic_or(EXEC_QUEUE_STATE_REGISTERED, &q->guc->state); +} + +static void clear_exec_queue_registered(struct xe_exec_queue *q) +{ + atomic_and(~EXEC_QUEUE_STATE_REGISTERED, &q->guc->state); +} + +static bool exec_queue_enabled(struct xe_exec_queue *q) +{ + return atomic_read(&q->guc->state) & ENGINE_STATE_ENABLED; +} + +static void set_exec_queue_enabled(struct xe_exec_queue *q) +{ + atomic_or(ENGINE_STATE_ENABLED, &q->guc->state); +} + +static void clear_exec_queue_enabled(struct xe_exec_queue *q) +{ + atomic_and(~ENGINE_STATE_ENABLED, &q->guc->state); +} + +static bool exec_queue_pending_enable(struct xe_exec_queue *q) +{ + return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_PENDING_ENABLE; +} + +static void set_exec_queue_pending_enable(struct xe_exec_queue *q) +{ + atomic_or(EXEC_QUEUE_STATE_PENDING_ENABLE, &q->guc->state); +} + +static void clear_exec_queue_pending_enable(struct xe_exec_queue *q) +{ + atomic_and(~EXEC_QUEUE_STATE_PENDING_ENABLE, &q->guc->state); +} + +static bool exec_queue_pending_disable(struct xe_exec_queue *q) +{ + return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_PENDING_DISABLE; +} + +static void set_exec_queue_pending_disable(struct xe_exec_queue *q) +{ + atomic_or(EXEC_QUEUE_STATE_PENDING_DISABLE, &q->guc->state); +} + +static void clear_exec_queue_pending_disable(struct xe_exec_queue *q) +{ + atomic_and(~EXEC_QUEUE_STATE_PENDING_DISABLE, &q->guc->state); +} + +static bool exec_queue_destroyed(struct xe_exec_queue *q) +{ + return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_DESTROYED; +} + +static void set_exec_queue_destroyed(struct xe_exec_queue *q) +{ + atomic_or(EXEC_QUEUE_STATE_DESTROYED, &q->guc->state); +} + +static bool exec_queue_banned(struct xe_exec_queue *q) +{ + return (q->flags & EXEC_QUEUE_FLAG_BANNED); +} + +static void set_exec_queue_banned(struct xe_exec_queue *q) +{ + q->flags |= EXEC_QUEUE_FLAG_BANNED; +} + +static bool exec_queue_suspended(struct xe_exec_queue *q) +{ + return atomic_read(&q->guc->state) & ENGINE_STATE_SUSPENDED; +} + +static void set_exec_queue_suspended(struct xe_exec_queue *q) +{ + atomic_or(ENGINE_STATE_SUSPENDED, &q->guc->state); +} + +static void clear_exec_queue_suspended(struct xe_exec_queue *q) +{ + atomic_and(~ENGINE_STATE_SUSPENDED, &q->guc->state); +} + +static bool exec_queue_reset(struct xe_exec_queue *q) +{ + return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_RESET; +} + +static void set_exec_queue_reset(struct xe_exec_queue *q) +{ + atomic_or(EXEC_QUEUE_STATE_RESET, &q->guc->state); +} + +static bool exec_queue_killed(struct xe_exec_queue *q) +{ + return atomic_read(&q->guc->state) & ENGINE_STATE_KILLED; +} + +static void set_exec_queue_killed(struct xe_exec_queue *q) +{ + atomic_or(ENGINE_STATE_KILLED, &q->guc->state); +} + +static bool exec_queue_killed_or_banned(struct xe_exec_queue *q) +{ + return exec_queue_killed(q) || exec_queue_banned(q); +} + 
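/*
 * Editorial example (not part of the original patch): the helpers above are
 * combined into compound checks throughout this file. For instance, the
 * deregister-done G2H handler further down expects roughly the following to
 * hold before it drops the final reference on, or finalizes, a queue.
 */
static bool __maybe_unused
example_exec_queue_ready_for_deregister(struct xe_exec_queue *q)
{
	return exec_queue_destroyed(q) &&
	       !exec_queue_enabled(q) &&
	       !exec_queue_pending_enable(q) &&
	       !exec_queue_pending_disable(q);
}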
+#ifdef CONFIG_PROVE_LOCKING +static int alloc_submit_wq(struct xe_guc *guc) +{ + int i; + + for (i = 0; i < NUM_SUBMIT_WQ; ++i) { + guc->submission_state.submit_wq_pool[i] = + alloc_ordered_workqueue("submit_wq", 0); + if (!guc->submission_state.submit_wq_pool[i]) + goto err_free; + } + + return 0; + +err_free: + while (i) + destroy_workqueue(guc->submission_state.submit_wq_pool[--i]); + + return -ENOMEM; +} + +static void free_submit_wq(struct xe_guc *guc) +{ + int i; + + for (i = 0; i < NUM_SUBMIT_WQ; ++i) + destroy_workqueue(guc->submission_state.submit_wq_pool[i]); +} + +static struct workqueue_struct *get_submit_wq(struct xe_guc *guc) +{ + int idx = guc->submission_state.submit_wq_idx++ % NUM_SUBMIT_WQ; + + return guc->submission_state.submit_wq_pool[idx]; +} +#else +static int alloc_submit_wq(struct xe_guc *guc) +{ + return 0; +} + +static void free_submit_wq(struct xe_guc *guc) +{ + +} + +static struct workqueue_struct *get_submit_wq(struct xe_guc *guc) +{ + return NULL; +} +#endif + +static void guc_submit_fini(struct drm_device *drm, void *arg) +{ + struct xe_guc *guc = arg; + + xa_destroy(&guc->submission_state.exec_queue_lookup); + ida_destroy(&guc->submission_state.guc_ids); + bitmap_free(guc->submission_state.guc_ids_bitmap); + free_submit_wq(guc); + mutex_destroy(&guc->submission_state.lock); +} + +#define GUC_ID_MAX 65535 +#define GUC_ID_NUMBER_MLRC 4096 +#define GUC_ID_NUMBER_SLRC (GUC_ID_MAX - GUC_ID_NUMBER_MLRC) +#define GUC_ID_START_MLRC GUC_ID_NUMBER_SLRC + +static const struct xe_exec_queue_ops guc_exec_queue_ops; + +static void primelockdep(struct xe_guc *guc) +{ + if (!IS_ENABLED(CONFIG_LOCKDEP)) + return; + + fs_reclaim_acquire(GFP_KERNEL); + + mutex_lock(&guc->submission_state.lock); + might_lock(&guc->submission_state.suspend.lock); + mutex_unlock(&guc->submission_state.lock); + + fs_reclaim_release(GFP_KERNEL); +} + +int xe_guc_submit_init(struct xe_guc *guc) +{ + struct xe_device *xe = guc_to_xe(guc); + struct xe_gt *gt = guc_to_gt(guc); + int err; + + guc->submission_state.guc_ids_bitmap = + bitmap_zalloc(GUC_ID_NUMBER_MLRC, GFP_KERNEL); + if (!guc->submission_state.guc_ids_bitmap) + return -ENOMEM; + + err = alloc_submit_wq(guc); + if (err) { + bitmap_free(guc->submission_state.guc_ids_bitmap); + return err; + } + + gt->exec_queue_ops = &guc_exec_queue_ops; + + mutex_init(&guc->submission_state.lock); + xa_init(&guc->submission_state.exec_queue_lookup); + ida_init(&guc->submission_state.guc_ids); + + spin_lock_init(&guc->submission_state.suspend.lock); + guc->submission_state.suspend.context = dma_fence_context_alloc(1); + + primelockdep(guc); + + err = drmm_add_action_or_reset(&xe->drm, guc_submit_fini, guc); + if (err) + return err; + + return 0; +} + +static void __release_guc_id(struct xe_guc *guc, struct xe_exec_queue *q, u32 xa_count) +{ + int i; + + lockdep_assert_held(&guc->submission_state.lock); + + for (i = 0; i < xa_count; ++i) + xa_erase(&guc->submission_state.exec_queue_lookup, q->guc->id + i); + + if (xe_exec_queue_is_parallel(q)) + bitmap_release_region(guc->submission_state.guc_ids_bitmap, + q->guc->id - GUC_ID_START_MLRC, + order_base_2(q->width)); + else + ida_simple_remove(&guc->submission_state.guc_ids, q->guc->id); +} + +static int alloc_guc_id(struct xe_guc *guc, struct xe_exec_queue *q) +{ + int ret; + void *ptr; + int i; + + /* + * Must use GFP_NOWAIT as this lock is in the dma fence signalling path, + * worse case user gets -ENOMEM on engine create and has to try again. 
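	 *
	 * Editorial example of the ID-space split defined above: a parallel
	 * queue of width 3 reserves a 1 << order_base_2(3) = 4 id region
	 * from the MLRC bitmap, its base id is offset by GUC_ID_START_MLRC,
	 * and every id in [id, id + width) maps back to the same queue in
	 * exec_queue_lookup. Single-LRC queues simply take one id from the
	 * [0, GUC_ID_NUMBER_SLRC) ida instead.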
+ * + * FIXME: Have caller pre-alloc or post-alloc /w GFP_KERNEL to prevent + * failure. + */ + lockdep_assert_held(&guc->submission_state.lock); + + if (xe_exec_queue_is_parallel(q)) { + void *bitmap = guc->submission_state.guc_ids_bitmap; + + ret = bitmap_find_free_region(bitmap, GUC_ID_NUMBER_MLRC, + order_base_2(q->width)); + } else { + ret = ida_simple_get(&guc->submission_state.guc_ids, 0, + GUC_ID_NUMBER_SLRC, GFP_NOWAIT); + } + if (ret < 0) + return ret; + + q->guc->id = ret; + if (xe_exec_queue_is_parallel(q)) + q->guc->id += GUC_ID_START_MLRC; + + for (i = 0; i < q->width; ++i) { + ptr = xa_store(&guc->submission_state.exec_queue_lookup, + q->guc->id + i, q, GFP_NOWAIT); + if (IS_ERR(ptr)) { + ret = PTR_ERR(ptr); + goto err_release; + } + } + + return 0; + +err_release: + __release_guc_id(guc, q, i); + + return ret; +} + +static void release_guc_id(struct xe_guc *guc, struct xe_exec_queue *q) +{ + mutex_lock(&guc->submission_state.lock); + __release_guc_id(guc, q, q->width); + mutex_unlock(&guc->submission_state.lock); +} + +struct exec_queue_policy { + u32 count; + struct guc_update_exec_queue_policy h2g; +}; + +static u32 __guc_exec_queue_policy_action_size(struct exec_queue_policy *policy) +{ + size_t bytes = sizeof(policy->h2g.header) + + (sizeof(policy->h2g.klv[0]) * policy->count); + + return bytes / sizeof(u32); +} + +static void __guc_exec_queue_policy_start_klv(struct exec_queue_policy *policy, + u16 guc_id) +{ + policy->h2g.header.action = + XE_GUC_ACTION_HOST2GUC_UPDATE_CONTEXT_POLICIES; + policy->h2g.header.guc_id = guc_id; + policy->count = 0; +} + +#define MAKE_EXEC_QUEUE_POLICY_ADD(func, id) \ +static void __guc_exec_queue_policy_add_##func(struct exec_queue_policy *policy, \ + u32 data) \ +{ \ + XE_WARN_ON(policy->count >= GUC_CONTEXT_POLICIES_KLV_NUM_IDS); \ +\ + policy->h2g.klv[policy->count].kl = \ + FIELD_PREP(GUC_KLV_0_KEY, \ + GUC_CONTEXT_POLICIES_KLV_ID_##id) | \ + FIELD_PREP(GUC_KLV_0_LEN, 1); \ + policy->h2g.klv[policy->count].value = data; \ + policy->count++; \ +} + +MAKE_EXEC_QUEUE_POLICY_ADD(execution_quantum, EXECUTION_QUANTUM) +MAKE_EXEC_QUEUE_POLICY_ADD(preemption_timeout, PREEMPTION_TIMEOUT) +MAKE_EXEC_QUEUE_POLICY_ADD(priority, SCHEDULING_PRIORITY) +#undef MAKE_EXEC_QUEUE_POLICY_ADD + +static const int xe_exec_queue_prio_to_guc[] = { + [XE_EXEC_QUEUE_PRIORITY_LOW] = GUC_CLIENT_PRIORITY_NORMAL, + [XE_EXEC_QUEUE_PRIORITY_NORMAL] = GUC_CLIENT_PRIORITY_KMD_NORMAL, + [XE_EXEC_QUEUE_PRIORITY_HIGH] = GUC_CLIENT_PRIORITY_HIGH, + [XE_EXEC_QUEUE_PRIORITY_KERNEL] = GUC_CLIENT_PRIORITY_KMD_HIGH, +}; + +static void init_policies(struct xe_guc *guc, struct xe_exec_queue *q) +{ + struct exec_queue_policy policy; + struct xe_device *xe = guc_to_xe(guc); + enum xe_exec_queue_priority prio = q->priority; + u32 timeslice_us = q->sched_props.timeslice_us; + u32 preempt_timeout_us = q->sched_props.preempt_timeout_us; + + xe_assert(xe, exec_queue_registered(q)); + + __guc_exec_queue_policy_start_klv(&policy, q->guc->id); + __guc_exec_queue_policy_add_priority(&policy, xe_exec_queue_prio_to_guc[prio]); + __guc_exec_queue_policy_add_execution_quantum(&policy, timeslice_us); + __guc_exec_queue_policy_add_preemption_timeout(&policy, preempt_timeout_us); + + xe_guc_ct_send(&guc->ct, (u32 *)&policy.h2g, + __guc_exec_queue_policy_action_size(&policy), 0, 0); +} + +static void set_min_preemption_timeout(struct xe_guc *guc, struct xe_exec_queue *q) +{ + struct exec_queue_policy policy; + + __guc_exec_queue_policy_start_klv(&policy, q->guc->id); + 
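	/*
	 * Editorial note: a 1 us preemption timeout effectively means
	 * "preempt immediately"; this helper is only used on the teardown
	 * path (disable_scheduling_deregister()) so the context is kicked
	 * off the hardware as quickly as possible before it is deregistered.
	 */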
__guc_exec_queue_policy_add_preemption_timeout(&policy, 1); + + xe_guc_ct_send(&guc->ct, (u32 *)&policy.h2g, + __guc_exec_queue_policy_action_size(&policy), 0, 0); +} + +#define parallel_read(xe_, map_, field_) \ + xe_map_rd_field(xe_, &map_, 0, struct guc_submit_parallel_scratch, \ + field_) +#define parallel_write(xe_, map_, field_, val_) \ + xe_map_wr_field(xe_, &map_, 0, struct guc_submit_parallel_scratch, \ + field_, val_) + +static void __register_mlrc_engine(struct xe_guc *guc, + struct xe_exec_queue *q, + struct guc_ctxt_registration_info *info) +{ +#define MAX_MLRC_REG_SIZE (13 + XE_HW_ENGINE_MAX_INSTANCE * 2) + struct xe_device *xe = guc_to_xe(guc); + u32 action[MAX_MLRC_REG_SIZE]; + int len = 0; + int i; + + xe_assert(xe, xe_exec_queue_is_parallel(q)); + + action[len++] = XE_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC; + action[len++] = info->flags; + action[len++] = info->context_idx; + action[len++] = info->engine_class; + action[len++] = info->engine_submit_mask; + action[len++] = info->wq_desc_lo; + action[len++] = info->wq_desc_hi; + action[len++] = info->wq_base_lo; + action[len++] = info->wq_base_hi; + action[len++] = info->wq_size; + action[len++] = q->width; + action[len++] = info->hwlrca_lo; + action[len++] = info->hwlrca_hi; + + for (i = 1; i < q->width; ++i) { + struct xe_lrc *lrc = q->lrc + i; + + action[len++] = lower_32_bits(xe_lrc_descriptor(lrc)); + action[len++] = upper_32_bits(xe_lrc_descriptor(lrc)); + } + + xe_assert(xe, len <= MAX_MLRC_REG_SIZE); +#undef MAX_MLRC_REG_SIZE + + xe_guc_ct_send(&guc->ct, action, len, 0, 0); +} + +static void __register_engine(struct xe_guc *guc, + struct guc_ctxt_registration_info *info) +{ + u32 action[] = { + XE_GUC_ACTION_REGISTER_CONTEXT, + info->flags, + info->context_idx, + info->engine_class, + info->engine_submit_mask, + info->wq_desc_lo, + info->wq_desc_hi, + info->wq_base_lo, + info->wq_base_hi, + info->wq_size, + info->hwlrca_lo, + info->hwlrca_hi, + }; + + xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action), 0, 0); +} + +static void register_engine(struct xe_exec_queue *q) +{ + struct xe_guc *guc = exec_queue_to_guc(q); + struct xe_device *xe = guc_to_xe(guc); + struct xe_lrc *lrc = q->lrc; + struct guc_ctxt_registration_info info; + + xe_assert(xe, !exec_queue_registered(q)); + + memset(&info, 0, sizeof(info)); + info.context_idx = q->guc->id; + info.engine_class = xe_engine_class_to_guc_class(q->class); + info.engine_submit_mask = q->logical_mask; + info.hwlrca_lo = lower_32_bits(xe_lrc_descriptor(lrc)); + info.hwlrca_hi = upper_32_bits(xe_lrc_descriptor(lrc)); + info.flags = CONTEXT_REGISTRATION_FLAG_KMD; + + if (xe_exec_queue_is_parallel(q)) { + u32 ggtt_addr = xe_lrc_parallel_ggtt_addr(lrc); + struct iosys_map map = xe_lrc_parallel_map(lrc); + + info.wq_desc_lo = lower_32_bits(ggtt_addr + + offsetof(struct guc_submit_parallel_scratch, wq_desc)); + info.wq_desc_hi = upper_32_bits(ggtt_addr + + offsetof(struct guc_submit_parallel_scratch, wq_desc)); + info.wq_base_lo = lower_32_bits(ggtt_addr + + offsetof(struct guc_submit_parallel_scratch, wq[0])); + info.wq_base_hi = upper_32_bits(ggtt_addr + + offsetof(struct guc_submit_parallel_scratch, wq[0])); + info.wq_size = WQ_SIZE; + + q->guc->wqi_head = 0; + q->guc->wqi_tail = 0; + xe_map_memset(xe, &map, 0, 0, PARALLEL_SCRATCH_SIZE - WQ_SIZE); + parallel_write(xe, map, wq_desc.wq_status, WQ_STATUS_ACTIVE); + } + + /* + * We must keep a reference for LR engines if engine is registered with + * the GuC as jobs signal immediately and can't destroy an engine if the + * GuC has 
a reference to it. + */ + if (xe_exec_queue_is_lr(q)) + xe_exec_queue_get(q); + + set_exec_queue_registered(q); + trace_xe_exec_queue_register(q); + if (xe_exec_queue_is_parallel(q)) + __register_mlrc_engine(guc, q, &info); + else + __register_engine(guc, &info); + init_policies(guc, q); +} + +static u32 wq_space_until_wrap(struct xe_exec_queue *q) +{ + return (WQ_SIZE - q->guc->wqi_tail); +} + +static int wq_wait_for_space(struct xe_exec_queue *q, u32 wqi_size) +{ + struct xe_guc *guc = exec_queue_to_guc(q); + struct xe_device *xe = guc_to_xe(guc); + struct iosys_map map = xe_lrc_parallel_map(q->lrc); + unsigned int sleep_period_ms = 1; + +#define AVAILABLE_SPACE \ + CIRC_SPACE(q->guc->wqi_tail, q->guc->wqi_head, WQ_SIZE) + if (wqi_size > AVAILABLE_SPACE) { +try_again: + q->guc->wqi_head = parallel_read(xe, map, wq_desc.head); + if (wqi_size > AVAILABLE_SPACE) { + if (sleep_period_ms == 1024) { + xe_gt_reset_async(q->gt); + return -ENODEV; + } + + msleep(sleep_period_ms); + sleep_period_ms <<= 1; + goto try_again; + } + } +#undef AVAILABLE_SPACE + + return 0; +} + +static int wq_noop_append(struct xe_exec_queue *q) +{ + struct xe_guc *guc = exec_queue_to_guc(q); + struct xe_device *xe = guc_to_xe(guc); + struct iosys_map map = xe_lrc_parallel_map(q->lrc); + u32 len_dw = wq_space_until_wrap(q) / sizeof(u32) - 1; + + if (wq_wait_for_space(q, wq_space_until_wrap(q))) + return -ENODEV; + + xe_assert(xe, FIELD_FIT(WQ_LEN_MASK, len_dw)); + + parallel_write(xe, map, wq[q->guc->wqi_tail / sizeof(u32)], + FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_NOOP) | + FIELD_PREP(WQ_LEN_MASK, len_dw)); + q->guc->wqi_tail = 0; + + return 0; +} + +static void wq_item_append(struct xe_exec_queue *q) +{ + struct xe_guc *guc = exec_queue_to_guc(q); + struct xe_device *xe = guc_to_xe(guc); + struct iosys_map map = xe_lrc_parallel_map(q->lrc); +#define WQ_HEADER_SIZE 4 /* Includes 1 LRC address too */ + u32 wqi[XE_HW_ENGINE_MAX_INSTANCE + (WQ_HEADER_SIZE - 1)]; + u32 wqi_size = (q->width + (WQ_HEADER_SIZE - 1)) * sizeof(u32); + u32 len_dw = (wqi_size / sizeof(u32)) - 1; + int i = 0, j; + + if (wqi_size > wq_space_until_wrap(q)) { + if (wq_noop_append(q)) + return; + } + if (wq_wait_for_space(q, wqi_size)) + return; + + wqi[i++] = FIELD_PREP(WQ_TYPE_MASK, WQ_TYPE_MULTI_LRC) | + FIELD_PREP(WQ_LEN_MASK, len_dw); + wqi[i++] = xe_lrc_descriptor(q->lrc); + wqi[i++] = FIELD_PREP(WQ_GUC_ID_MASK, q->guc->id) | + FIELD_PREP(WQ_RING_TAIL_MASK, q->lrc->ring.tail / sizeof(u64)); + wqi[i++] = 0; + for (j = 1; j < q->width; ++j) { + struct xe_lrc *lrc = q->lrc + j; + + wqi[i++] = lrc->ring.tail / sizeof(u64); + } + + xe_assert(xe, i == wqi_size / sizeof(u32)); + + iosys_map_incr(&map, offsetof(struct guc_submit_parallel_scratch, + wq[q->guc->wqi_tail / sizeof(u32)])); + xe_map_memcpy_to(xe, &map, 0, wqi, wqi_size); + q->guc->wqi_tail += wqi_size; + xe_assert(xe, q->guc->wqi_tail <= WQ_SIZE); + + xe_device_wmb(xe); + + map = xe_lrc_parallel_map(q->lrc); + parallel_write(xe, map, wq_desc.tail, q->guc->wqi_tail); +} + +#define RESUME_PENDING ~0x0ull +static void submit_exec_queue(struct xe_exec_queue *q) +{ + struct xe_guc *guc = exec_queue_to_guc(q); + struct xe_device *xe = guc_to_xe(guc); + struct xe_lrc *lrc = q->lrc; + u32 action[3]; + u32 g2h_len = 0; + u32 num_g2h = 0; + int len = 0; + bool extra_submit = false; + + xe_assert(xe, exec_queue_registered(q)); + + if (xe_exec_queue_is_parallel(q)) + wq_item_append(q); + else + xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail); + + if (exec_queue_suspended(q) && 
!xe_exec_queue_is_parallel(q)) + return; + + if (!exec_queue_enabled(q) && !exec_queue_suspended(q)) { + action[len++] = XE_GUC_ACTION_SCHED_CONTEXT_MODE_SET; + action[len++] = q->guc->id; + action[len++] = GUC_CONTEXT_ENABLE; + g2h_len = G2H_LEN_DW_SCHED_CONTEXT_MODE_SET; + num_g2h = 1; + if (xe_exec_queue_is_parallel(q)) + extra_submit = true; + + q->guc->resume_time = RESUME_PENDING; + set_exec_queue_pending_enable(q); + set_exec_queue_enabled(q); + trace_xe_exec_queue_scheduling_enable(q); + } else { + action[len++] = XE_GUC_ACTION_SCHED_CONTEXT; + action[len++] = q->guc->id; + trace_xe_exec_queue_submit(q); + } + + xe_guc_ct_send(&guc->ct, action, len, g2h_len, num_g2h); + + if (extra_submit) { + len = 0; + action[len++] = XE_GUC_ACTION_SCHED_CONTEXT; + action[len++] = q->guc->id; + trace_xe_exec_queue_submit(q); + + xe_guc_ct_send(&guc->ct, action, len, 0, 0); + } +} + +static struct dma_fence * +guc_exec_queue_run_job(struct drm_sched_job *drm_job) +{ + struct xe_sched_job *job = to_xe_sched_job(drm_job); + struct xe_exec_queue *q = job->q; + struct xe_guc *guc = exec_queue_to_guc(q); + struct xe_device *xe = guc_to_xe(guc); + bool lr = xe_exec_queue_is_lr(q); + + xe_assert(xe, !(exec_queue_destroyed(q) || exec_queue_pending_disable(q)) || + exec_queue_banned(q) || exec_queue_suspended(q)); + + trace_xe_sched_job_run(job); + + if (!exec_queue_killed_or_banned(q) && !xe_sched_job_is_error(job)) { + if (!exec_queue_registered(q)) + register_engine(q); + if (!lr) /* LR jobs are emitted in the exec IOCTL */ + q->ring_ops->emit_job(job); + submit_exec_queue(q); + } + + if (lr) { + xe_sched_job_set_error(job, -EOPNOTSUPP); + return NULL; + } else if (test_and_set_bit(JOB_FLAG_SUBMIT, &job->fence->flags)) { + return job->fence; + } else { + return dma_fence_get(job->fence); + } +} + +static void guc_exec_queue_free_job(struct drm_sched_job *drm_job) +{ + struct xe_sched_job *job = to_xe_sched_job(drm_job); + + trace_xe_sched_job_free(job); + xe_sched_job_put(job); +} + +static int guc_read_stopped(struct xe_guc *guc) +{ + return atomic_read(&guc->submission_state.stopped); +} + +#define MAKE_SCHED_CONTEXT_ACTION(q, enable_disable) \ + u32 action[] = { \ + XE_GUC_ACTION_SCHED_CONTEXT_MODE_SET, \ + q->guc->id, \ + GUC_CONTEXT_##enable_disable, \ + } + +static void disable_scheduling_deregister(struct xe_guc *guc, + struct xe_exec_queue *q) +{ + MAKE_SCHED_CONTEXT_ACTION(q, DISABLE); + struct xe_device *xe = guc_to_xe(guc); + int ret; + + set_min_preemption_timeout(guc, q); + smp_rmb(); + ret = wait_event_timeout(guc->ct.wq, !exec_queue_pending_enable(q) || + guc_read_stopped(guc), HZ * 5); + if (!ret) { + struct xe_gpu_scheduler *sched = &q->guc->sched; + + drm_warn(&xe->drm, "Pending enable failed to respond"); + xe_sched_submission_start(sched); + xe_gt_reset_async(q->gt); + xe_sched_tdr_queue_imm(sched); + return; + } + + clear_exec_queue_enabled(q); + set_exec_queue_pending_disable(q); + set_exec_queue_destroyed(q); + trace_xe_exec_queue_scheduling_disable(q); + + /* + * Reserve space for both G2H here as the 2nd G2H is sent from a G2H + * handler and we are not allowed to reserved G2H space in handlers. 
+ */ + xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action), + G2H_LEN_DW_SCHED_CONTEXT_MODE_SET + + G2H_LEN_DW_DEREGISTER_CONTEXT, 2); +} + +static void guc_exec_queue_print(struct xe_exec_queue *q, struct drm_printer *p); + +#if IS_ENABLED(CONFIG_DRM_XE_SIMPLE_ERROR_CAPTURE) +static void simple_error_capture(struct xe_exec_queue *q) +{ + struct xe_guc *guc = exec_queue_to_guc(q); + struct drm_printer p = drm_err_printer(""); + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + u32 adj_logical_mask = q->logical_mask; + u32 width_mask = (0x1 << q->width) - 1; + int i; + bool cookie; + + if (q->vm && !q->vm->error_capture.capture_once) { + q->vm->error_capture.capture_once = true; + cookie = dma_fence_begin_signalling(); + for (i = 0; q->width > 1 && i < XE_HW_ENGINE_MAX_INSTANCE;) { + if (adj_logical_mask & BIT(i)) { + adj_logical_mask |= width_mask << i; + i += q->width; + } else { + ++i; + } + } + + xe_force_wake_get(gt_to_fw(guc_to_gt(guc)), XE_FORCEWAKE_ALL); + xe_guc_ct_print(&guc->ct, &p, true); + guc_exec_queue_print(q, &p); + for_each_hw_engine(hwe, guc_to_gt(guc), id) { + if (hwe->class != q->hwe->class || + !(BIT(hwe->logical_instance) & adj_logical_mask)) + continue; + xe_hw_engine_print(hwe, &p); + } + xe_analyze_vm(&p, q->vm, q->gt->info.id); + xe_force_wake_put(gt_to_fw(guc_to_gt(guc)), XE_FORCEWAKE_ALL); + dma_fence_end_signalling(cookie); + } +} +#else +static void simple_error_capture(struct xe_exec_queue *q) +{ +} +#endif + +static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q) +{ + struct xe_guc *guc = exec_queue_to_guc(q); + struct xe_device *xe = guc_to_xe(guc); + + /** to wakeup xe_wait_user_fence ioctl if exec queue is reset */ + wake_up_all(&xe->ufence_wq); + + if (xe_exec_queue_is_lr(q)) + queue_work(guc_to_gt(guc)->ordered_wq, &q->guc->lr_tdr); + else + xe_sched_tdr_queue_imm(&q->guc->sched); +} + +static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w) +{ + struct xe_guc_exec_queue *ge = + container_of(w, struct xe_guc_exec_queue, lr_tdr); + struct xe_exec_queue *q = ge->q; + struct xe_guc *guc = exec_queue_to_guc(q); + struct xe_device *xe = guc_to_xe(guc); + struct xe_gpu_scheduler *sched = &ge->sched; + + xe_assert(xe, xe_exec_queue_is_lr(q)); + trace_xe_exec_queue_lr_cleanup(q); + + /* Kill the run_job / process_msg entry points */ + xe_sched_submission_stop(sched); + + /* + * Engine state now mostly stable, disable scheduling / deregister if + * needed. This cleanup routine might be called multiple times, where + * the actual async engine deregister drops the final engine ref. + * Calling disable_scheduling_deregister will mark the engine as + * destroyed and fire off the CT requests to disable scheduling / + * deregister, which we only want to do once. We also don't want to mark + * the engine as pending_disable again as this may race with the + * xe_guc_deregister_done_handler() which treats it as an unexpected + * state. + */ + if (exec_queue_registered(q) && !exec_queue_destroyed(q)) { + struct xe_guc *guc = exec_queue_to_guc(q); + int ret; + + set_exec_queue_banned(q); + disable_scheduling_deregister(guc, q); + + /* + * Must wait for scheduling to be disabled before signalling + * any fences, if GT broken the GT reset code should signal us. 
+ */ + ret = wait_event_timeout(guc->ct.wq, + !exec_queue_pending_disable(q) || + guc_read_stopped(guc), HZ * 5); + if (!ret) { + drm_warn(&xe->drm, "Schedule disable failed to respond"); + xe_sched_submission_start(sched); + xe_gt_reset_async(q->gt); + return; + } + } + + xe_sched_submission_start(sched); +} + +static enum drm_gpu_sched_stat +guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) +{ + struct xe_sched_job *job = to_xe_sched_job(drm_job); + struct xe_sched_job *tmp_job; + struct xe_exec_queue *q = job->q; + struct xe_gpu_scheduler *sched = &q->guc->sched; + struct xe_device *xe = guc_to_xe(exec_queue_to_guc(q)); + int err = -ETIME; + int i = 0; + + if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags)) { + xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_KERNEL)); + xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q))); + + drm_notice(&xe->drm, "Timedout job: seqno=%u, guc_id=%d, flags=0x%lx", + xe_sched_job_seqno(job), q->guc->id, q->flags); + simple_error_capture(q); + xe_devcoredump(q); + } else { + drm_dbg(&xe->drm, "Timedout signaled job: seqno=%u, guc_id=%d, flags=0x%lx", + xe_sched_job_seqno(job), q->guc->id, q->flags); + } + trace_xe_sched_job_timedout(job); + + /* Kill the run_job entry point */ + xe_sched_submission_stop(sched); + + /* + * Kernel jobs should never fail, nor should VM jobs if they do + * somethings has gone wrong and the GT needs a reset + */ + if (q->flags & EXEC_QUEUE_FLAG_KERNEL || + (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q))) { + if (!xe_sched_invalidate_job(job, 2)) { + xe_sched_add_pending_job(sched, job); + xe_sched_submission_start(sched); + xe_gt_reset_async(q->gt); + goto out; + } + } + + /* Engine state now stable, disable scheduling if needed */ + if (exec_queue_registered(q)) { + struct xe_guc *guc = exec_queue_to_guc(q); + int ret; + + if (exec_queue_reset(q)) + err = -EIO; + set_exec_queue_banned(q); + if (!exec_queue_destroyed(q)) { + xe_exec_queue_get(q); + disable_scheduling_deregister(guc, q); + } + + /* + * Must wait for scheduling to be disabled before signalling + * any fences, if GT broken the GT reset code should signal us. + * + * FIXME: Tests can generate a ton of 0x6000 (IOMMU CAT fault + * error) messages which can cause the schedule disable to get + * lost. If this occurs, trigger a GT reset to recover. + */ + smp_rmb(); + ret = wait_event_timeout(guc->ct.wq, + !exec_queue_pending_disable(q) || + guc_read_stopped(guc), HZ * 5); + if (!ret || guc_read_stopped(guc)) { + drm_warn(&xe->drm, "Schedule disable failed to respond"); + xe_sched_add_pending_job(sched, job); + xe_sched_submission_start(sched); + xe_gt_reset_async(q->gt); + xe_sched_tdr_queue_imm(sched); + goto out; + } + } + + /* Stop fence signaling */ + xe_hw_fence_irq_stop(q->fence_irq); + + /* + * Fence state now stable, stop / start scheduler which cleans up any + * fences that are complete + */ + xe_sched_add_pending_job(sched, job); + xe_sched_submission_start(sched); + xe_guc_exec_queue_trigger_cleanup(q); + + /* Mark all outstanding jobs as bad, thus completing them */ + spin_lock(&sched->base.job_list_lock); + list_for_each_entry(tmp_job, &sched->base.pending_list, drm.list) + xe_sched_job_set_error(tmp_job, !i++ ? 
err : -ECANCELED); + spin_unlock(&sched->base.job_list_lock); + + /* Start fence signaling */ + xe_hw_fence_irq_start(q->fence_irq); + +out: + return DRM_GPU_SCHED_STAT_NOMINAL; +} + +static void __guc_exec_queue_fini_async(struct work_struct *w) +{ + struct xe_guc_exec_queue *ge = + container_of(w, struct xe_guc_exec_queue, fini_async); + struct xe_exec_queue *q = ge->q; + struct xe_guc *guc = exec_queue_to_guc(q); + + trace_xe_exec_queue_destroy(q); + + if (xe_exec_queue_is_lr(q)) + cancel_work_sync(&ge->lr_tdr); + if (q->flags & EXEC_QUEUE_FLAG_PERSISTENT) + xe_device_remove_persistent_exec_queues(gt_to_xe(q->gt), q); + release_guc_id(guc, q); + xe_sched_entity_fini(&ge->entity); + xe_sched_fini(&ge->sched); + + kfree(ge); + xe_exec_queue_fini(q); +} + +static void guc_exec_queue_fini_async(struct xe_exec_queue *q) +{ + INIT_WORK(&q->guc->fini_async, __guc_exec_queue_fini_async); + + /* We must block on kernel engines so slabs are empty on driver unload */ + if (q->flags & EXEC_QUEUE_FLAG_PERMANENT) + __guc_exec_queue_fini_async(&q->guc->fini_async); + else + queue_work(system_wq, &q->guc->fini_async); +} + +static void __guc_exec_queue_fini(struct xe_guc *guc, struct xe_exec_queue *q) +{ + /* + * Might be done from within the GPU scheduler, need to do async as we + * fini the scheduler when the engine is fini'd, the scheduler can't + * complete fini within itself (circular dependency). Async resolves + * this we and don't really care when everything is fini'd, just that it + * is. + */ + guc_exec_queue_fini_async(q); +} + +static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg) +{ + struct xe_exec_queue *q = msg->private_data; + struct xe_guc *guc = exec_queue_to_guc(q); + struct xe_device *xe = guc_to_xe(guc); + + xe_assert(xe, !(q->flags & EXEC_QUEUE_FLAG_PERMANENT)); + trace_xe_exec_queue_cleanup_entity(q); + + if (exec_queue_registered(q)) + disable_scheduling_deregister(guc, q); + else + __guc_exec_queue_fini(guc, q); +} + +static bool guc_exec_queue_allowed_to_change_state(struct xe_exec_queue *q) +{ + return !exec_queue_killed_or_banned(q) && exec_queue_registered(q); +} + +static void __guc_exec_queue_process_msg_set_sched_props(struct xe_sched_msg *msg) +{ + struct xe_exec_queue *q = msg->private_data; + struct xe_guc *guc = exec_queue_to_guc(q); + + if (guc_exec_queue_allowed_to_change_state(q)) + init_policies(guc, q); + kfree(msg); +} + +static void suspend_fence_signal(struct xe_exec_queue *q) +{ + struct xe_guc *guc = exec_queue_to_guc(q); + struct xe_device *xe = guc_to_xe(guc); + + xe_assert(xe, exec_queue_suspended(q) || exec_queue_killed(q) || + guc_read_stopped(guc)); + xe_assert(xe, q->guc->suspend_pending); + + q->guc->suspend_pending = false; + smp_wmb(); + wake_up(&q->guc->suspend_wait); +} + +static void __guc_exec_queue_process_msg_suspend(struct xe_sched_msg *msg) +{ + struct xe_exec_queue *q = msg->private_data; + struct xe_guc *guc = exec_queue_to_guc(q); + + if (guc_exec_queue_allowed_to_change_state(q) && !exec_queue_suspended(q) && + exec_queue_enabled(q)) { + wait_event(guc->ct.wq, q->guc->resume_time != RESUME_PENDING || + guc_read_stopped(guc)); + + if (!guc_read_stopped(guc)) { + MAKE_SCHED_CONTEXT_ACTION(q, DISABLE); + s64 since_resume_ms = + ktime_ms_delta(ktime_get(), + q->guc->resume_time); + s64 wait_ms = q->vm->preempt.min_run_period_ms - + since_resume_ms; + + if (wait_ms > 0 && q->guc->resume_time) + msleep(wait_ms); + + set_exec_queue_suspended(q); + clear_exec_queue_enabled(q); + set_exec_queue_pending_disable(q); + 
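			/*
			 * Editorial note: the SCHED_CONTEXT_MODE_SET(DISABLE)
			 * sent below completes asynchronously. Its G2H reply
			 * is handled by xe_guc_sched_done_handler(), which
			 * clears the pending_disable bit set above and wakes
			 * the suspend waiter via suspend_fence_signal() once
			 * the GuC has actually stopped scheduling this queue.
			 */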
trace_xe_exec_queue_scheduling_disable(q); + + xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action), + G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1); + } + } else if (q->guc->suspend_pending) { + set_exec_queue_suspended(q); + suspend_fence_signal(q); + } +} + +static void __guc_exec_queue_process_msg_resume(struct xe_sched_msg *msg) +{ + struct xe_exec_queue *q = msg->private_data; + struct xe_guc *guc = exec_queue_to_guc(q); + + if (guc_exec_queue_allowed_to_change_state(q)) { + MAKE_SCHED_CONTEXT_ACTION(q, ENABLE); + + q->guc->resume_time = RESUME_PENDING; + clear_exec_queue_suspended(q); + set_exec_queue_pending_enable(q); + set_exec_queue_enabled(q); + trace_xe_exec_queue_scheduling_enable(q); + + xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action), + G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1); + } else { + clear_exec_queue_suspended(q); + } +} + +#define CLEANUP 1 /* Non-zero values to catch uninitialized msg */ +#define SET_SCHED_PROPS 2 +#define SUSPEND 3 +#define RESUME 4 + +static void guc_exec_queue_process_msg(struct xe_sched_msg *msg) +{ + trace_xe_sched_msg_recv(msg); + + switch (msg->opcode) { + case CLEANUP: + __guc_exec_queue_process_msg_cleanup(msg); + break; + case SET_SCHED_PROPS: + __guc_exec_queue_process_msg_set_sched_props(msg); + break; + case SUSPEND: + __guc_exec_queue_process_msg_suspend(msg); + break; + case RESUME: + __guc_exec_queue_process_msg_resume(msg); + break; + default: + XE_WARN_ON("Unknown message type"); + } +} + +static const struct drm_sched_backend_ops drm_sched_ops = { + .run_job = guc_exec_queue_run_job, + .free_job = guc_exec_queue_free_job, + .timedout_job = guc_exec_queue_timedout_job, +}; + +static const struct xe_sched_backend_ops xe_sched_ops = { + .process_msg = guc_exec_queue_process_msg, +}; + +static int guc_exec_queue_init(struct xe_exec_queue *q) +{ + struct xe_gpu_scheduler *sched; + struct xe_guc *guc = exec_queue_to_guc(q); + struct xe_device *xe = guc_to_xe(guc); + struct xe_guc_exec_queue *ge; + long timeout; + int err; + + xe_assert(xe, xe_device_uc_enabled(guc_to_xe(guc))); + + ge = kzalloc(sizeof(*ge), GFP_KERNEL); + if (!ge) + return -ENOMEM; + + q->guc = ge; + ge->q = q; + init_waitqueue_head(&ge->suspend_wait); + + timeout = (q->vm && xe_vm_in_lr_mode(q->vm)) ? 
MAX_SCHEDULE_TIMEOUT : + q->hwe->eclass->sched_props.job_timeout_ms; + err = xe_sched_init(&ge->sched, &drm_sched_ops, &xe_sched_ops, + get_submit_wq(guc), + q->lrc[0].ring.size / MAX_JOB_SIZE_BYTES, 64, + timeout, guc_to_gt(guc)->ordered_wq, NULL, + q->name, gt_to_xe(q->gt)->drm.dev); + if (err) + goto err_free; + + sched = &ge->sched; + err = xe_sched_entity_init(&ge->entity, sched); + if (err) + goto err_sched; + q->priority = XE_EXEC_QUEUE_PRIORITY_NORMAL; + + if (xe_exec_queue_is_lr(q)) + INIT_WORK(&q->guc->lr_tdr, xe_guc_exec_queue_lr_cleanup); + + mutex_lock(&guc->submission_state.lock); + + err = alloc_guc_id(guc, q); + if (err) + goto err_entity; + + q->entity = &ge->entity; + + if (guc_read_stopped(guc)) + xe_sched_stop(sched); + + mutex_unlock(&guc->submission_state.lock); + + xe_exec_queue_assign_name(q, q->guc->id); + + trace_xe_exec_queue_create(q); + + return 0; + +err_entity: + xe_sched_entity_fini(&ge->entity); +err_sched: + xe_sched_fini(&ge->sched); +err_free: + kfree(ge); + + return err; +} + +static void guc_exec_queue_kill(struct xe_exec_queue *q) +{ + trace_xe_exec_queue_kill(q); + set_exec_queue_killed(q); + xe_guc_exec_queue_trigger_cleanup(q); +} + +static void guc_exec_queue_add_msg(struct xe_exec_queue *q, struct xe_sched_msg *msg, + u32 opcode) +{ + INIT_LIST_HEAD(&msg->link); + msg->opcode = opcode; + msg->private_data = q; + + trace_xe_sched_msg_add(msg); + xe_sched_add_msg(&q->guc->sched, msg); +} + +#define STATIC_MSG_CLEANUP 0 +#define STATIC_MSG_SUSPEND 1 +#define STATIC_MSG_RESUME 2 +static void guc_exec_queue_fini(struct xe_exec_queue *q) +{ + struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_CLEANUP; + + if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT)) + guc_exec_queue_add_msg(q, msg, CLEANUP); + else + __guc_exec_queue_fini(exec_queue_to_guc(q), q); +} + +static int guc_exec_queue_set_priority(struct xe_exec_queue *q, + enum xe_exec_queue_priority priority) +{ + struct xe_sched_msg *msg; + + if (q->priority == priority || exec_queue_killed_or_banned(q)) + return 0; + + msg = kmalloc(sizeof(*msg), GFP_KERNEL); + if (!msg) + return -ENOMEM; + + guc_exec_queue_add_msg(q, msg, SET_SCHED_PROPS); + q->priority = priority; + + return 0; +} + +static int guc_exec_queue_set_timeslice(struct xe_exec_queue *q, u32 timeslice_us) +{ + struct xe_sched_msg *msg; + + if (q->sched_props.timeslice_us == timeslice_us || + exec_queue_killed_or_banned(q)) + return 0; + + msg = kmalloc(sizeof(*msg), GFP_KERNEL); + if (!msg) + return -ENOMEM; + + q->sched_props.timeslice_us = timeslice_us; + guc_exec_queue_add_msg(q, msg, SET_SCHED_PROPS); + + return 0; +} + +static int guc_exec_queue_set_preempt_timeout(struct xe_exec_queue *q, + u32 preempt_timeout_us) +{ + struct xe_sched_msg *msg; + + if (q->sched_props.preempt_timeout_us == preempt_timeout_us || + exec_queue_killed_or_banned(q)) + return 0; + + msg = kmalloc(sizeof(*msg), GFP_KERNEL); + if (!msg) + return -ENOMEM; + + q->sched_props.preempt_timeout_us = preempt_timeout_us; + guc_exec_queue_add_msg(q, msg, SET_SCHED_PROPS); + + return 0; +} + +static int guc_exec_queue_set_job_timeout(struct xe_exec_queue *q, u32 job_timeout_ms) +{ + struct xe_gpu_scheduler *sched = &q->guc->sched; + struct xe_guc *guc = exec_queue_to_guc(q); + struct xe_device *xe = guc_to_xe(guc); + + xe_assert(xe, !exec_queue_registered(q)); + xe_assert(xe, !exec_queue_banned(q)); + xe_assert(xe, !exec_queue_killed(q)); + + sched->base.timeout = job_timeout_ms; + + return 0; +} + +static int guc_exec_queue_suspend(struct xe_exec_queue *q) +{ + 
struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_SUSPEND; + + if (exec_queue_killed_or_banned(q) || q->guc->suspend_pending) + return -EINVAL; + + q->guc->suspend_pending = true; + guc_exec_queue_add_msg(q, msg, SUSPEND); + + return 0; +} + +static void guc_exec_queue_suspend_wait(struct xe_exec_queue *q) +{ + struct xe_guc *guc = exec_queue_to_guc(q); + + wait_event(q->guc->suspend_wait, !q->guc->suspend_pending || + guc_read_stopped(guc)); +} + +static void guc_exec_queue_resume(struct xe_exec_queue *q) +{ + struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_RESUME; + struct xe_guc *guc = exec_queue_to_guc(q); + struct xe_device *xe = guc_to_xe(guc); + + xe_assert(xe, !q->guc->suspend_pending); + + guc_exec_queue_add_msg(q, msg, RESUME); +} + +static bool guc_exec_queue_reset_status(struct xe_exec_queue *q) +{ + return exec_queue_reset(q); +} + +/* + * All of these functions are an abstraction layer which other parts of XE can + * use to trap into the GuC backend. All of these functions, aside from init, + * really shouldn't do much other than trap into the DRM scheduler which + * synchronizes these operations. + */ +static const struct xe_exec_queue_ops guc_exec_queue_ops = { + .init = guc_exec_queue_init, + .kill = guc_exec_queue_kill, + .fini = guc_exec_queue_fini, + .set_priority = guc_exec_queue_set_priority, + .set_timeslice = guc_exec_queue_set_timeslice, + .set_preempt_timeout = guc_exec_queue_set_preempt_timeout, + .set_job_timeout = guc_exec_queue_set_job_timeout, + .suspend = guc_exec_queue_suspend, + .suspend_wait = guc_exec_queue_suspend_wait, + .resume = guc_exec_queue_resume, + .reset_status = guc_exec_queue_reset_status, +}; + +static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q) +{ + struct xe_gpu_scheduler *sched = &q->guc->sched; + + /* Stop scheduling + flush any DRM scheduler operations */ + xe_sched_submission_stop(sched); + + /* Clean up lost G2H + reset engine state */ + if (exec_queue_registered(q)) { + if ((exec_queue_banned(q) && exec_queue_destroyed(q)) || + xe_exec_queue_is_lr(q)) + xe_exec_queue_put(q); + else if (exec_queue_destroyed(q)) + __guc_exec_queue_fini(guc, q); + } + if (q->guc->suspend_pending) { + set_exec_queue_suspended(q); + suspend_fence_signal(q); + } + atomic_and(EXEC_QUEUE_STATE_DESTROYED | ENGINE_STATE_SUSPENDED, + &q->guc->state); + q->guc->resume_time = 0; + trace_xe_exec_queue_stop(q); + + /* + * Ban any engine (aside from kernel and engines used for VM ops) with a + * started but not complete job or if a job has gone through a GT reset + * more than twice. + */ + if (!(q->flags & (EXEC_QUEUE_FLAG_KERNEL | EXEC_QUEUE_FLAG_VM))) { + struct xe_sched_job *job = xe_sched_first_pending_job(sched); + + if (job) { + if ((xe_sched_job_started(job) && + !xe_sched_job_completed(job)) || + xe_sched_invalidate_job(job, 2)) { + trace_xe_sched_job_ban(job); + xe_sched_tdr_queue_imm(&q->guc->sched); + set_exec_queue_banned(q); + } + } + } +} + +int xe_guc_submit_reset_prepare(struct xe_guc *guc) +{ + int ret; + + /* + * Using an atomic here rather than submission_state.lock as this + * function can be called while holding the CT lock (engine reset + * failure). submission_state.lock needs the CT lock to resubmit jobs. + * Atomic is not ideal, but it works to prevent against concurrent reset + * and releasing any TDRs waiting on guc->submission_state.stopped. 
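	 *
	 * Editorial sketch of the overall ordering as driven by the GT reset
	 * path (not part of the original comment):
	 *
	 *	xe_guc_submit_reset_prepare(guc);  mark stopped, wake CT waiters
	 *	xe_guc_submit_stop(guc);           stop schedulers, clean up lost G2H
	 *	(re-load and re-initialize the GuC)
	 *	xe_guc_submit_start(guc);          resubmit jobs, restart schedulers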
+ */ + ret = atomic_fetch_or(1, &guc->submission_state.stopped); + smp_wmb(); + wake_up_all(&guc->ct.wq); + + return ret; +} + +void xe_guc_submit_reset_wait(struct xe_guc *guc) +{ + wait_event(guc->ct.wq, !guc_read_stopped(guc)); +} + +int xe_guc_submit_stop(struct xe_guc *guc) +{ + struct xe_exec_queue *q; + unsigned long index; + struct xe_device *xe = guc_to_xe(guc); + + xe_assert(xe, guc_read_stopped(guc) == 1); + + mutex_lock(&guc->submission_state.lock); + + xa_for_each(&guc->submission_state.exec_queue_lookup, index, q) + guc_exec_queue_stop(guc, q); + + mutex_unlock(&guc->submission_state.lock); + + /* + * No one can enter the backend at this point, aside from new engine + * creation which is protected by guc->submission_state.lock. + */ + + return 0; +} + +static void guc_exec_queue_start(struct xe_exec_queue *q) +{ + struct xe_gpu_scheduler *sched = &q->guc->sched; + + if (!exec_queue_killed_or_banned(q)) { + int i; + + trace_xe_exec_queue_resubmit(q); + for (i = 0; i < q->width; ++i) + xe_lrc_set_ring_head(q->lrc + i, q->lrc[i].ring.tail); + xe_sched_resubmit_jobs(sched); + } + + xe_sched_submission_start(sched); +} + +int xe_guc_submit_start(struct xe_guc *guc) +{ + struct xe_exec_queue *q; + unsigned long index; + struct xe_device *xe = guc_to_xe(guc); + + xe_assert(xe, guc_read_stopped(guc) == 1); + + mutex_lock(&guc->submission_state.lock); + atomic_dec(&guc->submission_state.stopped); + xa_for_each(&guc->submission_state.exec_queue_lookup, index, q) + guc_exec_queue_start(q); + mutex_unlock(&guc->submission_state.lock); + + wake_up_all(&guc->ct.wq); + + return 0; +} + +static struct xe_exec_queue * +g2h_exec_queue_lookup(struct xe_guc *guc, u32 guc_id) +{ + struct xe_device *xe = guc_to_xe(guc); + struct xe_exec_queue *q; + + if (unlikely(guc_id >= GUC_ID_MAX)) { + drm_err(&xe->drm, "Invalid guc_id %u", guc_id); + return NULL; + } + + q = xa_load(&guc->submission_state.exec_queue_lookup, guc_id); + if (unlikely(!q)) { + drm_err(&xe->drm, "Not engine present for guc_id %u", guc_id); + return NULL; + } + + xe_assert(xe, guc_id >= q->guc->id); + xe_assert(xe, guc_id < (q->guc->id + q->width)); + + return q; +} + +static void deregister_exec_queue(struct xe_guc *guc, struct xe_exec_queue *q) +{ + u32 action[] = { + XE_GUC_ACTION_DEREGISTER_CONTEXT, + q->guc->id, + }; + + trace_xe_exec_queue_deregister(q); + + xe_guc_ct_send_g2h_handler(&guc->ct, action, ARRAY_SIZE(action)); +} + +int xe_guc_sched_done_handler(struct xe_guc *guc, u32 *msg, u32 len) +{ + struct xe_device *xe = guc_to_xe(guc); + struct xe_exec_queue *q; + u32 guc_id = msg[0]; + + if (unlikely(len < 2)) { + drm_err(&xe->drm, "Invalid length %u", len); + return -EPROTO; + } + + q = g2h_exec_queue_lookup(guc, guc_id); + if (unlikely(!q)) + return -EPROTO; + + if (unlikely(!exec_queue_pending_enable(q) && + !exec_queue_pending_disable(q))) { + drm_err(&xe->drm, "Unexpected engine state 0x%04x", + atomic_read(&q->guc->state)); + return -EPROTO; + } + + trace_xe_exec_queue_scheduling_done(q); + + if (exec_queue_pending_enable(q)) { + q->guc->resume_time = ktime_get(); + clear_exec_queue_pending_enable(q); + smp_wmb(); + wake_up_all(&guc->ct.wq); + } else { + clear_exec_queue_pending_disable(q); + if (q->guc->suspend_pending) { + suspend_fence_signal(q); + } else { + if (exec_queue_banned(q)) { + smp_wmb(); + wake_up_all(&guc->ct.wq); + } + deregister_exec_queue(guc, q); + } + } + + return 0; +} + +int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len) +{ + struct xe_device *xe = guc_to_xe(guc); + 
struct xe_exec_queue *q; + u32 guc_id = msg[0]; + + if (unlikely(len < 1)) { + drm_err(&xe->drm, "Invalid length %u", len); + return -EPROTO; + } + + q = g2h_exec_queue_lookup(guc, guc_id); + if (unlikely(!q)) + return -EPROTO; + + if (!exec_queue_destroyed(q) || exec_queue_pending_disable(q) || + exec_queue_pending_enable(q) || exec_queue_enabled(q)) { + drm_err(&xe->drm, "Unexpected engine state 0x%04x", + atomic_read(&q->guc->state)); + return -EPROTO; + } + + trace_xe_exec_queue_deregister_done(q); + + clear_exec_queue_registered(q); + + if (exec_queue_banned(q) || xe_exec_queue_is_lr(q)) + xe_exec_queue_put(q); + else + __guc_exec_queue_fini(guc, q); + + return 0; +} + +int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len) +{ + struct xe_device *xe = guc_to_xe(guc); + struct xe_exec_queue *q; + u32 guc_id = msg[0]; + + if (unlikely(len < 1)) { + drm_err(&xe->drm, "Invalid length %u", len); + return -EPROTO; + } + + q = g2h_exec_queue_lookup(guc, guc_id); + if (unlikely(!q)) + return -EPROTO; + + drm_info(&xe->drm, "Engine reset: guc_id=%d", guc_id); + + /* FIXME: Do error capture, most likely async */ + + trace_xe_exec_queue_reset(q); + + /* + * A banned engine is a NOP at this point (came from + * guc_exec_queue_timedout_job). Otherwise, kick drm scheduler to cancel + * jobs by setting timeout of the job to the minimum value kicking + * guc_exec_queue_timedout_job. + */ + set_exec_queue_reset(q); + if (!exec_queue_banned(q)) + xe_guc_exec_queue_trigger_cleanup(q); + + return 0; +} + +int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg, + u32 len) +{ + struct xe_device *xe = guc_to_xe(guc); + struct xe_exec_queue *q; + u32 guc_id = msg[0]; + + if (unlikely(len < 1)) { + drm_err(&xe->drm, "Invalid length %u", len); + return -EPROTO; + } + + q = g2h_exec_queue_lookup(guc, guc_id); + if (unlikely(!q)) + return -EPROTO; + + drm_dbg(&xe->drm, "Engine memory cat error: guc_id=%d", guc_id); + trace_xe_exec_queue_memory_cat_error(q); + + /* Treat the same as engine reset */ + set_exec_queue_reset(q); + if (!exec_queue_banned(q)) + xe_guc_exec_queue_trigger_cleanup(q); + + return 0; +} + +int xe_guc_exec_queue_reset_failure_handler(struct xe_guc *guc, u32 *msg, u32 len) +{ + struct xe_device *xe = guc_to_xe(guc); + u8 guc_class, instance; + u32 reason; + + if (unlikely(len != 3)) { + drm_err(&xe->drm, "Invalid length %u", len); + return -EPROTO; + } + + guc_class = msg[0]; + instance = msg[1]; + reason = msg[2]; + + /* Unexpected failure of a hardware feature, log an actual error */ + drm_err(&xe->drm, "GuC engine reset request failed on %d:%d because 0x%08X", + guc_class, instance, reason); + + xe_gt_reset_async(guc_to_gt(guc)); + + return 0; +} + +static void +guc_exec_queue_wq_snapshot_capture(struct xe_exec_queue *q, + struct xe_guc_submit_exec_queue_snapshot *snapshot) +{ + struct xe_guc *guc = exec_queue_to_guc(q); + struct xe_device *xe = guc_to_xe(guc); + struct iosys_map map = xe_lrc_parallel_map(q->lrc); + int i; + + snapshot->guc.wqi_head = q->guc->wqi_head; + snapshot->guc.wqi_tail = q->guc->wqi_tail; + snapshot->parallel.wq_desc.head = parallel_read(xe, map, wq_desc.head); + snapshot->parallel.wq_desc.tail = parallel_read(xe, map, wq_desc.tail); + snapshot->parallel.wq_desc.status = parallel_read(xe, map, + wq_desc.wq_status); + + if (snapshot->parallel.wq_desc.head != + snapshot->parallel.wq_desc.tail) { + for (i = snapshot->parallel.wq_desc.head; + i != snapshot->parallel.wq_desc.tail; + i = (i + sizeof(u32)) % WQ_SIZE) + 
snapshot->parallel.wq[i / sizeof(u32)] = + parallel_read(xe, map, wq[i / sizeof(u32)]); + } +} + +static void +guc_exec_queue_wq_snapshot_print(struct xe_guc_submit_exec_queue_snapshot *snapshot, + struct drm_printer *p) +{ + int i; + + drm_printf(p, "\tWQ head: %u (internal), %d (memory)\n", + snapshot->guc.wqi_head, snapshot->parallel.wq_desc.head); + drm_printf(p, "\tWQ tail: %u (internal), %d (memory)\n", + snapshot->guc.wqi_tail, snapshot->parallel.wq_desc.tail); + drm_printf(p, "\tWQ status: %u\n", snapshot->parallel.wq_desc.status); + + if (snapshot->parallel.wq_desc.head != + snapshot->parallel.wq_desc.tail) { + for (i = snapshot->parallel.wq_desc.head; + i != snapshot->parallel.wq_desc.tail; + i = (i + sizeof(u32)) % WQ_SIZE) + drm_printf(p, "\tWQ[%zu]: 0x%08x\n", i / sizeof(u32), + snapshot->parallel.wq[i / sizeof(u32)]); + } +} + +/** + * xe_guc_exec_queue_snapshot_capture - Take a quick snapshot of the GuC Engine. + * @q: Xe exec queue. + * + * This can be printed out in a later stage like during dev_coredump + * analysis. + * + * Returns: a GuC Submit Engine snapshot object that must be freed by the + * caller, using `xe_guc_exec_queue_snapshot_free`. + */ +struct xe_guc_submit_exec_queue_snapshot * +xe_guc_exec_queue_snapshot_capture(struct xe_exec_queue *q) +{ + struct xe_guc *guc = exec_queue_to_guc(q); + struct xe_device *xe = guc_to_xe(guc); + struct xe_gpu_scheduler *sched = &q->guc->sched; + struct xe_sched_job *job; + struct xe_guc_submit_exec_queue_snapshot *snapshot; + int i; + + snapshot = kzalloc(sizeof(*snapshot), GFP_ATOMIC); + + if (!snapshot) { + drm_err(&xe->drm, "Skipping GuC Engine snapshot entirely.\n"); + return NULL; + } + + snapshot->guc.id = q->guc->id; + memcpy(&snapshot->name, &q->name, sizeof(snapshot->name)); + snapshot->class = q->class; + snapshot->logical_mask = q->logical_mask; + snapshot->width = q->width; + snapshot->refcount = kref_read(&q->refcount); + snapshot->sched_timeout = sched->base.timeout; + snapshot->sched_props.timeslice_us = q->sched_props.timeslice_us; + snapshot->sched_props.preempt_timeout_us = + q->sched_props.preempt_timeout_us; + + snapshot->lrc = kmalloc_array(q->width, sizeof(struct lrc_snapshot), + GFP_ATOMIC); + + if (!snapshot->lrc) { + drm_err(&xe->drm, "Skipping GuC Engine LRC snapshot.\n"); + } else { + for (i = 0; i < q->width; ++i) { + struct xe_lrc *lrc = q->lrc + i; + + snapshot->lrc[i].context_desc = + lower_32_bits(xe_lrc_ggtt_addr(lrc)); + snapshot->lrc[i].head = xe_lrc_ring_head(lrc); + snapshot->lrc[i].tail.internal = lrc->ring.tail; + snapshot->lrc[i].tail.memory = + xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL); + snapshot->lrc[i].start_seqno = xe_lrc_start_seqno(lrc); + snapshot->lrc[i].seqno = xe_lrc_seqno(lrc); + } + } + + snapshot->schedule_state = atomic_read(&q->guc->state); + snapshot->exec_queue_flags = q->flags; + + snapshot->parallel_execution = xe_exec_queue_is_parallel(q); + if (snapshot->parallel_execution) + guc_exec_queue_wq_snapshot_capture(q, snapshot); + + spin_lock(&sched->base.job_list_lock); + snapshot->pending_list_size = list_count_nodes(&sched->base.pending_list); + snapshot->pending_list = kmalloc_array(snapshot->pending_list_size, + sizeof(struct pending_list_snapshot), + GFP_ATOMIC); + + if (!snapshot->pending_list) { + drm_err(&xe->drm, "Skipping GuC Engine pending_list snapshot.\n"); + } else { + i = 0; + list_for_each_entry(job, &sched->base.pending_list, drm.list) { + snapshot->pending_list[i].seqno = + xe_sched_job_seqno(job); + snapshot->pending_list[i].fence = + 
dma_fence_is_signaled(job->fence) ? 1 : 0; + snapshot->pending_list[i].finished = + dma_fence_is_signaled(&job->drm.s_fence->finished) + ? 1 : 0; + i++; + } + } + + spin_unlock(&sched->base.job_list_lock); + + return snapshot; +} + +/** + * xe_guc_exec_queue_snapshot_print - Print out a given GuC Engine snapshot. + * @snapshot: GuC Submit Engine snapshot object. + * @p: drm_printer where it will be printed out. + * + * This function prints out a given GuC Submit Engine snapshot object. + */ +void +xe_guc_exec_queue_snapshot_print(struct xe_guc_submit_exec_queue_snapshot *snapshot, + struct drm_printer *p) +{ + int i; + + if (!snapshot) + return; + + drm_printf(p, "\nGuC ID: %d\n", snapshot->guc.id); + drm_printf(p, "\tName: %s\n", snapshot->name); + drm_printf(p, "\tClass: %d\n", snapshot->class); + drm_printf(p, "\tLogical mask: 0x%x\n", snapshot->logical_mask); + drm_printf(p, "\tWidth: %d\n", snapshot->width); + drm_printf(p, "\tRef: %d\n", snapshot->refcount); + drm_printf(p, "\tTimeout: %ld (ms)\n", snapshot->sched_timeout); + drm_printf(p, "\tTimeslice: %u (us)\n", + snapshot->sched_props.timeslice_us); + drm_printf(p, "\tPreempt timeout: %u (us)\n", + snapshot->sched_props.preempt_timeout_us); + + for (i = 0; snapshot->lrc && i < snapshot->width; ++i) { + drm_printf(p, "\tHW Context Desc: 0x%08x\n", + snapshot->lrc[i].context_desc); + drm_printf(p, "\tLRC Head: (memory) %u\n", + snapshot->lrc[i].head); + drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n", + snapshot->lrc[i].tail.internal, + snapshot->lrc[i].tail.memory); + drm_printf(p, "\tStart seqno: (memory) %d\n", + snapshot->lrc[i].start_seqno); + drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->lrc[i].seqno); + } + drm_printf(p, "\tSchedule State: 0x%x\n", snapshot->schedule_state); + drm_printf(p, "\tFlags: 0x%lx\n", snapshot->exec_queue_flags); + + if (snapshot->parallel_execution) + guc_exec_queue_wq_snapshot_print(snapshot, p); + + for (i = 0; snapshot->pending_list && i < snapshot->pending_list_size; + i++) + drm_printf(p, "\tJob: seqno=%d, fence=%d, finished=%d\n", + snapshot->pending_list[i].seqno, + snapshot->pending_list[i].fence, + snapshot->pending_list[i].finished); +} + +/** + * xe_guc_exec_queue_snapshot_free - Free all allocated objects for a given + * snapshot. + * @snapshot: GuC Submit Engine snapshot object. + * + * This function free all the memory that needed to be allocated at capture + * time. + */ +void xe_guc_exec_queue_snapshot_free(struct xe_guc_submit_exec_queue_snapshot *snapshot) +{ + if (!snapshot) + return; + + kfree(snapshot->lrc); + kfree(snapshot->pending_list); + kfree(snapshot); +} + +static void guc_exec_queue_print(struct xe_exec_queue *q, struct drm_printer *p) +{ + struct xe_guc_submit_exec_queue_snapshot *snapshot; + + snapshot = xe_guc_exec_queue_snapshot_capture(q); + xe_guc_exec_queue_snapshot_print(snapshot, p); + xe_guc_exec_queue_snapshot_free(snapshot); +} + +/** + * xe_guc_submit_print - GuC Submit Print. + * @guc: GuC. + * @p: drm_printer where it will be printed out. + * + * This function capture and prints snapshots of **all** GuC Engines. 
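+ *
+ * Illustrative usage sketch (not part of this patch; the drm_debug_printer
+ * destination is only an assumption, any struct drm_printer works):
+ *
+ *	struct drm_printer p = drm_debug_printer(__func__);
+ *
+ *	xe_guc_submit_print(guc, &p);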
+ */ +void xe_guc_submit_print(struct xe_guc *guc, struct drm_printer *p) +{ + struct xe_exec_queue *q; + unsigned long index; + + if (!xe_device_uc_enabled(guc_to_xe(guc))) + return; + + mutex_lock(&guc->submission_state.lock); + xa_for_each(&guc->submission_state.exec_queue_lookup, index, q) + guc_exec_queue_print(q, p); + mutex_unlock(&guc->submission_state.lock); +} diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h new file mode 100644 index 000000000000..fc97869c5b86 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_submit.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GUC_SUBMIT_H_ +#define _XE_GUC_SUBMIT_H_ + +#include <linux/types.h> + +struct drm_printer; +struct xe_exec_queue; +struct xe_guc; + +int xe_guc_submit_init(struct xe_guc *guc); + +int xe_guc_submit_reset_prepare(struct xe_guc *guc); +void xe_guc_submit_reset_wait(struct xe_guc *guc); +int xe_guc_submit_stop(struct xe_guc *guc); +int xe_guc_submit_start(struct xe_guc *guc); + +int xe_guc_sched_done_handler(struct xe_guc *guc, u32 *msg, u32 len); +int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len); +int xe_guc_exec_queue_reset_handler(struct xe_guc *guc, u32 *msg, u32 len); +int xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg, + u32 len); +int xe_guc_exec_queue_reset_failure_handler(struct xe_guc *guc, u32 *msg, u32 len); + +struct xe_guc_submit_exec_queue_snapshot * +xe_guc_exec_queue_snapshot_capture(struct xe_exec_queue *q); +void +xe_guc_exec_queue_snapshot_print(struct xe_guc_submit_exec_queue_snapshot *snapshot, + struct drm_printer *p); +void +xe_guc_exec_queue_snapshot_free(struct xe_guc_submit_exec_queue_snapshot *snapshot); +void xe_guc_submit_print(struct xe_guc *guc, struct drm_printer *p); + +#endif diff --git a/drivers/gpu/drm/xe/xe_guc_submit_types.h b/drivers/gpu/drm/xe/xe_guc_submit_types.h new file mode 100644 index 000000000000..649b0a852692 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_submit_types.h @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_GUC_SUBMIT_TYPES_H_ +#define _XE_GUC_SUBMIT_TYPES_H_ + +#include "xe_hw_engine_types.h" + +/* Work item for submitting workloads into work queue of GuC. */ +#define WQ_STATUS_ACTIVE 1 +#define WQ_STATUS_SUSPENDED 2 +#define WQ_STATUS_CMD_ERROR 3 +#define WQ_STATUS_ENGINE_ID_NOT_USED 4 +#define WQ_STATUS_SUSPENDED_FROM_RESET 5 +#define WQ_TYPE_NOOP 0x4 +#define WQ_TYPE_MULTI_LRC 0x5 +#define WQ_TYPE_MASK GENMASK(7, 0) +#define WQ_LEN_MASK GENMASK(26, 16) + +#define WQ_GUC_ID_MASK GENMASK(15, 0) +#define WQ_RING_TAIL_MASK GENMASK(28, 18) + +#define PARALLEL_SCRATCH_SIZE 2048 +#define WQ_SIZE (PARALLEL_SCRATCH_SIZE / 2) +#define WQ_OFFSET (PARALLEL_SCRATCH_SIZE - WQ_SIZE) +#define CACHELINE_BYTES 64 + +struct guc_sched_wq_desc { + u32 head; + u32 tail; + u32 error_offset; + u32 wq_status; + u32 reserved[28]; +} __packed; + +struct sync_semaphore { + u32 semaphore; + u8 unused[CACHELINE_BYTES - sizeof(u32)]; +}; + +/** + * struct guc_submit_parallel_scratch - A scratch shared mapped buffer. 
+ */ +struct guc_submit_parallel_scratch { + /** @wq_desc: Guc scheduler workqueue descriptor */ + struct guc_sched_wq_desc wq_desc; + + /** @go: Go Semaphore */ + struct sync_semaphore go; + /** @join: Joined semaphore for the relevant hw engine instances */ + struct sync_semaphore join[XE_HW_ENGINE_MAX_INSTANCE]; + + /** @unused: Unused/Reserved memory space */ + u8 unused[WQ_OFFSET - sizeof(struct guc_sched_wq_desc) - + sizeof(struct sync_semaphore) * + (XE_HW_ENGINE_MAX_INSTANCE + 1)]; + + /** @wq: Workqueue info */ + u32 wq[WQ_SIZE / sizeof(u32)]; +}; + +struct lrc_snapshot { + u32 context_desc; + u32 head; + struct { + u32 internal; + u32 memory; + } tail; + u32 start_seqno; + u32 seqno; +}; + +struct pending_list_snapshot { + u32 seqno; + bool fence; + bool finished; +}; + +/** + * struct xe_guc_submit_exec_queue_snapshot - Snapshot for devcoredump + */ +struct xe_guc_submit_exec_queue_snapshot { + /** @name: name of this exec queue */ + char name[MAX_FENCE_NAME_LEN]; + /** @class: class of this exec queue */ + enum xe_engine_class class; + /** + * @logical_mask: logical mask of where job submitted to exec queue can run + */ + u32 logical_mask; + /** @width: width (number BB submitted per exec) of this exec queue */ + u16 width; + /** @refcount: ref count of this exec queue */ + u32 refcount; + /** + * @sched_timeout: the time after which a job is removed from the + * scheduler. + */ + long sched_timeout; + + /** @sched_props: scheduling properties */ + struct { + /** @timeslice_us: timeslice period in micro-seconds */ + u32 timeslice_us; + /** @preempt_timeout_us: preemption timeout in micro-seconds */ + u32 preempt_timeout_us; + } sched_props; + + /** @lrc: LRC Snapshot */ + struct lrc_snapshot *lrc; + + /** @schedule_state: Schedule State at the moment of Crash */ + u32 schedule_state; + /** @exec_queue_flags: Flags of the faulty exec_queue */ + unsigned long exec_queue_flags; + + /** @guc: GuC Engine Snapshot */ + struct { + /** @wqi_head: work queue item head */ + u32 wqi_head; + /** @wqi_tail: work queue item tail */ + u32 wqi_tail; + /** @id: GuC id for this exec_queue */ + u16 id; + } guc; + + /** + * @parallel_execution: Indication if the failure was during parallel + * execution + */ + bool parallel_execution; + /** @parallel: snapshot of the useful parallel scratch */ + struct { + /** @wq_desc: Workqueue description */ + struct { + /** @head: Workqueue Head */ + u32 head; + /** @tail: Workqueue Tail */ + u32 tail; + /** @status: Workqueue Status */ + u32 status; + } wq_desc; + /** @wq: Workqueue Items */ + u32 wq[WQ_SIZE / sizeof(u32)]; + } parallel; + + /** @pending_list_size: Size of the pending list snapshot array */ + int pending_list_size; + /** @pending_list: snapshot of the pending list info */ + struct pending_list_snapshot *pending_list; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_guc_types.h b/drivers/gpu/drm/xe/xe_guc_types.h new file mode 100644 index 000000000000..cd80802e8918 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_guc_types.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_GUC_TYPES_H_ +#define _XE_GUC_TYPES_H_ + +#include <linux/idr.h> +#include <linux/xarray.h> + +#include "regs/xe_reg_defs.h" +#include "xe_guc_ads_types.h" +#include "xe_guc_ct_types.h" +#include "xe_guc_fwif.h" +#include "xe_guc_log_types.h" +#include "xe_guc_pc_types.h" +#include "xe_uc_fw_types.h" + +/** + * struct xe_guc - Graphic micro controller + */ +struct xe_guc { + /** @fw: Generic uC firmware management 
*/ + struct xe_uc_fw fw; + /** @log: GuC log */ + struct xe_guc_log log; + /** @ads: GuC ads */ + struct xe_guc_ads ads; + /** @ct: GuC ct */ + struct xe_guc_ct ct; + /** @pc: GuC Power Conservation */ + struct xe_guc_pc pc; + /** @submission_state: GuC submission state */ + struct { + /** @exec_queue_lookup: Lookup an xe_engine from guc_id */ + struct xarray exec_queue_lookup; + /** @guc_ids: used to allocate new guc_ids, single-lrc */ + struct ida guc_ids; + /** @guc_ids_bitmap: used to allocate new guc_ids, multi-lrc */ + unsigned long *guc_ids_bitmap; + /** @stopped: submissions are stopped */ + atomic_t stopped; + /** @lock: protects submission state */ + struct mutex lock; + /** @suspend: suspend fence state */ + struct { + /** @lock: suspend fences lock */ + spinlock_t lock; + /** @context: suspend fences context */ + u64 context; + /** @seqno: suspend fences seqno */ + u32 seqno; + } suspend; +#ifdef CONFIG_PROVE_LOCKING +#define NUM_SUBMIT_WQ 256 + /** @submit_wq_pool: submission ordered workqueues pool */ + struct workqueue_struct *submit_wq_pool[NUM_SUBMIT_WQ]; + /** @submit_wq_idx: submission ordered workqueue index */ + int submit_wq_idx; +#endif + /** @enabled: submission is enabled */ + bool enabled; + } submission_state; + /** @hwconfig: Hardware config state */ + struct { + /** @bo: buffer object of the hardware config */ + struct xe_bo *bo; + /** @size: size of the hardware config */ + u32 size; + } hwconfig; + + /** + * @notify_reg: Register which is written to notify GuC of H2G messages + */ + struct xe_reg notify_reg; + /** @params: Control params for fw initialization */ + u32 params[GUC_CTL_MAX_DWORDS]; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_heci_gsc.c b/drivers/gpu/drm/xe/xe_heci_gsc.c new file mode 100644 index 000000000000..bfdd33b9b23b --- /dev/null +++ b/drivers/gpu/drm/xe/xe_heci_gsc.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright(c) 2023, Intel Corporation. All rights reserved. 
+ */ + +#include <linux/irq.h> +#include <linux/mei_aux.h> +#include <linux/pci.h> +#include <linux/sizes.h> + +#include "xe_device_types.h" +#include "xe_drv.h" +#include "xe_heci_gsc.h" +#include "xe_platform_types.h" + +#define GSC_BAR_LENGTH 0x00000FFC + +#define DG1_GSC_HECI2_BASE 0x259000 +#define PVC_GSC_HECI2_BASE 0x285000 +#define DG2_GSC_HECI2_BASE 0x374000 + +static void heci_gsc_irq_mask(struct irq_data *d) +{ + /* generic irq handling */ +} + +static void heci_gsc_irq_unmask(struct irq_data *d) +{ + /* generic irq handling */ +} + +static struct irq_chip heci_gsc_irq_chip = { + .name = "gsc_irq_chip", + .irq_mask = heci_gsc_irq_mask, + .irq_unmask = heci_gsc_irq_unmask, +}; + +static int heci_gsc_irq_init(int irq) +{ + irq_set_chip_and_handler_name(irq, &heci_gsc_irq_chip, + handle_simple_irq, "heci_gsc_irq_handler"); + + return irq_set_chip_data(irq, NULL); +} + +/** + * struct heci_gsc_def - graphics security controller heci interface definitions + * + * @name: name of the heci device + * @bar: address of the mmio bar + * @bar_size: size of the mmio bar + * @use_polling: indication of using polling mode for the device + * @slow_firmware: indication of whether the device is slow (needs longer timeouts) + */ +struct heci_gsc_def { + const char *name; + unsigned long bar; + size_t bar_size; + bool use_polling; + bool slow_firmware; +}; + +/* gsc resources and definitions */ +static const struct heci_gsc_def heci_gsc_def_dg1 = { + .name = "mei-gscfi", + .bar = DG1_GSC_HECI2_BASE, + .bar_size = GSC_BAR_LENGTH, +}; + +static const struct heci_gsc_def heci_gsc_def_dg2 = { + .name = "mei-gscfi", + .bar = DG2_GSC_HECI2_BASE, + .bar_size = GSC_BAR_LENGTH, +}; + +static const struct heci_gsc_def heci_gsc_def_pvc = { + .name = "mei-gscfi", + .bar = PVC_GSC_HECI2_BASE, + .bar_size = GSC_BAR_LENGTH, + .slow_firmware = true, +}; + +static void heci_gsc_release_dev(struct device *dev) +{ + struct auxiliary_device *aux_dev = to_auxiliary_dev(dev); + struct mei_aux_device *adev = auxiliary_dev_to_mei_aux_dev(aux_dev); + + kfree(adev); +} + +void xe_heci_gsc_fini(struct xe_device *xe) +{ + struct xe_heci_gsc *heci_gsc = &xe->heci_gsc; + + if (!HAS_HECI_GSCFI(xe)) + return; + + if (heci_gsc->adev) { + struct auxiliary_device *aux_dev = &heci_gsc->adev->aux_dev; + + auxiliary_device_delete(aux_dev); + auxiliary_device_uninit(aux_dev); + heci_gsc->adev = NULL; + } + + if (heci_gsc->irq >= 0) + irq_free_desc(heci_gsc->irq); + heci_gsc->irq = -1; +} + +static int heci_gsc_irq_setup(struct xe_device *xe) +{ + struct xe_heci_gsc *heci_gsc = &xe->heci_gsc; + int ret; + + heci_gsc->irq = irq_alloc_desc(0); + if (heci_gsc->irq < 0) { + drm_err(&xe->drm, "gsc irq error %d\n", heci_gsc->irq); + return heci_gsc->irq; + } + + ret = heci_gsc_irq_init(heci_gsc->irq); + if (ret < 0) + drm_err(&xe->drm, "gsc irq init failed %d\n", ret); + + return ret; +} + +static int heci_gsc_add_device(struct xe_device *xe, const struct heci_gsc_def *def) +{ + struct xe_heci_gsc *heci_gsc = &xe->heci_gsc; + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + struct auxiliary_device *aux_dev; + struct mei_aux_device *adev; + int ret; + + adev = kzalloc(sizeof(*adev), GFP_KERNEL); + if (!adev) + return -ENOMEM; + adev->irq = heci_gsc->irq; + adev->bar.parent = &pdev->resource[0]; + adev->bar.start = def->bar + pdev->resource[0].start; + adev->bar.end = adev->bar.start + def->bar_size - 1; + adev->bar.flags = IORESOURCE_MEM; + adev->bar.desc = IORES_DESC_NONE; + adev->slow_firmware = def->slow_firmware; + + aux_dev = 
&adev->aux_dev; + aux_dev->name = def->name; + aux_dev->id = (pci_domain_nr(pdev->bus) << 16) | + PCI_DEVID(pdev->bus->number, pdev->devfn); + aux_dev->dev.parent = &pdev->dev; + aux_dev->dev.release = heci_gsc_release_dev; + + ret = auxiliary_device_init(aux_dev); + if (ret < 0) { + drm_err(&xe->drm, "gsc aux init failed %d\n", ret); + kfree(adev); + return ret; + } + + heci_gsc->adev = adev; /* needed by the notifier */ + ret = auxiliary_device_add(aux_dev); + if (ret < 0) { + drm_err(&xe->drm, "gsc aux add failed %d\n", ret); + heci_gsc->adev = NULL; + + /* adev will be freed with the put_device() and .release sequence */ + auxiliary_device_uninit(aux_dev); + } + return ret; +} + +void xe_heci_gsc_init(struct xe_device *xe) +{ + struct xe_heci_gsc *heci_gsc = &xe->heci_gsc; + const struct heci_gsc_def *def; + int ret; + + if (!HAS_HECI_GSCFI(xe)) + return; + + heci_gsc->irq = -1; + + if (xe->info.platform == XE_PVC) { + def = &heci_gsc_def_pvc; + } else if (xe->info.platform == XE_DG2) { + def = &heci_gsc_def_dg2; + } else if (xe->info.platform == XE_DG1) { + def = &heci_gsc_def_dg1; + } else { + drm_warn_once(&xe->drm, "Unknown platform\n"); + return; + } + + if (!def->name) { + drm_warn_once(&xe->drm, "HECI is not implemented!\n"); + return; + } + + if (!def->use_polling) { + ret = heci_gsc_irq_setup(xe); + if (ret) + goto fail; + } + + ret = heci_gsc_add_device(xe, def); + if (ret) + goto fail; + + return; +fail: + xe_heci_gsc_fini(xe); +} + +void xe_heci_gsc_irq_handler(struct xe_device *xe, u32 iir) +{ + int ret; + + if ((iir & GSC_IRQ_INTF(1)) == 0) + return; + + if (!HAS_HECI_GSCFI(xe)) { + drm_warn_once(&xe->drm, "GSC irq: not supported"); + return; + } + + if (xe->heci_gsc.irq < 0) + return; + + ret = generic_handle_irq(xe->heci_gsc.irq); + if (ret) + drm_err_ratelimited(&xe->drm, "error handling GSC irq: %d\n", ret); +} diff --git a/drivers/gpu/drm/xe/xe_heci_gsc.h b/drivers/gpu/drm/xe/xe_heci_gsc.h new file mode 100644 index 000000000000..9db454478fae --- /dev/null +++ b/drivers/gpu/drm/xe/xe_heci_gsc.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright(c) 2023, Intel Corporation. All rights reserved. + */ +#ifndef __XE_HECI_GSC_DEV_H__ +#define __XE_HECI_GSC_DEV_H__ + +#include <linux/types.h> + +struct xe_device; +struct mei_aux_device; + +/* + * The HECI1 bit corresponds to bit15 and HECI2 to bit14. + * The reason for this is to allow growth for more interfaces in the future. 
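+ *
+ * For instance, GSC_IRQ_INTF(0) evaluates to BIT(15) (HECI1) and
+ * GSC_IRQ_INTF(1) to BIT(14) (HECI2); xe_heci_gsc_irq_handler() below only
+ * cares about HECI2, so it tests the IIR against GSC_IRQ_INTF(1).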
+ */ +#define GSC_IRQ_INTF(_x) BIT(15 - (_x)) + +/** + * struct xe_heci_gsc - graphics security controller for xe, HECI interface + * + * @adev : pointer to mei auxiliary device structure + * @irq : irq number + * + */ +struct xe_heci_gsc { + struct mei_aux_device *adev; + int irq; +}; + +void xe_heci_gsc_init(struct xe_device *xe); +void xe_heci_gsc_fini(struct xe_device *xe); +void xe_heci_gsc_irq_handler(struct xe_device *xe, u32 iir); + +#endif /* __XE_HECI_GSC_DEV_H__ */ diff --git a/drivers/gpu/drm/xe/xe_huc.c b/drivers/gpu/drm/xe/xe_huc.c new file mode 100644 index 000000000000..eca109791c6a --- /dev/null +++ b/drivers/gpu/drm/xe/xe_huc.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_huc.h" + +#include <drm/drm_managed.h> + +#include "abi/gsc_pxp_commands_abi.h" +#include "regs/xe_gsc_regs.h" +#include "regs/xe_guc_regs.h" +#include "xe_assert.h" +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_force_wake.h" +#include "xe_gsc_submit.h" +#include "xe_gt.h" +#include "xe_guc.h" +#include "xe_map.h" +#include "xe_mmio.h" +#include "xe_uc_fw.h" + +static struct xe_gt * +huc_to_gt(struct xe_huc *huc) +{ + return container_of(huc, struct xe_gt, uc.huc); +} + +static struct xe_device * +huc_to_xe(struct xe_huc *huc) +{ + return gt_to_xe(huc_to_gt(huc)); +} + +static struct xe_guc * +huc_to_guc(struct xe_huc *huc) +{ + return &container_of(huc, struct xe_uc, huc)->guc; +} + +static void free_gsc_pkt(struct drm_device *drm, void *arg) +{ + struct xe_huc *huc = arg; + + xe_bo_unpin_map_no_vm(huc->gsc_pkt); + huc->gsc_pkt = NULL; +} + +#define PXP43_HUC_AUTH_INOUT_SIZE SZ_4K +static int huc_alloc_gsc_pkt(struct xe_huc *huc) +{ + struct xe_gt *gt = huc_to_gt(huc); + struct xe_device *xe = gt_to_xe(gt); + struct xe_bo *bo; + int err; + + /* we use a single object for both input and output */ + bo = xe_bo_create_pin_map(xe, gt_to_tile(gt), NULL, + PXP43_HUC_AUTH_INOUT_SIZE * 2, + ttm_bo_type_kernel, + XE_BO_CREATE_SYSTEM_BIT | + XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(bo)) + return PTR_ERR(bo); + + huc->gsc_pkt = bo; + + err = drmm_add_action_or_reset(&xe->drm, free_gsc_pkt, huc); + if (err) { + free_gsc_pkt(&xe->drm, huc); + return err; + } + + return 0; +} + +int xe_huc_init(struct xe_huc *huc) +{ + struct xe_gt *gt = huc_to_gt(huc); + struct xe_tile *tile = gt_to_tile(gt); + struct xe_device *xe = gt_to_xe(gt); + int ret; + + huc->fw.type = XE_UC_FW_TYPE_HUC; + + /* On platforms with a media GT the HuC is only available there */ + if (tile->media_gt && (gt != tile->media_gt)) { + xe_uc_fw_change_status(&huc->fw, XE_UC_FIRMWARE_NOT_SUPPORTED); + return 0; + } + + ret = xe_uc_fw_init(&huc->fw); + if (ret) + goto out; + + if (!xe_uc_fw_is_enabled(&huc->fw)) + return 0; + + if (huc->fw.has_gsc_headers) { + ret = huc_alloc_gsc_pkt(huc); + if (ret) + goto out; + } + + xe_uc_fw_change_status(&huc->fw, XE_UC_FIRMWARE_LOADABLE); + + return 0; + +out: + drm_err(&xe->drm, "HuC init failed with %d", ret); + return ret; +} + +int xe_huc_upload(struct xe_huc *huc) +{ + if (!xe_uc_fw_is_loadable(&huc->fw)) + return 0; + return xe_uc_fw_upload(&huc->fw, 0, HUC_UKERNEL); +} + +#define huc_auth_msg_wr(xe_, map_, offset_, field_, val_) \ + xe_map_wr_field(xe_, map_, offset_, struct pxp43_new_huc_auth_in, field_, val_) +#define huc_auth_msg_rd(xe_, map_, offset_, field_) \ + xe_map_rd_field(xe_, map_, offset_, struct pxp43_huc_auth_out, field_) + +static u32 huc_emit_pxp_auth_msg(struct xe_device *xe, struct iosys_map *map, + u32 
wr_offset, u32 huc_offset, u32 huc_size) +{ + xe_map_memset(xe, map, wr_offset, 0, sizeof(struct pxp43_new_huc_auth_in)); + + huc_auth_msg_wr(xe, map, wr_offset, header.api_version, PXP_APIVER(4, 3)); + huc_auth_msg_wr(xe, map, wr_offset, header.command_id, PXP43_CMDID_NEW_HUC_AUTH); + huc_auth_msg_wr(xe, map, wr_offset, header.status, 0); + huc_auth_msg_wr(xe, map, wr_offset, header.buffer_len, + sizeof(struct pxp43_new_huc_auth_in) - sizeof(struct pxp_cmd_header)); + huc_auth_msg_wr(xe, map, wr_offset, huc_base_address, huc_offset); + huc_auth_msg_wr(xe, map, wr_offset, huc_size, huc_size); + + return wr_offset + sizeof(struct pxp43_new_huc_auth_in); +} + +static int huc_auth_via_gsccs(struct xe_huc *huc) +{ + struct xe_gt *gt = huc_to_gt(huc); + struct xe_device *xe = gt_to_xe(gt); + struct xe_bo *pkt = huc->gsc_pkt; + u32 wr_offset; + u32 rd_offset; + u64 ggtt_offset; + u32 out_status; + int retry = 5; + int err = 0; + + if (!pkt) + return -ENODEV; + + ggtt_offset = xe_bo_ggtt_addr(pkt); + + wr_offset = xe_gsc_emit_header(xe, &pkt->vmap, 0, HECI_MEADDRESS_PXP, 0, + sizeof(struct pxp43_new_huc_auth_in)); + wr_offset = huc_emit_pxp_auth_msg(xe, &pkt->vmap, wr_offset, + xe_bo_ggtt_addr(huc->fw.bo), + huc->fw.bo->size); + do { + err = xe_gsc_pkt_submit_kernel(>->uc.gsc, ggtt_offset, wr_offset, + ggtt_offset + PXP43_HUC_AUTH_INOUT_SIZE, + PXP43_HUC_AUTH_INOUT_SIZE); + if (err) + break; + + if (xe_gsc_check_and_update_pending(xe, &pkt->vmap, 0, &pkt->vmap, + PXP43_HUC_AUTH_INOUT_SIZE)) { + err = -EBUSY; + msleep(50); + } + } while (--retry && err == -EBUSY); + + if (err) { + drm_err(&xe->drm, "failed to submit GSC request to auth: %d\n", err); + return err; + } + + err = xe_gsc_read_out_header(xe, &pkt->vmap, PXP43_HUC_AUTH_INOUT_SIZE, + sizeof(struct pxp43_huc_auth_out), &rd_offset); + if (err) { + drm_err(&xe->drm, "HuC: invalid GSC reply for auth (err=%d)\n", err); + return err; + } + + /* + * The GSC will return PXP_STATUS_OP_NOT_PERMITTED if the HuC is already + * authenticated. If the same error is ever returned with HuC not loaded + * we'll still catch it when we check the authentication bit later. 
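+ * In other words, the check below accepts both PXP_STATUS_SUCCESS and
+ * PXP_STATUS_OP_NOT_PERMITTED as a successful outcome and fails the
+ * authentication with -EIO for any other status code.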
+ */ + out_status = huc_auth_msg_rd(xe, &pkt->vmap, rd_offset, header.status); + if (out_status != PXP_STATUS_SUCCESS && out_status != PXP_STATUS_OP_NOT_PERMITTED) { + drm_err(&xe->drm, "auth failed with GSC error = 0x%x\n", out_status); + return -EIO; + } + + return 0; +} + +static const struct { + const char *name; + struct xe_reg reg; + u32 val; +} huc_auth_modes[XE_HUC_AUTH_TYPES_COUNT] = { + [XE_HUC_AUTH_VIA_GUC] = { "GuC", + HUC_KERNEL_LOAD_INFO, + HUC_LOAD_SUCCESSFUL }, + [XE_HUC_AUTH_VIA_GSC] = { "GSC", + HECI_FWSTS5(MTL_GSC_HECI1_BASE), + HECI1_FWSTS5_HUC_AUTH_DONE }, +}; + +bool xe_huc_is_authenticated(struct xe_huc *huc, enum xe_huc_auth_types type) +{ + struct xe_gt *gt = huc_to_gt(huc); + + return xe_mmio_read32(gt, huc_auth_modes[type].reg) & huc_auth_modes[type].val; +} + +int xe_huc_auth(struct xe_huc *huc, enum xe_huc_auth_types type) +{ + struct xe_device *xe = huc_to_xe(huc); + struct xe_gt *gt = huc_to_gt(huc); + struct xe_guc *guc = huc_to_guc(huc); + int ret; + + if (!xe_uc_fw_is_loadable(&huc->fw)) + return 0; + + /* On newer platforms the HuC survives reset, so no need to re-auth */ + if (xe_huc_is_authenticated(huc, type)) { + xe_uc_fw_change_status(&huc->fw, XE_UC_FIRMWARE_RUNNING); + return 0; + } + + if (!xe_uc_fw_is_loaded(&huc->fw)) + return -ENOEXEC; + + switch (type) { + case XE_HUC_AUTH_VIA_GUC: + ret = xe_guc_auth_huc(guc, xe_bo_ggtt_addr(huc->fw.bo) + + xe_uc_fw_rsa_offset(&huc->fw)); + break; + case XE_HUC_AUTH_VIA_GSC: + ret = huc_auth_via_gsccs(huc); + break; + default: + XE_WARN_ON(type); + return -EINVAL; + } + if (ret) { + drm_err(&xe->drm, "Failed to trigger HuC auth via %s: %d\n", + huc_auth_modes[type].name, ret); + goto fail; + } + + ret = xe_mmio_wait32(gt, huc_auth_modes[type].reg, huc_auth_modes[type].val, + huc_auth_modes[type].val, 100000, NULL, false); + if (ret) { + drm_err(&xe->drm, "HuC: Firmware not verified %d\n", ret); + goto fail; + } + + xe_uc_fw_change_status(&huc->fw, XE_UC_FIRMWARE_RUNNING); + drm_dbg(&xe->drm, "HuC authenticated via %s\n", huc_auth_modes[type].name); + + return 0; + +fail: + drm_err(&xe->drm, "HuC: Auth via %s failed: %d\n", + huc_auth_modes[type].name, ret); + xe_uc_fw_change_status(&huc->fw, XE_UC_FIRMWARE_LOAD_FAIL); + + return ret; +} + +void xe_huc_sanitize(struct xe_huc *huc) +{ + if (!xe_uc_fw_is_loadable(&huc->fw)) + return; + xe_uc_fw_change_status(&huc->fw, XE_UC_FIRMWARE_LOADABLE); +} + +void xe_huc_print_info(struct xe_huc *huc, struct drm_printer *p) +{ + struct xe_gt *gt = huc_to_gt(huc); + int err; + + xe_uc_fw_print(&huc->fw, p); + + if (!xe_uc_fw_is_enabled(&huc->fw)) + return; + + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + if (err) + return; + + drm_printf(p, "\nHuC status: 0x%08x\n", + xe_mmio_read32(gt, HUC_KERNEL_LOAD_INFO)); + + xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); +} diff --git a/drivers/gpu/drm/xe/xe_huc.h b/drivers/gpu/drm/xe/xe_huc.h new file mode 100644 index 000000000000..532017230287 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_huc.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_HUC_H_ +#define _XE_HUC_H_ + +#include "xe_huc_types.h" + +struct drm_printer; + +enum xe_huc_auth_types { + XE_HUC_AUTH_VIA_GUC = 0, + XE_HUC_AUTH_VIA_GSC, + XE_HUC_AUTH_TYPES_COUNT +}; + +int xe_huc_init(struct xe_huc *huc); +int xe_huc_upload(struct xe_huc *huc); +int xe_huc_auth(struct xe_huc *huc, enum xe_huc_auth_types type); +bool xe_huc_is_authenticated(struct xe_huc *huc, enum xe_huc_auth_types type); +void 
xe_huc_sanitize(struct xe_huc *huc); +void xe_huc_print_info(struct xe_huc *huc, struct drm_printer *p); + +#endif diff --git a/drivers/gpu/drm/xe/xe_huc_debugfs.c b/drivers/gpu/drm/xe/xe_huc_debugfs.c new file mode 100644 index 000000000000..18585a7eeb9d --- /dev/null +++ b/drivers/gpu/drm/xe/xe_huc_debugfs.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_huc_debugfs.h" + +#include <drm/drm_debugfs.h> +#include <drm/drm_managed.h> + +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_huc.h" +#include "xe_macros.h" + +static struct xe_gt * +huc_to_gt(struct xe_huc *huc) +{ + return container_of(huc, struct xe_gt, uc.huc); +} + +static struct xe_device * +huc_to_xe(struct xe_huc *huc) +{ + return gt_to_xe(huc_to_gt(huc)); +} + +static struct xe_huc *node_to_huc(struct drm_info_node *node) +{ + return node->info_ent->data; +} + +static int huc_info(struct seq_file *m, void *data) +{ + struct xe_huc *huc = node_to_huc(m->private); + struct xe_device *xe = huc_to_xe(huc); + struct drm_printer p = drm_seq_file_printer(m); + + xe_device_mem_access_get(xe); + xe_huc_print_info(huc, &p); + xe_device_mem_access_put(xe); + + return 0; +} + +static const struct drm_info_list debugfs_list[] = { + {"huc_info", huc_info, 0}, +}; + +void xe_huc_debugfs_register(struct xe_huc *huc, struct dentry *parent) +{ + struct drm_minor *minor = huc_to_xe(huc)->drm.primary; + struct drm_info_list *local; + int i; + +#define DEBUGFS_SIZE (ARRAY_SIZE(debugfs_list) * sizeof(struct drm_info_list)) + local = drmm_kmalloc(&huc_to_xe(huc)->drm, DEBUGFS_SIZE, GFP_KERNEL); + if (!local) + return; + + memcpy(local, debugfs_list, DEBUGFS_SIZE); +#undef DEBUGFS_SIZE + + for (i = 0; i < ARRAY_SIZE(debugfs_list); ++i) + local[i].data = huc; + + drm_debugfs_create_files(local, + ARRAY_SIZE(debugfs_list), + parent, minor); +} diff --git a/drivers/gpu/drm/xe/xe_huc_debugfs.h b/drivers/gpu/drm/xe/xe_huc_debugfs.h new file mode 100644 index 000000000000..ec58f1818804 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_huc_debugfs.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_HUC_DEBUGFS_H_ +#define _XE_HUC_DEBUGFS_H_ + +struct dentry; +struct xe_huc; + +void xe_huc_debugfs_register(struct xe_huc *huc, struct dentry *parent); + +#endif diff --git a/drivers/gpu/drm/xe/xe_huc_types.h b/drivers/gpu/drm/xe/xe_huc_types.h new file mode 100644 index 000000000000..cfbaa5e0dfca --- /dev/null +++ b/drivers/gpu/drm/xe/xe_huc_types.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_HUC_TYPES_H_ +#define _XE_HUC_TYPES_H_ + +#include "xe_uc_fw_types.h" + +struct xe_bo; + +/** + * struct xe_huc - HuC + */ +struct xe_huc { + /** @fw: Generic uC firmware management */ + struct xe_uc_fw fw; + + /** @gsc_pkt: bo to store the packet for auth via GSC */ + struct xe_bo *gsc_pkt; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_hw_engine.c b/drivers/gpu/drm/xe/xe_hw_engine.c new file mode 100644 index 000000000000..1fa5cf5eea97 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_hw_engine.c @@ -0,0 +1,883 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include "xe_hw_engine.h" + +#include <drm/drm_managed.h> + +#include "regs/xe_engine_regs.h" +#include "regs/xe_gt_regs.h" +#include "xe_assert.h" +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_execlist.h" +#include "xe_force_wake.h" +#include "xe_gt.h" +#include 
"xe_gt_ccs_mode.h" +#include "xe_gt_topology.h" +#include "xe_hw_fence.h" +#include "xe_irq.h" +#include "xe_lrc.h" +#include "xe_macros.h" +#include "xe_mmio.h" +#include "xe_reg_sr.h" +#include "xe_rtp.h" +#include "xe_sched_job.h" +#include "xe_tuning.h" +#include "xe_uc_fw.h" +#include "xe_wa.h" + +#define MAX_MMIO_BASES 3 +struct engine_info { + const char *name; + unsigned int class : 8; + unsigned int instance : 8; + enum xe_force_wake_domains domain; + u32 mmio_base; +}; + +static const struct engine_info engine_infos[] = { + [XE_HW_ENGINE_RCS0] = { + .name = "rcs0", + .class = XE_ENGINE_CLASS_RENDER, + .instance = 0, + .domain = XE_FW_RENDER, + .mmio_base = RENDER_RING_BASE, + }, + [XE_HW_ENGINE_BCS0] = { + .name = "bcs0", + .class = XE_ENGINE_CLASS_COPY, + .instance = 0, + .domain = XE_FW_RENDER, + .mmio_base = BLT_RING_BASE, + }, + [XE_HW_ENGINE_BCS1] = { + .name = "bcs1", + .class = XE_ENGINE_CLASS_COPY, + .instance = 1, + .domain = XE_FW_RENDER, + .mmio_base = XEHPC_BCS1_RING_BASE, + }, + [XE_HW_ENGINE_BCS2] = { + .name = "bcs2", + .class = XE_ENGINE_CLASS_COPY, + .instance = 2, + .domain = XE_FW_RENDER, + .mmio_base = XEHPC_BCS2_RING_BASE, + }, + [XE_HW_ENGINE_BCS3] = { + .name = "bcs3", + .class = XE_ENGINE_CLASS_COPY, + .instance = 3, + .domain = XE_FW_RENDER, + .mmio_base = XEHPC_BCS3_RING_BASE, + }, + [XE_HW_ENGINE_BCS4] = { + .name = "bcs4", + .class = XE_ENGINE_CLASS_COPY, + .instance = 4, + .domain = XE_FW_RENDER, + .mmio_base = XEHPC_BCS4_RING_BASE, + }, + [XE_HW_ENGINE_BCS5] = { + .name = "bcs5", + .class = XE_ENGINE_CLASS_COPY, + .instance = 5, + .domain = XE_FW_RENDER, + .mmio_base = XEHPC_BCS5_RING_BASE, + }, + [XE_HW_ENGINE_BCS6] = { + .name = "bcs6", + .class = XE_ENGINE_CLASS_COPY, + .instance = 6, + .domain = XE_FW_RENDER, + .mmio_base = XEHPC_BCS6_RING_BASE, + }, + [XE_HW_ENGINE_BCS7] = { + .name = "bcs7", + .class = XE_ENGINE_CLASS_COPY, + .instance = 7, + .domain = XE_FW_RENDER, + .mmio_base = XEHPC_BCS7_RING_BASE, + }, + [XE_HW_ENGINE_BCS8] = { + .name = "bcs8", + .class = XE_ENGINE_CLASS_COPY, + .instance = 8, + .domain = XE_FW_RENDER, + .mmio_base = XEHPC_BCS8_RING_BASE, + }, + + [XE_HW_ENGINE_VCS0] = { + .name = "vcs0", + .class = XE_ENGINE_CLASS_VIDEO_DECODE, + .instance = 0, + .domain = XE_FW_MEDIA_VDBOX0, + .mmio_base = BSD_RING_BASE, + }, + [XE_HW_ENGINE_VCS1] = { + .name = "vcs1", + .class = XE_ENGINE_CLASS_VIDEO_DECODE, + .instance = 1, + .domain = XE_FW_MEDIA_VDBOX1, + .mmio_base = BSD2_RING_BASE, + }, + [XE_HW_ENGINE_VCS2] = { + .name = "vcs2", + .class = XE_ENGINE_CLASS_VIDEO_DECODE, + .instance = 2, + .domain = XE_FW_MEDIA_VDBOX2, + .mmio_base = BSD3_RING_BASE, + }, + [XE_HW_ENGINE_VCS3] = { + .name = "vcs3", + .class = XE_ENGINE_CLASS_VIDEO_DECODE, + .instance = 3, + .domain = XE_FW_MEDIA_VDBOX3, + .mmio_base = BSD4_RING_BASE, + }, + [XE_HW_ENGINE_VCS4] = { + .name = "vcs4", + .class = XE_ENGINE_CLASS_VIDEO_DECODE, + .instance = 4, + .domain = XE_FW_MEDIA_VDBOX4, + .mmio_base = XEHP_BSD5_RING_BASE, + }, + [XE_HW_ENGINE_VCS5] = { + .name = "vcs5", + .class = XE_ENGINE_CLASS_VIDEO_DECODE, + .instance = 5, + .domain = XE_FW_MEDIA_VDBOX5, + .mmio_base = XEHP_BSD6_RING_BASE, + }, + [XE_HW_ENGINE_VCS6] = { + .name = "vcs6", + .class = XE_ENGINE_CLASS_VIDEO_DECODE, + .instance = 6, + .domain = XE_FW_MEDIA_VDBOX6, + .mmio_base = XEHP_BSD7_RING_BASE, + }, + [XE_HW_ENGINE_VCS7] = { + .name = "vcs7", + .class = XE_ENGINE_CLASS_VIDEO_DECODE, + .instance = 7, + .domain = XE_FW_MEDIA_VDBOX7, + .mmio_base = XEHP_BSD8_RING_BASE, + }, + [XE_HW_ENGINE_VECS0] = 
{ + .name = "vecs0", + .class = XE_ENGINE_CLASS_VIDEO_ENHANCE, + .instance = 0, + .domain = XE_FW_MEDIA_VEBOX0, + .mmio_base = VEBOX_RING_BASE, + }, + [XE_HW_ENGINE_VECS1] = { + .name = "vecs1", + .class = XE_ENGINE_CLASS_VIDEO_ENHANCE, + .instance = 1, + .domain = XE_FW_MEDIA_VEBOX1, + .mmio_base = VEBOX2_RING_BASE, + }, + [XE_HW_ENGINE_VECS2] = { + .name = "vecs2", + .class = XE_ENGINE_CLASS_VIDEO_ENHANCE, + .instance = 2, + .domain = XE_FW_MEDIA_VEBOX2, + .mmio_base = XEHP_VEBOX3_RING_BASE, + }, + [XE_HW_ENGINE_VECS3] = { + .name = "vecs3", + .class = XE_ENGINE_CLASS_VIDEO_ENHANCE, + .instance = 3, + .domain = XE_FW_MEDIA_VEBOX3, + .mmio_base = XEHP_VEBOX4_RING_BASE, + }, + [XE_HW_ENGINE_CCS0] = { + .name = "ccs0", + .class = XE_ENGINE_CLASS_COMPUTE, + .instance = 0, + .domain = XE_FW_RENDER, + .mmio_base = COMPUTE0_RING_BASE, + }, + [XE_HW_ENGINE_CCS1] = { + .name = "ccs1", + .class = XE_ENGINE_CLASS_COMPUTE, + .instance = 1, + .domain = XE_FW_RENDER, + .mmio_base = COMPUTE1_RING_BASE, + }, + [XE_HW_ENGINE_CCS2] = { + .name = "ccs2", + .class = XE_ENGINE_CLASS_COMPUTE, + .instance = 2, + .domain = XE_FW_RENDER, + .mmio_base = COMPUTE2_RING_BASE, + }, + [XE_HW_ENGINE_CCS3] = { + .name = "ccs3", + .class = XE_ENGINE_CLASS_COMPUTE, + .instance = 3, + .domain = XE_FW_RENDER, + .mmio_base = COMPUTE3_RING_BASE, + }, + [XE_HW_ENGINE_GSCCS0] = { + .name = "gsccs0", + .class = XE_ENGINE_CLASS_OTHER, + .instance = OTHER_GSC_INSTANCE, + .domain = XE_FW_GSC, + .mmio_base = GSCCS_RING_BASE, + }, +}; + +static void hw_engine_fini(struct drm_device *drm, void *arg) +{ + struct xe_hw_engine *hwe = arg; + + if (hwe->exl_port) + xe_execlist_port_destroy(hwe->exl_port); + xe_lrc_finish(&hwe->kernel_lrc); + + hwe->gt = NULL; +} + +static void hw_engine_mmio_write32(struct xe_hw_engine *hwe, struct xe_reg reg, + u32 val) +{ + xe_gt_assert(hwe->gt, !(reg.addr & hwe->mmio_base)); + xe_force_wake_assert_held(gt_to_fw(hwe->gt), hwe->domain); + + reg.addr += hwe->mmio_base; + + xe_mmio_write32(hwe->gt, reg, val); +} + +static u32 hw_engine_mmio_read32(struct xe_hw_engine *hwe, struct xe_reg reg) +{ + xe_gt_assert(hwe->gt, !(reg.addr & hwe->mmio_base)); + xe_force_wake_assert_held(gt_to_fw(hwe->gt), hwe->domain); + + reg.addr += hwe->mmio_base; + + return xe_mmio_read32(hwe->gt, reg); +} + +void xe_hw_engine_enable_ring(struct xe_hw_engine *hwe) +{ + u32 ccs_mask = + xe_hw_engine_mask_per_class(hwe->gt, XE_ENGINE_CLASS_COMPUTE); + + if (hwe->class == XE_ENGINE_CLASS_COMPUTE && ccs_mask) + xe_mmio_write32(hwe->gt, RCU_MODE, + _MASKED_BIT_ENABLE(RCU_MODE_CCS_ENABLE)); + + hw_engine_mmio_write32(hwe, RING_HWSTAM(0), ~0x0); + hw_engine_mmio_write32(hwe, RING_HWS_PGA(0), + xe_bo_ggtt_addr(hwe->hwsp)); + hw_engine_mmio_write32(hwe, RING_MODE(0), + _MASKED_BIT_ENABLE(GFX_DISABLE_LEGACY_MODE)); + hw_engine_mmio_write32(hwe, RING_MI_MODE(0), + _MASKED_BIT_DISABLE(STOP_RING)); + hw_engine_mmio_read32(hwe, RING_MI_MODE(0)); +} + +static bool xe_hw_engine_match_fixed_cslice_mode(const struct xe_gt *gt, + const struct xe_hw_engine *hwe) +{ + return xe_gt_ccs_mode_enabled(gt) && + xe_rtp_match_first_render_or_compute(gt, hwe); +} + +void +xe_hw_engine_setup_default_lrc_state(struct xe_hw_engine *hwe) +{ + struct xe_gt *gt = hwe->gt; + const u8 mocs_write_idx = gt->mocs.uc_index; + const u8 mocs_read_idx = gt->mocs.uc_index; + u32 blit_cctl_val = REG_FIELD_PREP(BLIT_CCTL_DST_MOCS_MASK, mocs_write_idx) | + REG_FIELD_PREP(BLIT_CCTL_SRC_MOCS_MASK, mocs_read_idx); + struct xe_rtp_process_ctx ctx = 
XE_RTP_PROCESS_CTX_INITIALIZER(hwe); + const struct xe_rtp_entry_sr lrc_was[] = { + /* + * Some blitter commands do not have a field for MOCS, those + * commands will use MOCS index pointed by BLIT_CCTL. + * BLIT_CCTL registers are needed to be programmed to un-cached. + */ + { XE_RTP_NAME("BLIT_CCTL_default_MOCS"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, XE_RTP_END_VERSION_UNDEFINED), + ENGINE_CLASS(COPY)), + XE_RTP_ACTIONS(FIELD_SET(BLIT_CCTL(0), + BLIT_CCTL_DST_MOCS_MASK | + BLIT_CCTL_SRC_MOCS_MASK, + blit_cctl_val, + XE_RTP_ACTION_FLAG(ENGINE_BASE))) + }, + /* Use Fixed slice CCS mode */ + { XE_RTP_NAME("RCU_MODE_FIXED_SLICE_CCS_MODE"), + XE_RTP_RULES(FUNC(xe_hw_engine_match_fixed_cslice_mode)), + XE_RTP_ACTIONS(FIELD_SET(RCU_MODE, RCU_MODE_FIXED_SLICE_CCS_MODE, + RCU_MODE_FIXED_SLICE_CCS_MODE)) + }, + {} + }; + + xe_rtp_process_to_sr(&ctx, lrc_was, &hwe->reg_lrc); +} + +static void +hw_engine_setup_default_state(struct xe_hw_engine *hwe) +{ + struct xe_gt *gt = hwe->gt; + struct xe_device *xe = gt_to_xe(gt); + /* + * RING_CMD_CCTL specifies the default MOCS entry that will be + * used by the command streamer when executing commands that + * don't have a way to explicitly specify a MOCS setting. + * The default should usually reference whichever MOCS entry + * corresponds to uncached behavior, although use of a WB cached + * entry is recommended by the spec in certain circumstances on + * specific platforms. + * Bspec: 72161 + */ + const u8 mocs_write_idx = gt->mocs.uc_index; + const u8 mocs_read_idx = hwe->class == XE_ENGINE_CLASS_COMPUTE && + (GRAPHICS_VER(xe) >= 20 || xe->info.platform == XE_PVC) ? + gt->mocs.wb_index : gt->mocs.uc_index; + u32 ring_cmd_cctl_val = REG_FIELD_PREP(CMD_CCTL_WRITE_OVERRIDE_MASK, mocs_write_idx) | + REG_FIELD_PREP(CMD_CCTL_READ_OVERRIDE_MASK, mocs_read_idx); + struct xe_rtp_process_ctx ctx = XE_RTP_PROCESS_CTX_INITIALIZER(hwe); + const struct xe_rtp_entry_sr engine_entries[] = { + { XE_RTP_NAME("RING_CMD_CCTL_default_MOCS"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, XE_RTP_END_VERSION_UNDEFINED)), + XE_RTP_ACTIONS(FIELD_SET(RING_CMD_CCTL(0), + CMD_CCTL_WRITE_OVERRIDE_MASK | + CMD_CCTL_READ_OVERRIDE_MASK, + ring_cmd_cctl_val, + XE_RTP_ACTION_FLAG(ENGINE_BASE))) + }, + /* + * To allow the GSC engine to go idle on MTL we need to enable + * idle messaging and set the hysteresis value (we use 0xA=5us + * as recommended in spec). On platforms after MTL this is + * enabled by default. 
+ */ + { XE_RTP_NAME("MTL GSCCS IDLE MSG enable"), + XE_RTP_RULES(MEDIA_VERSION(1300), ENGINE_CLASS(OTHER)), + XE_RTP_ACTIONS(CLR(RING_PSMI_CTL(0), + IDLE_MSG_DISABLE, + XE_RTP_ACTION_FLAG(ENGINE_BASE)), + FIELD_SET(RING_PWRCTX_MAXCNT(0), + IDLE_WAIT_TIME, + 0xA, + XE_RTP_ACTION_FLAG(ENGINE_BASE))) + }, + {} + }; + + xe_rtp_process_to_sr(&ctx, engine_entries, &hwe->reg_sr); +} + +static void hw_engine_init_early(struct xe_gt *gt, struct xe_hw_engine *hwe, + enum xe_hw_engine_id id) +{ + const struct engine_info *info; + + if (WARN_ON(id >= ARRAY_SIZE(engine_infos) || !engine_infos[id].name)) + return; + + if (!(gt->info.engine_mask & BIT(id))) + return; + + info = &engine_infos[id]; + + xe_gt_assert(gt, !hwe->gt); + + hwe->gt = gt; + hwe->class = info->class; + hwe->instance = info->instance; + hwe->mmio_base = info->mmio_base; + hwe->domain = info->domain; + hwe->name = info->name; + hwe->fence_irq = >->fence_irq[info->class]; + hwe->engine_id = id; + + hwe->eclass = >->eclass[hwe->class]; + if (!hwe->eclass->sched_props.job_timeout_ms) { + hwe->eclass->sched_props.job_timeout_ms = 5 * 1000; + hwe->eclass->sched_props.job_timeout_min = XE_HW_ENGINE_JOB_TIMEOUT_MIN; + hwe->eclass->sched_props.job_timeout_max = XE_HW_ENGINE_JOB_TIMEOUT_MAX; + hwe->eclass->sched_props.timeslice_us = 1 * 1000; + hwe->eclass->sched_props.timeslice_min = XE_HW_ENGINE_TIMESLICE_MIN; + hwe->eclass->sched_props.timeslice_max = XE_HW_ENGINE_TIMESLICE_MAX; + hwe->eclass->sched_props.preempt_timeout_us = XE_HW_ENGINE_PREEMPT_TIMEOUT; + hwe->eclass->sched_props.preempt_timeout_min = XE_HW_ENGINE_PREEMPT_TIMEOUT_MIN; + hwe->eclass->sched_props.preempt_timeout_max = XE_HW_ENGINE_PREEMPT_TIMEOUT_MAX; + /* Record default props */ + hwe->eclass->defaults = hwe->eclass->sched_props; + } + + xe_reg_sr_init(&hwe->reg_sr, hwe->name, gt_to_xe(gt)); + xe_tuning_process_engine(hwe); + xe_wa_process_engine(hwe); + hw_engine_setup_default_state(hwe); + + xe_reg_sr_init(&hwe->reg_whitelist, hwe->name, gt_to_xe(gt)); + xe_reg_whitelist_process_engine(hwe); +} + +static int hw_engine_init(struct xe_gt *gt, struct xe_hw_engine *hwe, + enum xe_hw_engine_id id) +{ + struct xe_device *xe = gt_to_xe(gt); + struct xe_tile *tile = gt_to_tile(gt); + int err; + + xe_gt_assert(gt, id < ARRAY_SIZE(engine_infos) && engine_infos[id].name); + xe_gt_assert(gt, gt->info.engine_mask & BIT(id)); + + xe_reg_sr_apply_mmio(&hwe->reg_sr, gt); + xe_reg_sr_apply_whitelist(hwe); + + hwe->hwsp = xe_managed_bo_create_pin_map(xe, tile, SZ_4K, + XE_BO_CREATE_VRAM_IF_DGFX(tile) | + XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(hwe->hwsp)) { + err = PTR_ERR(hwe->hwsp); + goto err_name; + } + + err = xe_lrc_init(&hwe->kernel_lrc, hwe, NULL, NULL, SZ_16K); + if (err) + goto err_hwsp; + + if (!xe_device_uc_enabled(xe)) { + hwe->exl_port = xe_execlist_port_create(xe, hwe); + if (IS_ERR(hwe->exl_port)) { + err = PTR_ERR(hwe->exl_port); + goto err_kernel_lrc; + } + } + + if (xe_device_uc_enabled(xe)) + xe_hw_engine_enable_ring(hwe); + + /* We reserve the highest BCS instance for USM */ + if (xe->info.has_usm && hwe->class == XE_ENGINE_CLASS_COPY) + gt->usm.reserved_bcs_instance = hwe->instance; + + err = drmm_add_action_or_reset(&xe->drm, hw_engine_fini, hwe); + if (err) + return err; + + return 0; + +err_kernel_lrc: + xe_lrc_finish(&hwe->kernel_lrc); +err_hwsp: + xe_bo_unpin_map_no_vm(hwe->hwsp); +err_name: + hwe->name = NULL; + + return err; +} + +static void hw_engine_setup_logical_mapping(struct xe_gt *gt) +{ + int class; + + /* FIXME: Doing a simple logical mapping that 
works for most hardware */ + for (class = 0; class < XE_ENGINE_CLASS_MAX; ++class) { + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + int logical_instance = 0; + + for_each_hw_engine(hwe, gt, id) + if (hwe->class == class) + hwe->logical_instance = logical_instance++; + } +} + +static void read_media_fuses(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + u32 media_fuse; + u16 vdbox_mask; + u16 vebox_mask; + int i, j; + + xe_force_wake_assert_held(gt_to_fw(gt), XE_FW_GT); + + media_fuse = xe_mmio_read32(gt, GT_VEBOX_VDBOX_DISABLE); + + /* + * Pre-Xe_HP platforms had register bits representing absent engines, + * whereas Xe_HP and beyond have bits representing present engines. + * Invert the polarity on old platforms so that we can use common + * handling below. + */ + if (GRAPHICS_VERx100(xe) < 1250) + media_fuse = ~media_fuse; + + vdbox_mask = REG_FIELD_GET(GT_VDBOX_DISABLE_MASK, media_fuse); + vebox_mask = REG_FIELD_GET(GT_VEBOX_DISABLE_MASK, media_fuse); + + for (i = XE_HW_ENGINE_VCS0, j = 0; i <= XE_HW_ENGINE_VCS7; ++i, ++j) { + if (!(gt->info.engine_mask & BIT(i))) + continue; + + if (!(BIT(j) & vdbox_mask)) { + gt->info.engine_mask &= ~BIT(i); + drm_info(&xe->drm, "vcs%u fused off\n", j); + } + } + + for (i = XE_HW_ENGINE_VECS0, j = 0; i <= XE_HW_ENGINE_VECS3; ++i, ++j) { + if (!(gt->info.engine_mask & BIT(i))) + continue; + + if (!(BIT(j) & vebox_mask)) { + gt->info.engine_mask &= ~BIT(i); + drm_info(&xe->drm, "vecs%u fused off\n", j); + } + } +} + +static void read_copy_fuses(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + u32 bcs_mask; + + if (GRAPHICS_VERx100(xe) < 1260 || GRAPHICS_VERx100(xe) >= 1270) + return; + + xe_force_wake_assert_held(gt_to_fw(gt), XE_FW_GT); + + bcs_mask = xe_mmio_read32(gt, MIRROR_FUSE3); + bcs_mask = REG_FIELD_GET(MEML3_EN_MASK, bcs_mask); + + /* BCS0 is always present; only BCS1-BCS8 may be fused off */ + for (int i = XE_HW_ENGINE_BCS1, j = 0; i <= XE_HW_ENGINE_BCS8; ++i, ++j) { + if (!(gt->info.engine_mask & BIT(i))) + continue; + + if (!(BIT(j / 2) & bcs_mask)) { + gt->info.engine_mask &= ~BIT(i); + drm_info(&xe->drm, "bcs%u fused off\n", j); + } + } +} + +static void read_compute_fuses_from_dss(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + + /* + * CCS fusing based on DSS masks only applies to platforms that can + * have more than one CCS. + */ + if (hweight64(gt->info.engine_mask & + GENMASK_ULL(XE_HW_ENGINE_CCS3, XE_HW_ENGINE_CCS0)) <= 1) + return; + + /* + * CCS availability on Xe_HP is inferred from the presence of DSS in + * each quadrant. 
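+ * For example, if quadrant 1 reports no enabled DSS, the loop below clears
+ * the XE_HW_ENGINE_CCS1 bit (when present) from the engine mask and logs
+ * "ccs1 fused off".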
+ */ + for (int i = XE_HW_ENGINE_CCS0, j = 0; i <= XE_HW_ENGINE_CCS3; ++i, ++j) { + if (!(gt->info.engine_mask & BIT(i))) + continue; + + if (!xe_gt_topology_has_dss_in_quadrant(gt, j)) { + gt->info.engine_mask &= ~BIT(i); + drm_info(&xe->drm, "ccs%u fused off\n", j); + } + } +} + +static void read_compute_fuses_from_reg(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + u32 ccs_mask; + + ccs_mask = xe_mmio_read32(gt, XEHP_FUSE4); + ccs_mask = REG_FIELD_GET(CCS_EN_MASK, ccs_mask); + + for (int i = XE_HW_ENGINE_CCS0, j = 0; i <= XE_HW_ENGINE_CCS3; ++i, ++j) { + if (!(gt->info.engine_mask & BIT(i))) + continue; + + if ((ccs_mask & BIT(j)) == 0) { + gt->info.engine_mask &= ~BIT(i); + drm_info(&xe->drm, "ccs%u fused off\n", j); + } + } +} + +static void read_compute_fuses(struct xe_gt *gt) +{ + if (GRAPHICS_VER(gt_to_xe(gt)) >= 20) + read_compute_fuses_from_reg(gt); + else + read_compute_fuses_from_dss(gt); +} + +static void check_gsc_availability(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + + if (!(gt->info.engine_mask & BIT(XE_HW_ENGINE_GSCCS0))) + return; + + /* + * The GSCCS is only used to communicate with the GSC FW, so if we don't + * have the FW there is nothing we need the engine for and can therefore + * skip its initialization. + */ + if (!xe_uc_fw_is_available(>->uc.gsc.fw)) { + gt->info.engine_mask &= ~BIT(XE_HW_ENGINE_GSCCS0); + drm_info(&xe->drm, "gsccs disabled due to lack of FW\n"); + } +} + +int xe_hw_engines_init_early(struct xe_gt *gt) +{ + int i; + + read_media_fuses(gt); + read_copy_fuses(gt); + read_compute_fuses(gt); + check_gsc_availability(gt); + + BUILD_BUG_ON(XE_HW_ENGINE_PREEMPT_TIMEOUT < XE_HW_ENGINE_PREEMPT_TIMEOUT_MIN); + BUILD_BUG_ON(XE_HW_ENGINE_PREEMPT_TIMEOUT > XE_HW_ENGINE_PREEMPT_TIMEOUT_MAX); + + for (i = 0; i < ARRAY_SIZE(gt->hw_engines); i++) + hw_engine_init_early(gt, >->hw_engines[i], i); + + return 0; +} + +int xe_hw_engines_init(struct xe_gt *gt) +{ + int err; + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + + for_each_hw_engine(hwe, gt, id) { + err = hw_engine_init(gt, hwe, id); + if (err) + return err; + } + + hw_engine_setup_logical_mapping(gt); + + return 0; +} + +void xe_hw_engine_handle_irq(struct xe_hw_engine *hwe, u16 intr_vec) +{ + wake_up_all(>_to_xe(hwe->gt)->ufence_wq); + + if (hwe->irq_handler) + hwe->irq_handler(hwe, intr_vec); + + if (intr_vec & GT_RENDER_USER_INTERRUPT) + xe_hw_fence_irq_run(hwe->fence_irq); +} + +/** + * xe_hw_engine_snapshot_capture - Take a quick snapshot of the HW Engine. + * @hwe: Xe HW Engine. + * + * This can be printed out in a later stage like during dev_coredump + * analysis. + * + * Returns: a Xe HW Engine snapshot object that must be freed by the + * caller, using `xe_hw_engine_snapshot_free`. 
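+ *
+ * A minimal usage sketch, mirroring what xe_hw_engine_print() does below
+ * (the drm_info_printer destination is only an assumption for illustration):
+ *
+ *	struct drm_printer p = drm_info_printer(gt_to_xe(hwe->gt)->drm.dev);
+ *	struct xe_hw_engine_snapshot *s = xe_hw_engine_snapshot_capture(hwe);
+ *
+ *	xe_hw_engine_snapshot_print(s, &p);
+ *	xe_hw_engine_snapshot_free(s);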
+ */ +struct xe_hw_engine_snapshot * +xe_hw_engine_snapshot_capture(struct xe_hw_engine *hwe) +{ + struct xe_hw_engine_snapshot *snapshot; + int len; + + if (!xe_hw_engine_is_valid(hwe)) + return NULL; + + snapshot = kzalloc(sizeof(*snapshot), GFP_ATOMIC); + + if (!snapshot) + return NULL; + + len = strlen(hwe->name) + 1; + snapshot->name = kzalloc(len, GFP_ATOMIC); + if (snapshot->name) + strscpy(snapshot->name, hwe->name, len); + + snapshot->class = hwe->class; + snapshot->logical_instance = hwe->logical_instance; + snapshot->forcewake.domain = hwe->domain; + snapshot->forcewake.ref = xe_force_wake_ref(gt_to_fw(hwe->gt), + hwe->domain); + snapshot->mmio_base = hwe->mmio_base; + + snapshot->reg.ring_hwstam = hw_engine_mmio_read32(hwe, RING_HWSTAM(0)); + snapshot->reg.ring_hws_pga = hw_engine_mmio_read32(hwe, + RING_HWS_PGA(0)); + snapshot->reg.ring_execlist_status_lo = + hw_engine_mmio_read32(hwe, RING_EXECLIST_STATUS_LO(0)); + snapshot->reg.ring_execlist_status_hi = + hw_engine_mmio_read32(hwe, RING_EXECLIST_STATUS_HI(0)); + snapshot->reg.ring_execlist_sq_contents_lo = + hw_engine_mmio_read32(hwe, + RING_EXECLIST_SQ_CONTENTS_LO(0)); + snapshot->reg.ring_execlist_sq_contents_hi = + hw_engine_mmio_read32(hwe, + RING_EXECLIST_SQ_CONTENTS_HI(0)); + snapshot->reg.ring_start = hw_engine_mmio_read32(hwe, RING_START(0)); + snapshot->reg.ring_head = + hw_engine_mmio_read32(hwe, RING_HEAD(0)) & HEAD_ADDR; + snapshot->reg.ring_tail = + hw_engine_mmio_read32(hwe, RING_TAIL(0)) & TAIL_ADDR; + snapshot->reg.ring_ctl = hw_engine_mmio_read32(hwe, RING_CTL(0)); + snapshot->reg.ring_mi_mode = + hw_engine_mmio_read32(hwe, RING_MI_MODE(0)); + snapshot->reg.ring_mode = hw_engine_mmio_read32(hwe, RING_MODE(0)); + snapshot->reg.ring_imr = hw_engine_mmio_read32(hwe, RING_IMR(0)); + snapshot->reg.ring_esr = hw_engine_mmio_read32(hwe, RING_ESR(0)); + snapshot->reg.ring_emr = hw_engine_mmio_read32(hwe, RING_EMR(0)); + snapshot->reg.ring_eir = hw_engine_mmio_read32(hwe, RING_EIR(0)); + snapshot->reg.ring_acthd_udw = + hw_engine_mmio_read32(hwe, RING_ACTHD_UDW(0)); + snapshot->reg.ring_acthd = hw_engine_mmio_read32(hwe, RING_ACTHD(0)); + snapshot->reg.ring_bbaddr_udw = + hw_engine_mmio_read32(hwe, RING_BBADDR_UDW(0)); + snapshot->reg.ring_bbaddr = hw_engine_mmio_read32(hwe, RING_BBADDR(0)); + snapshot->reg.ring_dma_fadd_udw = + hw_engine_mmio_read32(hwe, RING_DMA_FADD_UDW(0)); + snapshot->reg.ring_dma_fadd = + hw_engine_mmio_read32(hwe, RING_DMA_FADD(0)); + snapshot->reg.ipehr = hw_engine_mmio_read32(hwe, RING_IPEHR(0)); + + if (snapshot->class == XE_ENGINE_CLASS_COMPUTE) + snapshot->reg.rcu_mode = xe_mmio_read32(hwe->gt, RCU_MODE); + + return snapshot; +} + +/** + * xe_hw_engine_snapshot_print - Print out a given Xe HW Engine snapshot. + * @snapshot: Xe HW Engine snapshot object. + * @p: drm_printer where it will be printed out. + * + * This function prints out a given Xe HW Engine snapshot object. + */ +void xe_hw_engine_snapshot_print(struct xe_hw_engine_snapshot *snapshot, + struct drm_printer *p) +{ + if (!snapshot) + return; + + drm_printf(p, "%s (physical), logical instance=%d\n", + snapshot->name ? 
snapshot->name : "", + snapshot->logical_instance); + drm_printf(p, "\tForcewake: domain 0x%x, ref %d\n", + snapshot->forcewake.domain, snapshot->forcewake.ref); + drm_printf(p, "\tHWSTAM: 0x%08x\n", snapshot->reg.ring_hwstam); + drm_printf(p, "\tRING_HWS_PGA: 0x%08x\n", snapshot->reg.ring_hws_pga); + drm_printf(p, "\tRING_EXECLIST_STATUS_LO: 0x%08x\n", + snapshot->reg.ring_execlist_status_lo); + drm_printf(p, "\tRING_EXECLIST_STATUS_HI: 0x%08x\n", + snapshot->reg.ring_execlist_status_hi); + drm_printf(p, "\tRING_EXECLIST_SQ_CONTENTS_LO: 0x%08x\n", + snapshot->reg.ring_execlist_sq_contents_lo); + drm_printf(p, "\tRING_EXECLIST_SQ_CONTENTS_HI: 0x%08x\n", + snapshot->reg.ring_execlist_sq_contents_hi); + drm_printf(p, "\tRING_START: 0x%08x\n", snapshot->reg.ring_start); + drm_printf(p, "\tRING_HEAD: 0x%08x\n", snapshot->reg.ring_head); + drm_printf(p, "\tRING_TAIL: 0x%08x\n", snapshot->reg.ring_tail); + drm_printf(p, "\tRING_CTL: 0x%08x\n", snapshot->reg.ring_ctl); + drm_printf(p, "\tRING_MI_MODE: 0x%08x\n", snapshot->reg.ring_mi_mode); + drm_printf(p, "\tRING_MODE: 0x%08x\n", + snapshot->reg.ring_mode); + drm_printf(p, "\tRING_IMR: 0x%08x\n", snapshot->reg.ring_imr); + drm_printf(p, "\tRING_ESR: 0x%08x\n", snapshot->reg.ring_esr); + drm_printf(p, "\tRING_EMR: 0x%08x\n", snapshot->reg.ring_emr); + drm_printf(p, "\tRING_EIR: 0x%08x\n", snapshot->reg.ring_eir); + drm_printf(p, "\tACTHD: 0x%08x_%08x\n", snapshot->reg.ring_acthd_udw, + snapshot->reg.ring_acthd); + drm_printf(p, "\tBBADDR: 0x%08x_%08x\n", snapshot->reg.ring_bbaddr_udw, + snapshot->reg.ring_bbaddr); + drm_printf(p, "\tDMA_FADDR: 0x%08x_%08x\n", + snapshot->reg.ring_dma_fadd_udw, + snapshot->reg.ring_dma_fadd); + drm_printf(p, "\tIPEHR: 0x%08x\n\n", snapshot->reg.ipehr); + if (snapshot->class == XE_ENGINE_CLASS_COMPUTE) + drm_printf(p, "\tRCU_MODE: 0x%08x\n", + snapshot->reg.rcu_mode); +} + +/** + * xe_hw_engine_snapshot_free - Free all allocated objects for a given snapshot. + * @snapshot: Xe HW Engine snapshot object. + * + * This function free all the memory that needed to be allocated at capture + * time. + */ +void xe_hw_engine_snapshot_free(struct xe_hw_engine_snapshot *snapshot) +{ + if (!snapshot) + return; + + kfree(snapshot->name); + kfree(snapshot); +} + +/** + * xe_hw_engine_print - Xe HW Engine Print. + * @hwe: Hardware Engine. + * @p: drm_printer. + * + * This function quickly capture a snapshot and immediately print it out. 
+ */ +void xe_hw_engine_print(struct xe_hw_engine *hwe, struct drm_printer *p) +{ + struct xe_hw_engine_snapshot *snapshot; + + snapshot = xe_hw_engine_snapshot_capture(hwe); + xe_hw_engine_snapshot_print(snapshot, p); + xe_hw_engine_snapshot_free(snapshot); +} + +u32 xe_hw_engine_mask_per_class(struct xe_gt *gt, + enum xe_engine_class engine_class) +{ + u32 mask = 0; + enum xe_hw_engine_id id; + + for (id = 0; id < XE_NUM_HW_ENGINES; ++id) { + if (engine_infos[id].class == engine_class && + gt->info.engine_mask & BIT(id)) + mask |= BIT(engine_infos[id].instance); + } + return mask; +} + +bool xe_hw_engine_is_reserved(struct xe_hw_engine *hwe) +{ + struct xe_gt *gt = hwe->gt; + struct xe_device *xe = gt_to_xe(gt); + + if (hwe->class == XE_ENGINE_CLASS_OTHER) + return true; + + /* Check for engines disabled by ccs_mode setting */ + if (xe_gt_ccs_mode_enabled(gt) && + hwe->class == XE_ENGINE_CLASS_COMPUTE && + hwe->logical_instance >= gt->ccs_mode) + return true; + + return xe->info.has_usm && hwe->class == XE_ENGINE_CLASS_COPY && + hwe->instance == gt->usm.reserved_bcs_instance; +} diff --git a/drivers/gpu/drm/xe/xe_hw_engine.h b/drivers/gpu/drm/xe/xe_hw_engine.h new file mode 100644 index 000000000000..71968ee2f600 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_hw_engine.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef _XE_HW_ENGINE_H_ +#define _XE_HW_ENGINE_H_ + +#include "xe_hw_engine_types.h" + +struct drm_printer; + +#ifdef CONFIG_DRM_XE_JOB_TIMEOUT_MIN +#define XE_HW_ENGINE_JOB_TIMEOUT_MIN CONFIG_DRM_XE_JOB_TIMEOUT_MIN +#else +#define XE_HW_ENGINE_JOB_TIMEOUT_MIN 1 +#endif +#ifdef CONFIG_DRM_XE_JOB_TIMEOUT_MAX +#define XE_HW_ENGINE_JOB_TIMEOUT_MAX CONFIG_DRM_XE_JOB_TIMEOUT_MAX +#else +#define XE_HW_ENGINE_JOB_TIMEOUT_MAX (10 * 1000) +#endif +#ifdef CONFIG_DRM_XE_TIMESLICE_MIN +#define XE_HW_ENGINE_TIMESLICE_MIN CONFIG_DRM_XE_TIMESLICE_MIN +#else +#define XE_HW_ENGINE_TIMESLICE_MIN 1 +#endif +#ifdef CONFIG_DRM_XE_TIMESLICE_MAX +#define XE_HW_ENGINE_TIMESLICE_MAX CONFIG_DRM_XE_TIMESLICE_MAX +#else +#define XE_HW_ENGINE_TIMESLICE_MAX (10 * 1000 * 1000) +#endif +#ifdef CONFIG_DRM_XE_PREEMPT_TIMEOUT +#define XE_HW_ENGINE_PREEMPT_TIMEOUT CONFIG_DRM_XE_PREEMPT_TIMEOUT +#else +#define XE_HW_ENGINE_PREEMPT_TIMEOUT (640 * 1000) +#endif +#ifdef CONFIG_DRM_XE_PREEMPT_TIMEOUT_MIN +#define XE_HW_ENGINE_PREEMPT_TIMEOUT_MIN CONFIG_DRM_XE_PREEMPT_TIMEOUT_MIN +#else +#define XE_HW_ENGINE_PREEMPT_TIMEOUT_MIN 1 +#endif +#ifdef CONFIG_DRM_XE_PREEMPT_TIMEOUT_MAX +#define XE_HW_ENGINE_PREEMPT_TIMEOUT_MAX CONFIG_DRM_XE_PREEMPT_TIMEOUT_MAX +#else +#define XE_HW_ENGINE_PREEMPT_TIMEOUT_MAX (10 * 1000 * 1000) +#endif + +int xe_hw_engines_init_early(struct xe_gt *gt); +int xe_hw_engines_init(struct xe_gt *gt); +void xe_hw_engine_handle_irq(struct xe_hw_engine *hwe, u16 intr_vec); +void xe_hw_engine_enable_ring(struct xe_hw_engine *hwe); +u32 xe_hw_engine_mask_per_class(struct xe_gt *gt, + enum xe_engine_class engine_class); + +struct xe_hw_engine_snapshot * +xe_hw_engine_snapshot_capture(struct xe_hw_engine *hwe); +void xe_hw_engine_snapshot_free(struct xe_hw_engine_snapshot *snapshot); +void xe_hw_engine_snapshot_print(struct xe_hw_engine_snapshot *snapshot, + struct drm_printer *p); +void xe_hw_engine_print(struct xe_hw_engine *hwe, struct drm_printer *p); +void xe_hw_engine_setup_default_lrc_state(struct xe_hw_engine *hwe); + +bool xe_hw_engine_is_reserved(struct xe_hw_engine *hwe); +static inline bool xe_hw_engine_is_valid(struct xe_hw_engine *hwe) 
+{ + return hwe->name; +} + +#endif diff --git a/drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.c b/drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.c new file mode 100644 index 000000000000..e49bc14f0ecf --- /dev/null +++ b/drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.c @@ -0,0 +1,675 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include <drm/drm_managed.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> + +#include "xe_gt.h" +#include "xe_hw_engine_class_sysfs.h" + +#define MAX_ENGINE_CLASS_NAME_LEN 16 +static int xe_add_hw_engine_class_defaults(struct xe_device *xe, + struct kobject *parent); + +/** + * xe_hw_engine_timeout_in_range - Helper to check if timeout is in range + * @timeout: timeout to validate + * @min: min value of valid range + * @max: max value of valid range + * + * This helper helps to validate if timeout is in min-max range of HW engine + * scheduler. + * + * Returns: Returns false value for failure and true for success. + */ +bool xe_hw_engine_timeout_in_range(u64 timeout, u64 min, u64 max) +{ + return timeout >= min && timeout <= max; +} + +static void kobj_xe_hw_engine_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static const struct kobj_type kobj_xe_hw_engine_type = { + .release = kobj_xe_hw_engine_release, + .sysfs_ops = &kobj_sysfs_ops +}; + +static ssize_t job_timeout_max_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj); + u32 timeout; + int err; + + err = kstrtou32(buf, 0, &timeout); + if (err) + return err; + + if (timeout < eclass->sched_props.job_timeout_min) + return -EINVAL; + + if (!xe_hw_engine_timeout_in_range(timeout, + XE_HW_ENGINE_JOB_TIMEOUT_MIN, + XE_HW_ENGINE_JOB_TIMEOUT_MAX)) + return -EINVAL; + + WRITE_ONCE(eclass->sched_props.job_timeout_max, timeout); + + return count; +} + +static ssize_t job_timeout_max_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj); + + return sprintf(buf, "%u\n", eclass->sched_props.job_timeout_max); +} + +static struct kobj_attribute job_timeout_max_attr = +__ATTR(job_timeout_max, 0644, job_timeout_max_show, job_timeout_max_store); + +static ssize_t job_timeout_min_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj); + u32 timeout; + int err; + + err = kstrtou32(buf, 0, &timeout); + if (err) + return err; + + if (timeout > eclass->sched_props.job_timeout_max) + return -EINVAL; + + if (!xe_hw_engine_timeout_in_range(timeout, + XE_HW_ENGINE_JOB_TIMEOUT_MIN, + XE_HW_ENGINE_JOB_TIMEOUT_MAX)) + return -EINVAL; + + WRITE_ONCE(eclass->sched_props.job_timeout_min, timeout); + + return count; +} + +static ssize_t job_timeout_min_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj); + + return sprintf(buf, "%u\n", eclass->sched_props.job_timeout_min); +} + +static struct kobj_attribute job_timeout_min_attr = +__ATTR(job_timeout_min, 0644, job_timeout_min_show, job_timeout_min_store); + +static ssize_t job_timeout_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj); + u32 min = eclass->sched_props.job_timeout_min; + u32 max = eclass->sched_props.job_timeout_max; + u32 timeout; + int err; + + err = 
kstrtou32(buf, 0, &timeout); + if (err) + return err; + + if (!xe_hw_engine_timeout_in_range(timeout, min, max)) + return -EINVAL; + + WRITE_ONCE(eclass->sched_props.job_timeout_ms, timeout); + + return count; +} + +static ssize_t job_timeout_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj); + + return sprintf(buf, "%u\n", eclass->sched_props.job_timeout_ms); +} + +static struct kobj_attribute job_timeout_attr = +__ATTR(job_timeout_ms, 0644, job_timeout_show, job_timeout_store); + +static ssize_t job_timeout_default(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj->parent); + + return sprintf(buf, "%u\n", eclass->defaults.job_timeout_ms); +} + +static struct kobj_attribute job_timeout_def = +__ATTR(job_timeout_ms, 0444, job_timeout_default, NULL); + +static ssize_t job_timeout_min_default(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj->parent); + + return sprintf(buf, "%u\n", eclass->defaults.job_timeout_min); +} + +static struct kobj_attribute job_timeout_min_def = +__ATTR(job_timeout_min, 0444, job_timeout_min_default, NULL); + +static ssize_t job_timeout_max_default(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj->parent); + + return sprintf(buf, "%u\n", eclass->defaults.job_timeout_max); +} + +static struct kobj_attribute job_timeout_max_def = +__ATTR(job_timeout_max, 0444, job_timeout_max_default, NULL); + +static ssize_t timeslice_duration_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj); + u32 min = eclass->sched_props.timeslice_min; + u32 max = eclass->sched_props.timeslice_max; + u32 duration; + int err; + + err = kstrtou32(buf, 0, &duration); + if (err) + return err; + + if (!xe_hw_engine_timeout_in_range(duration, min, max)) + return -EINVAL; + + WRITE_ONCE(eclass->sched_props.timeslice_us, duration); + + return count; +} + +static ssize_t timeslice_duration_max_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj); + u32 duration; + int err; + + err = kstrtou32(buf, 0, &duration); + if (err) + return err; + + if (duration < eclass->sched_props.timeslice_min) + return -EINVAL; + + if (!xe_hw_engine_timeout_in_range(duration, + XE_HW_ENGINE_TIMESLICE_MIN, + XE_HW_ENGINE_TIMESLICE_MAX)) + return -EINVAL; + + WRITE_ONCE(eclass->sched_props.timeslice_max, duration); + + return count; +} + +static ssize_t timeslice_duration_max_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj); + + return sprintf(buf, "%u\n", eclass->sched_props.timeslice_max); +} + +static struct kobj_attribute timeslice_duration_max_attr = + __ATTR(timeslice_duration_max, 0644, timeslice_duration_max_show, + timeslice_duration_max_store); + +static ssize_t timeslice_duration_min_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj); + u32 duration; + int err; + + err = kstrtou32(buf, 0, &duration); + if (err) + return err; + + if (duration > eclass->sched_props.timeslice_max) + return 
-EINVAL; + + if (!xe_hw_engine_timeout_in_range(duration, + XE_HW_ENGINE_TIMESLICE_MIN, + XE_HW_ENGINE_TIMESLICE_MAX)) + return -EINVAL; + + WRITE_ONCE(eclass->sched_props.timeslice_min, duration); + + return count; +} + +static ssize_t timeslice_duration_min_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj); + + return sprintf(buf, "%u\n", eclass->sched_props.timeslice_min); +} + +static struct kobj_attribute timeslice_duration_min_attr = + __ATTR(timeslice_duration_min, 0644, timeslice_duration_min_show, + timeslice_duration_min_store); + +static ssize_t timeslice_duration_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj); + + return sprintf(buf, "%u\n", eclass->sched_props.timeslice_us); +} + +static struct kobj_attribute timeslice_duration_attr = + __ATTR(timeslice_duration_us, 0644, timeslice_duration_show, + timeslice_duration_store); + +static ssize_t timeslice_default(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj->parent); + + return sprintf(buf, "%u\n", eclass->defaults.timeslice_us); +} + +static struct kobj_attribute timeslice_duration_def = +__ATTR(timeslice_duration_us, 0444, timeslice_default, NULL); + +static ssize_t timeslice_min_default(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj->parent); + + return sprintf(buf, "%u\n", eclass->defaults.timeslice_min); +} + +static struct kobj_attribute timeslice_duration_min_def = +__ATTR(timeslice_duration_min, 0444, timeslice_min_default, NULL); + +static ssize_t timeslice_max_default(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj->parent); + + return sprintf(buf, "%u\n", eclass->defaults.timeslice_max); +} + +static struct kobj_attribute timeslice_duration_max_def = +__ATTR(timeslice_duration_max, 0444, timeslice_max_default, NULL); + +static ssize_t preempt_timeout_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj); + u32 min = eclass->sched_props.preempt_timeout_min; + u32 max = eclass->sched_props.preempt_timeout_max; + u32 timeout; + int err; + + err = kstrtou32(buf, 0, &timeout); + if (err) + return err; + + if (!xe_hw_engine_timeout_in_range(timeout, min, max)) + return -EINVAL; + + WRITE_ONCE(eclass->sched_props.preempt_timeout_us, timeout); + + return count; +} + +static ssize_t preempt_timeout_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj); + + return sprintf(buf, "%u\n", eclass->sched_props.preempt_timeout_us); +} + +static struct kobj_attribute preempt_timeout_attr = +__ATTR(preempt_timeout_us, 0644, preempt_timeout_show, preempt_timeout_store); + +static ssize_t preempt_timeout_default(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj->parent); + + return sprintf(buf, "%u\n", eclass->defaults.preempt_timeout_us); +} + +static struct kobj_attribute preempt_timeout_def = +__ATTR(preempt_timeout_us, 0444, preempt_timeout_default, NULL); + +static ssize_t preempt_timeout_min_default(struct kobject *kobj, + struct kobj_attribute *attr, 
+ char *buf) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj->parent); + + return sprintf(buf, "%u\n", eclass->defaults.preempt_timeout_min); +} + +static struct kobj_attribute preempt_timeout_min_def = +__ATTR(preempt_timeout_min, 0444, preempt_timeout_min_default, NULL); + +static ssize_t preempt_timeout_max_default(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj->parent); + + return sprintf(buf, "%u\n", eclass->defaults.preempt_timeout_max); +} + +static struct kobj_attribute preempt_timeout_max_def = +__ATTR(preempt_timeout_max, 0444, preempt_timeout_max_default, NULL); + +static ssize_t preempt_timeout_max_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj); + u32 timeout; + int err; + + err = kstrtou32(buf, 0, &timeout); + if (err) + return err; + + if (timeout < eclass->sched_props.preempt_timeout_min) + return -EINVAL; + + if (!xe_hw_engine_timeout_in_range(timeout, + XE_HW_ENGINE_PREEMPT_TIMEOUT_MIN, + XE_HW_ENGINE_PREEMPT_TIMEOUT_MAX)) + return -EINVAL; + + WRITE_ONCE(eclass->sched_props.preempt_timeout_max, timeout); + + return count; +} + +static ssize_t preempt_timeout_max_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj); + + return sprintf(buf, "%u\n", eclass->sched_props.preempt_timeout_max); +} + +static struct kobj_attribute preempt_timeout_max_attr = + __ATTR(preempt_timeout_max, 0644, preempt_timeout_max_show, + preempt_timeout_max_store); + +static ssize_t preempt_timeout_min_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj); + u32 timeout; + int err; + + err = kstrtou32(buf, 0, &timeout); + if (err) + return err; + + if (timeout > eclass->sched_props.preempt_timeout_max) + return -EINVAL; + + if (!xe_hw_engine_timeout_in_range(timeout, + XE_HW_ENGINE_PREEMPT_TIMEOUT_MIN, + XE_HW_ENGINE_PREEMPT_TIMEOUT_MAX)) + return -EINVAL; + + WRITE_ONCE(eclass->sched_props.preempt_timeout_min, timeout); + + return count; +} + +static ssize_t preempt_timeout_min_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct xe_hw_engine_class_intf *eclass = kobj_to_eclass(kobj); + + return sprintf(buf, "%u\n", eclass->sched_props.preempt_timeout_min); +} + +static struct kobj_attribute preempt_timeout_min_attr = + __ATTR(preempt_timeout_min, 0644, preempt_timeout_min_show, + preempt_timeout_min_store); + +static const struct attribute *defaults[] = { + &job_timeout_def.attr, + &job_timeout_min_def.attr, + &job_timeout_max_def.attr, + ×lice_duration_def.attr, + ×lice_duration_min_def.attr, + ×lice_duration_max_def.attr, + &preempt_timeout_def.attr, + &preempt_timeout_min_def.attr, + &preempt_timeout_max_def.attr, + NULL +}; + +static const struct attribute *files[] = { + &job_timeout_attr.attr, + &job_timeout_min_attr.attr, + &job_timeout_max_attr.attr, + ×lice_duration_attr.attr, + ×lice_duration_min_attr.attr, + ×lice_duration_max_attr.attr, + &preempt_timeout_attr.attr, + &preempt_timeout_min_attr.attr, + &preempt_timeout_max_attr.attr, + NULL +}; + +static void kobj_xe_hw_engine_class_fini(struct drm_device *drm, void *arg) +{ + struct kobject *kobj = arg; + + sysfs_remove_files(kobj, files); + kobject_put(kobj); +} + + static struct kobj_eclass * 
+kobj_xe_hw_engine_class(struct xe_device *xe, struct kobject *parent, char *name) +{ + struct kobj_eclass *keclass; + int err = 0; + + keclass = kzalloc(sizeof(*keclass), GFP_KERNEL); + if (!keclass) + return NULL; + + kobject_init(&keclass->base, &kobj_xe_hw_engine_type); + if (kobject_add(&keclass->base, parent, "%s", name)) { + kobject_put(&keclass->base); + return NULL; + } + + err = drmm_add_action_or_reset(&xe->drm, kobj_xe_hw_engine_class_fini, + &keclass->base); + if (err) + drm_warn(&xe->drm, + "%s: drmm_add_action_or_reset failed, err: %d\n", + __func__, err); + return keclass; +} + +static void hw_engine_class_defaults_fini(struct drm_device *drm, void *arg) +{ + struct kobject *kobj = arg; + + sysfs_remove_files(kobj, defaults); + kobject_put(kobj); +} + +static int xe_add_hw_engine_class_defaults(struct xe_device *xe, + struct kobject *parent) +{ + struct kobject *kobj; + int err = 0; + + kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); + if (!kobj) + return -ENOMEM; + + kobject_init(kobj, &kobj_xe_hw_engine_type); + err = kobject_add(kobj, parent, "%s", ".defaults"); + if (err) + goto err_object; + + err = sysfs_create_files(kobj, defaults); + if (err) + goto err_object; + + err = drmm_add_action_or_reset(&xe->drm, hw_engine_class_defaults_fini, + kobj); + if (err) + drm_warn(&xe->drm, + "%s: drmm_add_action_or_reset failed, err: %d\n", + __func__, err); + return err; +err_object: + kobject_put(kobj); + return err; +} + +static void xe_hw_engine_sysfs_kobj_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static const struct kobj_type xe_hw_engine_sysfs_kobj_type = { + .release = xe_hw_engine_sysfs_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, +}; + +static void hw_engine_class_sysfs_fini(struct drm_device *drm, void *arg) +{ + struct kobject *kobj = arg; + + kobject_put(kobj); +} + +/** + * xe_hw_engine_class_sysfs_init - Init HW engine classes on GT. + * @gt: Xe GT. + * + * This routine creates sysfs for HW engine classes and adds methods + * to get/set different scheduling properties for HW engines class. + * + * Returns: Returns error value for failure and 0 for success. 
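+ *
+ * Resulting layout (illustrative; the directories above the per-GT "engines"
+ * kobject are assumed here rather than spelled out in this hunk):
+ *
+ *	.../gt<N>/engines/<class>/job_timeout_ms, job_timeout_{min,max}
+ *	.../gt<N>/engines/<class>/timeslice_duration_{us,min,max}
+ *	.../gt<N>/engines/<class>/preempt_timeout_{us,min,max}
+ *	.../gt<N>/engines/<class>/.defaults/	(read-only initial values)
+ *
+ * where <class> is one of rcs, bcs, vcs, vecs or ccs.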
+ */ +int xe_hw_engine_class_sysfs_init(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + struct kobject *kobj; + u16 class_mask = 0; + int err = 0; + + kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); + if (!kobj) + return -ENOMEM; + + kobject_init(kobj, &xe_hw_engine_sysfs_kobj_type); + + err = kobject_add(kobj, gt->sysfs, "engines"); + if (err) + goto err_object; + + for_each_hw_engine(hwe, gt, id) { + char name[MAX_ENGINE_CLASS_NAME_LEN]; + struct kobj_eclass *keclass; + + if (hwe->class == XE_ENGINE_CLASS_OTHER || + hwe->class == XE_ENGINE_CLASS_MAX) + continue; + + if ((class_mask >> hwe->class) & 1) + continue; + + class_mask |= 1 << hwe->class; + + switch (hwe->class) { + case XE_ENGINE_CLASS_RENDER: + strcpy(name, "rcs"); + break; + case XE_ENGINE_CLASS_VIDEO_DECODE: + strcpy(name, "vcs"); + break; + case XE_ENGINE_CLASS_VIDEO_ENHANCE: + strcpy(name, "vecs"); + break; + case XE_ENGINE_CLASS_COPY: + strcpy(name, "bcs"); + break; + case XE_ENGINE_CLASS_COMPUTE: + strcpy(name, "ccs"); + break; + default: + err = -EINVAL; + goto err_object; + } + + keclass = kobj_xe_hw_engine_class(xe, kobj, name); + if (!keclass) { + err = -EINVAL; + goto err_object; + } + + keclass->eclass = hwe->eclass; + err = xe_add_hw_engine_class_defaults(xe, &keclass->base); + if (err) { + drm_warn(&xe->drm, + "Add .defaults to engines failed!, err: %d\n", + err); + goto err_object; + } + + err = sysfs_create_files(&keclass->base, files); + if (err) + goto err_object; + } + + err = drmm_add_action_or_reset(&xe->drm, hw_engine_class_sysfs_fini, + kobj); + if (err) + drm_warn(&xe->drm, + "%s: drmm_add_action_or_reset failed, err: %d\n", + __func__, err); + + return err; +err_object: + kobject_put(kobj); + return err; +} diff --git a/drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.h b/drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.h new file mode 100644 index 000000000000..ec5ba673b314 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_hw_engine_class_sysfs.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_ENGINE_CLASS_SYSFS_H_ +#define _XE_ENGINE_CLASS_SYSFS_H_ + +#include <linux/kobject.h> + +struct xe_gt; +struct xe_hw_engine_class_intf; + +int xe_hw_engine_class_sysfs_init(struct xe_gt *gt); +bool xe_hw_engine_timeout_in_range(u64 timeout, u64 min, u64 max); + +/** + * struct kobj_eclass - A eclass's kobject struct that connects the kobject and the + * eclass. + * + * When dealing with multiple eclass, this struct helps to understand which eclass + * needs to be addressed on a given sysfs call. 
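+ *
+ * The embedded @base kobject is what sysfs passes back into the attribute
+ * show()/store() callbacks; kobj_to_eclass() below recovers the owning
+ * interface from it via container_of(), so a single set of callbacks can
+ * serve every engine class.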
+ */ +struct kobj_eclass { + /** @base: The actual kobject */ + struct kobject base; + /** @eclass: A pointer to the hw engine class interface */ + struct xe_hw_engine_class_intf *eclass; +}; + +static inline struct xe_hw_engine_class_intf *kobj_to_eclass(struct kobject *kobj) +{ + return container_of(kobj, struct kobj_eclass, base)->eclass; +} + +#endif diff --git a/drivers/gpu/drm/xe/xe_hw_engine_types.h b/drivers/gpu/drm/xe/xe_hw_engine_types.h new file mode 100644 index 000000000000..39908dec042a --- /dev/null +++ b/drivers/gpu/drm/xe/xe_hw_engine_types.h @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_HW_ENGINE_TYPES_H_ +#define _XE_HW_ENGINE_TYPES_H_ + +#include "xe_force_wake_types.h" +#include "xe_lrc_types.h" +#include "xe_reg_sr_types.h" + +/* See "Engine ID Definition" struct in the Icelake PRM */ +enum xe_engine_class { + XE_ENGINE_CLASS_RENDER = 0, + XE_ENGINE_CLASS_VIDEO_DECODE = 1, + XE_ENGINE_CLASS_VIDEO_ENHANCE = 2, + XE_ENGINE_CLASS_COPY = 3, + XE_ENGINE_CLASS_OTHER = 4, + XE_ENGINE_CLASS_COMPUTE = 5, + XE_ENGINE_CLASS_MAX = 6, +}; + +enum xe_hw_engine_id { + XE_HW_ENGINE_RCS0, +#define XE_HW_ENGINE_RCS_MASK GENMASK_ULL(XE_HW_ENGINE_RCS0, XE_HW_ENGINE_RCS0) + XE_HW_ENGINE_BCS0, + XE_HW_ENGINE_BCS1, + XE_HW_ENGINE_BCS2, + XE_HW_ENGINE_BCS3, + XE_HW_ENGINE_BCS4, + XE_HW_ENGINE_BCS5, + XE_HW_ENGINE_BCS6, + XE_HW_ENGINE_BCS7, + XE_HW_ENGINE_BCS8, +#define XE_HW_ENGINE_BCS_MASK GENMASK_ULL(XE_HW_ENGINE_BCS8, XE_HW_ENGINE_BCS0) + XE_HW_ENGINE_VCS0, + XE_HW_ENGINE_VCS1, + XE_HW_ENGINE_VCS2, + XE_HW_ENGINE_VCS3, + XE_HW_ENGINE_VCS4, + XE_HW_ENGINE_VCS5, + XE_HW_ENGINE_VCS6, + XE_HW_ENGINE_VCS7, +#define XE_HW_ENGINE_VCS_MASK GENMASK_ULL(XE_HW_ENGINE_VCS7, XE_HW_ENGINE_VCS0) + XE_HW_ENGINE_VECS0, + XE_HW_ENGINE_VECS1, + XE_HW_ENGINE_VECS2, + XE_HW_ENGINE_VECS3, +#define XE_HW_ENGINE_VECS_MASK GENMASK_ULL(XE_HW_ENGINE_VECS3, XE_HW_ENGINE_VECS0) + XE_HW_ENGINE_CCS0, + XE_HW_ENGINE_CCS1, + XE_HW_ENGINE_CCS2, + XE_HW_ENGINE_CCS3, +#define XE_HW_ENGINE_CCS_MASK GENMASK_ULL(XE_HW_ENGINE_CCS3, XE_HW_ENGINE_CCS0) + XE_HW_ENGINE_GSCCS0, +#define XE_HW_ENGINE_GSCCS_MASK GENMASK_ULL(XE_HW_ENGINE_GSCCS0, XE_HW_ENGINE_GSCCS0) + XE_NUM_HW_ENGINES, +}; + +/* FIXME: s/XE_HW_ENGINE_MAX_INSTANCE/XE_HW_ENGINE_MAX_COUNT */ +#define XE_HW_ENGINE_MAX_INSTANCE 9 + +struct xe_bo; +struct xe_execlist_port; +struct xe_gt; + +/** + * struct xe_hw_engine_class_intf - per hw engine class struct interface + * + * Contains all the hw engine properties per engine class. 
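+ *
+ * @sched_props holds the live, writable values, while @defaults is only
+ * exposed read-only through the per-class ".defaults" sysfs directory
+ * (how @defaults is populated is not part of this hunk).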
+ * + * @sched_props: scheduling properties + * @defaults: default scheduling properties + */ +struct xe_hw_engine_class_intf { + /** + * @sched_props: scheduling properties + * @defaults: default scheduling properties + */ + struct { + /** @set_job_timeout: Set job timeout in ms for engine */ + u32 job_timeout_ms; + /** @job_timeout_min: Min job timeout in ms for engine */ + u32 job_timeout_min; + /** @job_timeout_max: Max job timeout in ms for engine */ + u32 job_timeout_max; + /** @timeslice_us: timeslice period in micro-seconds */ + u32 timeslice_us; + /** @timeslice_min: min timeslice period in micro-seconds */ + u32 timeslice_min; + /** @timeslice_max: max timeslice period in micro-seconds */ + u32 timeslice_max; + /** @preempt_timeout_us: preemption timeout in micro-seconds */ + u32 preempt_timeout_us; + /** @preempt_timeout_min: min preemption timeout in micro-seconds */ + u32 preempt_timeout_min; + /** @preempt_timeout_max: max preemption timeout in micro-seconds */ + u32 preempt_timeout_max; + } sched_props, defaults; +}; + +/** + * struct xe_hw_engine - Hardware engine + * + * Contains all the hardware engine state for physical instances. + */ +struct xe_hw_engine { + /** @gt: graphics tile this hw engine belongs to */ + struct xe_gt *gt; + /** @name: name of this hw engine */ + const char *name; + /** @class: class of this hw engine */ + enum xe_engine_class class; + /** @instance: physical instance of this hw engine */ + u16 instance; + /** @logical_instance: logical instance of this hw engine */ + u16 logical_instance; + /** @mmio_base: MMIO base address of this hw engine*/ + u32 mmio_base; + /** + * @reg_sr: table with registers to be restored on GT init/resume/reset + */ + struct xe_reg_sr reg_sr; + /** + * @reg_whitelist: table with registers to be whitelisted + */ + struct xe_reg_sr reg_whitelist; + /** + * @reg_lrc: LRC workaround registers + */ + struct xe_reg_sr reg_lrc; + /** @domain: force wake domain of this hw engine */ + enum xe_force_wake_domains domain; + /** @hwsp: hardware status page buffer object */ + struct xe_bo *hwsp; + /** @kernel_lrc: Kernel LRC (should be replaced /w an xe_engine) */ + struct xe_lrc kernel_lrc; + /** @exl_port: execlists port */ + struct xe_execlist_port *exl_port; + /** @fence_irq: fence IRQ to run when a hw engine IRQ is received */ + struct xe_hw_fence_irq *fence_irq; + /** @irq_handler: IRQ handler to run when hw engine IRQ is received */ + void (*irq_handler)(struct xe_hw_engine *hwe, u16 intr_vec); + /** @engine_id: id for this hw engine */ + enum xe_hw_engine_id engine_id; + /** @eclass: pointer to per hw engine class interface */ + struct xe_hw_engine_class_intf *eclass; +}; + +/** + * struct xe_hw_engine_snapshot - Hardware engine snapshot + * + * Contains the snapshot of useful hardware engine info and registers. 
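+ *
+ * Filled in by xe_hw_engine_snapshot_capture(), which uses GFP_ATOMIC
+ * allocations so a capture can be taken from atomic contexts such as reset
+ * or error handling; release it again with xe_hw_engine_snapshot_free().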
+ */ +struct xe_hw_engine_snapshot { + /** @name: name of the hw engine */ + char *name; + /** @class: class of this hw engine */ + enum xe_engine_class class; + /** @logical_instance: logical instance of this hw engine */ + u16 logical_instance; + /** @forcewake: Force Wake information snapshot */ + struct { + /** @domain: force wake domain of this hw engine */ + enum xe_force_wake_domains domain; + /** @ref: Forcewake ref for the above domain */ + int ref; + } forcewake; + /** @mmio_base: MMIO base address of this hw engine*/ + u32 mmio_base; + /** @reg: Useful MMIO register snapshot */ + struct { + /** @ring_hwstam: RING_HWSTAM */ + u32 ring_hwstam; + /** @ring_hws_pga: RING_HWS_PGA */ + u32 ring_hws_pga; + /** @ring_execlist_status_lo: RING_EXECLIST_STATUS_LO */ + u32 ring_execlist_status_lo; + /** @ring_execlist_status_hi: RING_EXECLIST_STATUS_HI */ + u32 ring_execlist_status_hi; + /** @ring_execlist_sq_contents_lo: RING_EXECLIST_SQ_CONTENTS */ + u32 ring_execlist_sq_contents_lo; + /** @ring_execlist_sq_contents_hi: RING_EXECLIST_SQ_CONTENTS + 4 */ + u32 ring_execlist_sq_contents_hi; + /** @ring_start: RING_START */ + u32 ring_start; + /** @ring_head: RING_HEAD */ + u32 ring_head; + /** @ring_tail: RING_TAIL */ + u32 ring_tail; + /** @ring_ctl: RING_CTL */ + u32 ring_ctl; + /** @ring_mi_mode: RING_MI_MODE */ + u32 ring_mi_mode; + /** @ring_mode: RING_MODE */ + u32 ring_mode; + /** @ring_imr: RING_IMR */ + u32 ring_imr; + /** @ring_esr: RING_ESR */ + u32 ring_esr; + /** @ring_emr: RING_EMR */ + u32 ring_emr; + /** @ring_eir: RING_EIR */ + u32 ring_eir; + /** @ring_acthd_udw: RING_ACTHD_UDW */ + u32 ring_acthd_udw; + /** @ring_acthd: RING_ACTHD */ + u32 ring_acthd; + /** @ring_bbaddr_udw: RING_BBADDR_UDW */ + u32 ring_bbaddr_udw; + /** @ring_bbaddr: RING_BBADDR */ + u32 ring_bbaddr; + /** @ring_dma_fadd_udw: RING_DMA_FADD_UDW */ + u32 ring_dma_fadd_udw; + /** @ring_dma_fadd: RING_DMA_FADD */ + u32 ring_dma_fadd; + /** @ipehr: IPEHR */ + u32 ipehr; + /** @rcu_mode: RCU_MODE */ + u32 rcu_mode; + } reg; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_hw_fence.c b/drivers/gpu/drm/xe/xe_hw_fence.c new file mode 100644 index 000000000000..a6094c81f2ad --- /dev/null +++ b/drivers/gpu/drm/xe/xe_hw_fence.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include "xe_hw_fence.h" + +#include <linux/device.h> +#include <linux/slab.h> + +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_hw_engine.h" +#include "xe_macros.h" +#include "xe_map.h" +#include "xe_trace.h" + +static struct kmem_cache *xe_hw_fence_slab; + +int __init xe_hw_fence_module_init(void) +{ + xe_hw_fence_slab = kmem_cache_create("xe_hw_fence", + sizeof(struct xe_hw_fence), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!xe_hw_fence_slab) + return -ENOMEM; + + return 0; +} + +void xe_hw_fence_module_exit(void) +{ + rcu_barrier(); + kmem_cache_destroy(xe_hw_fence_slab); +} + +static struct xe_hw_fence *fence_alloc(void) +{ + return kmem_cache_zalloc(xe_hw_fence_slab, GFP_KERNEL); +} + +static void fence_free(struct rcu_head *rcu) +{ + struct xe_hw_fence *fence = + container_of(rcu, struct xe_hw_fence, dma.rcu); + + if (!WARN_ON_ONCE(!fence)) + kmem_cache_free(xe_hw_fence_slab, fence); +} + +static void hw_fence_irq_run_cb(struct irq_work *work) +{ + struct xe_hw_fence_irq *irq = container_of(work, typeof(*irq), work); + struct xe_hw_fence *fence, *next; + bool tmp; + + tmp = dma_fence_begin_signalling(); + spin_lock(&irq->lock); + if (irq->enabled) { + 
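+		/*
+		 * Walk the pending list: every fence that reports completion
+		 * (seqno passed or error set) is signalled, unlinked and has
+		 * the list's reference dropped.
+		 */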
list_for_each_entry_safe(fence, next, &irq->pending, irq_link) { + struct dma_fence *dma_fence = &fence->dma; + + trace_xe_hw_fence_try_signal(fence); + if (dma_fence_is_signaled_locked(dma_fence)) { + trace_xe_hw_fence_signal(fence); + list_del_init(&fence->irq_link); + dma_fence_put(dma_fence); + } + } + } + spin_unlock(&irq->lock); + dma_fence_end_signalling(tmp); +} + +void xe_hw_fence_irq_init(struct xe_hw_fence_irq *irq) +{ + spin_lock_init(&irq->lock); + init_irq_work(&irq->work, hw_fence_irq_run_cb); + INIT_LIST_HEAD(&irq->pending); + irq->enabled = true; +} + +void xe_hw_fence_irq_finish(struct xe_hw_fence_irq *irq) +{ + struct xe_hw_fence *fence, *next; + unsigned long flags; + int err; + bool tmp; + + if (XE_WARN_ON(!list_empty(&irq->pending))) { + tmp = dma_fence_begin_signalling(); + spin_lock_irqsave(&irq->lock, flags); + list_for_each_entry_safe(fence, next, &irq->pending, irq_link) { + list_del_init(&fence->irq_link); + err = dma_fence_signal_locked(&fence->dma); + dma_fence_put(&fence->dma); + XE_WARN_ON(err); + } + spin_unlock_irqrestore(&irq->lock, flags); + dma_fence_end_signalling(tmp); + } +} + +void xe_hw_fence_irq_run(struct xe_hw_fence_irq *irq) +{ + irq_work_queue(&irq->work); +} + +void xe_hw_fence_irq_stop(struct xe_hw_fence_irq *irq) +{ + spin_lock_irq(&irq->lock); + irq->enabled = false; + spin_unlock_irq(&irq->lock); +} + +void xe_hw_fence_irq_start(struct xe_hw_fence_irq *irq) +{ + spin_lock_irq(&irq->lock); + irq->enabled = true; + spin_unlock_irq(&irq->lock); + + irq_work_queue(&irq->work); +} + +void xe_hw_fence_ctx_init(struct xe_hw_fence_ctx *ctx, struct xe_gt *gt, + struct xe_hw_fence_irq *irq, const char *name) +{ + ctx->gt = gt; + ctx->irq = irq; + ctx->dma_fence_ctx = dma_fence_context_alloc(1); + ctx->next_seqno = XE_FENCE_INITIAL_SEQNO; + sprintf(ctx->name, "%s", name); +} + +void xe_hw_fence_ctx_finish(struct xe_hw_fence_ctx *ctx) +{ +} + +static struct xe_hw_fence *to_xe_hw_fence(struct dma_fence *fence); + +static struct xe_hw_fence_irq *xe_hw_fence_irq(struct xe_hw_fence *fence) +{ + return container_of(fence->dma.lock, struct xe_hw_fence_irq, lock); +} + +static const char *xe_hw_fence_get_driver_name(struct dma_fence *dma_fence) +{ + struct xe_hw_fence *fence = to_xe_hw_fence(dma_fence); + + return dev_name(gt_to_xe(fence->ctx->gt)->drm.dev); +} + +static const char *xe_hw_fence_get_timeline_name(struct dma_fence *dma_fence) +{ + struct xe_hw_fence *fence = to_xe_hw_fence(dma_fence); + + return fence->ctx->name; +} + +static bool xe_hw_fence_signaled(struct dma_fence *dma_fence) +{ + struct xe_hw_fence *fence = to_xe_hw_fence(dma_fence); + struct xe_device *xe = gt_to_xe(fence->ctx->gt); + u32 seqno = xe_map_rd(xe, &fence->seqno_map, 0, u32); + + return dma_fence->error || + !__dma_fence_is_later(dma_fence->seqno, seqno, dma_fence->ops); +} + +static bool xe_hw_fence_enable_signaling(struct dma_fence *dma_fence) +{ + struct xe_hw_fence *fence = to_xe_hw_fence(dma_fence); + struct xe_hw_fence_irq *irq = xe_hw_fence_irq(fence); + + dma_fence_get(dma_fence); + list_add_tail(&fence->irq_link, &irq->pending); + + /* SW completed (no HW IRQ) so kick handler to signal fence */ + if (xe_hw_fence_signaled(dma_fence)) + xe_hw_fence_irq_run(irq); + + return true; +} + +static void xe_hw_fence_release(struct dma_fence *dma_fence) +{ + struct xe_hw_fence *fence = to_xe_hw_fence(dma_fence); + + trace_xe_hw_fence_free(fence); + XE_WARN_ON(!list_empty(&fence->irq_link)); + call_rcu(&dma_fence->rcu, fence_free); +} + +static const struct dma_fence_ops 
xe_hw_fence_ops = { + .get_driver_name = xe_hw_fence_get_driver_name, + .get_timeline_name = xe_hw_fence_get_timeline_name, + .enable_signaling = xe_hw_fence_enable_signaling, + .signaled = xe_hw_fence_signaled, + .release = xe_hw_fence_release, +}; + +static struct xe_hw_fence *to_xe_hw_fence(struct dma_fence *fence) +{ + if (XE_WARN_ON(fence->ops != &xe_hw_fence_ops)) + return NULL; + + return container_of(fence, struct xe_hw_fence, dma); +} + +struct xe_hw_fence *xe_hw_fence_create(struct xe_hw_fence_ctx *ctx, + struct iosys_map seqno_map) +{ + struct xe_hw_fence *fence; + + fence = fence_alloc(); + if (!fence) + return ERR_PTR(-ENOMEM); + + dma_fence_init(&fence->dma, &xe_hw_fence_ops, &ctx->irq->lock, + ctx->dma_fence_ctx, ctx->next_seqno++); + + fence->ctx = ctx; + fence->seqno_map = seqno_map; + INIT_LIST_HEAD(&fence->irq_link); + + trace_xe_hw_fence_create(fence); + + return fence; +} diff --git a/drivers/gpu/drm/xe/xe_hw_fence.h b/drivers/gpu/drm/xe/xe_hw_fence.h new file mode 100644 index 000000000000..cfe5fd603787 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_hw_fence.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef _XE_HW_FENCE_H_ +#define _XE_HW_FENCE_H_ + +#include "xe_hw_fence_types.h" + +/* Cause an early wrap to catch wrapping errors */ +#define XE_FENCE_INITIAL_SEQNO (-127) + +int xe_hw_fence_module_init(void); +void xe_hw_fence_module_exit(void); + +void xe_hw_fence_irq_init(struct xe_hw_fence_irq *irq); +void xe_hw_fence_irq_finish(struct xe_hw_fence_irq *irq); +void xe_hw_fence_irq_run(struct xe_hw_fence_irq *irq); +void xe_hw_fence_irq_stop(struct xe_hw_fence_irq *irq); +void xe_hw_fence_irq_start(struct xe_hw_fence_irq *irq); + +void xe_hw_fence_ctx_init(struct xe_hw_fence_ctx *ctx, struct xe_gt *gt, + struct xe_hw_fence_irq *irq, const char *name); +void xe_hw_fence_ctx_finish(struct xe_hw_fence_ctx *ctx); + +struct xe_hw_fence *xe_hw_fence_create(struct xe_hw_fence_ctx *ctx, + struct iosys_map seqno_map); + +#endif diff --git a/drivers/gpu/drm/xe/xe_hw_fence_types.h b/drivers/gpu/drm/xe/xe_hw_fence_types.h new file mode 100644 index 000000000000..b33c4956e8ea --- /dev/null +++ b/drivers/gpu/drm/xe/xe_hw_fence_types.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_HW_FENCE_TYPES_H_ +#define _XE_HW_FENCE_TYPES_H_ + +#include <linux/dma-fence.h> +#include <linux/iosys-map.h> +#include <linux/irq_work.h> +#include <linux/list.h> +#include <linux/spinlock.h> + +struct xe_gt; + +/** + * struct xe_hw_fence_irq - hardware fence IRQ handler + * + * One per engine class, signals completed xe_hw_fences, triggered via hw engine + * interrupt. On each trigger, search list of pending fences and signal. + */ +struct xe_hw_fence_irq { + /** @lock: protects all xe_hw_fences + pending list */ + spinlock_t lock; + /** @work: IRQ worker run to signal the fences */ + struct irq_work work; + /** @pending: list of pending xe_hw_fences */ + struct list_head pending; + /** @enabled: fence signaling enabled */ + bool enabled; +}; + +#define MAX_FENCE_NAME_LEN 16 + +/** + * struct xe_hw_fence_ctx - hardware fence context + * + * The context for a hardware fence. 1 to 1 relationship with xe_engine. Points + * to a xe_hw_fence_irq, maintains serial seqno. 
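+ *
+ * Typical lifecycle, sketched from the declarations in xe_hw_fence.h with
+ * placeholder variables (not a verbatim excerpt from a caller):
+ *
+ *	xe_hw_fence_ctx_init(&ctx, gt, &fence_irq, "name");
+ *	fence = xe_hw_fence_create(&ctx, seqno_map);
+ *	...
+ *	xe_hw_fence_ctx_finish(&ctx);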
+ */ +struct xe_hw_fence_ctx { + /** @gt: graphics tile of hardware fence context */ + struct xe_gt *gt; + /** @irq: fence irq handler */ + struct xe_hw_fence_irq *irq; + /** @dma_fence_ctx: dma fence context for hardware fence */ + u64 dma_fence_ctx; + /** @next_seqno: next seqno for hardware fence */ + u32 next_seqno; + /** @name: name of hardware fence context */ + char name[MAX_FENCE_NAME_LEN]; +}; + +/** + * struct xe_hw_fence - hardware fence + * + * Used to indicate a xe_sched_job is complete via a seqno written to memory. + * Signals on error or seqno past. + */ +struct xe_hw_fence { + /** @dma: base dma fence for hardware fence context */ + struct dma_fence dma; + /** @ctx: hardware fence context */ + struct xe_hw_fence_ctx *ctx; + /** @seqno_map: I/O map for seqno */ + struct iosys_map seqno_map; + /** @irq_link: Link in struct xe_hw_fence_irq.pending */ + struct list_head irq_link; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c new file mode 100644 index 000000000000..6ef2aa1eae8b --- /dev/null +++ b/drivers/gpu/drm/xe/xe_hwmon.c @@ -0,0 +1,776 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include <linux/hwmon-sysfs.h> +#include <linux/hwmon.h> +#include <linux/types.h> + +#include <drm/drm_managed.h> +#include "regs/xe_gt_regs.h" +#include "regs/xe_mchbar_regs.h" +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_hwmon.h" +#include "xe_mmio.h" +#include "xe_pcode.h" +#include "xe_pcode_api.h" + +enum xe_hwmon_reg { + REG_PKG_RAPL_LIMIT, + REG_PKG_POWER_SKU, + REG_PKG_POWER_SKU_UNIT, + REG_GT_PERF_STATUS, + REG_PKG_ENERGY_STATUS, +}; + +enum xe_hwmon_reg_operation { + REG_READ32, + REG_RMW32, + REG_READ64, +}; + +/* + * SF_* - scale factors for particular quantities according to hwmon spec. 
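+ *
+ * For example, a 25 W limit is reported as 25 * SF_POWER = 25000000
+ * microwatts and a 1.2 V reading as 1.2 * SF_VOLTAGE = 1200 millivolts.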
+ */ +#define SF_POWER 1000000 /* microwatts */ +#define SF_CURR 1000 /* milliamperes */ +#define SF_VOLTAGE 1000 /* millivolts */ +#define SF_ENERGY 1000000 /* microjoules */ +#define SF_TIME 1000 /* milliseconds */ + +/** + * struct xe_hwmon_energy_info - to accumulate energy + */ +struct xe_hwmon_energy_info { + /** @reg_val_prev: previous energy reg val */ + u32 reg_val_prev; + /** @accum_energy: accumulated energy */ + long accum_energy; +}; + +/** + * struct xe_hwmon - xe hwmon data structure + */ +struct xe_hwmon { + /** @hwmon_dev: hwmon device for xe */ + struct device *hwmon_dev; + /** @gt: primary gt */ + struct xe_gt *gt; + /** @hwmon_lock: lock for rw attributes*/ + struct mutex hwmon_lock; + /** @scl_shift_power: pkg power unit */ + int scl_shift_power; + /** @scl_shift_energy: pkg energy unit */ + int scl_shift_energy; + /** @scl_shift_time: pkg time unit */ + int scl_shift_time; + /** @ei: Energy info for energy1_input */ + struct xe_hwmon_energy_info ei; +}; + +static u32 xe_hwmon_get_reg(struct xe_hwmon *hwmon, enum xe_hwmon_reg hwmon_reg) +{ + struct xe_device *xe = gt_to_xe(hwmon->gt); + struct xe_reg reg = XE_REG(0); + + switch (hwmon_reg) { + case REG_PKG_RAPL_LIMIT: + if (xe->info.platform == XE_DG2) + reg = PCU_CR_PACKAGE_RAPL_LIMIT; + else if (xe->info.platform == XE_PVC) + reg = PVC_GT0_PACKAGE_RAPL_LIMIT; + break; + case REG_PKG_POWER_SKU: + if (xe->info.platform == XE_DG2) + reg = PCU_CR_PACKAGE_POWER_SKU; + else if (xe->info.platform == XE_PVC) + reg = PVC_GT0_PACKAGE_POWER_SKU; + break; + case REG_PKG_POWER_SKU_UNIT: + if (xe->info.platform == XE_DG2) + reg = PCU_CR_PACKAGE_POWER_SKU_UNIT; + else if (xe->info.platform == XE_PVC) + reg = PVC_GT0_PACKAGE_POWER_SKU_UNIT; + break; + case REG_GT_PERF_STATUS: + if (xe->info.platform == XE_DG2) + reg = GT_PERF_STATUS; + break; + case REG_PKG_ENERGY_STATUS: + if (xe->info.platform == XE_DG2) + reg = PCU_CR_PACKAGE_ENERGY_STATUS; + else if (xe->info.platform == XE_PVC) + reg = PVC_GT0_PLATFORM_ENERGY_STATUS; + break; + default: + drm_warn(&xe->drm, "Unknown xe hwmon reg id: %d\n", hwmon_reg); + break; + } + + return reg.raw; +} + +static void xe_hwmon_process_reg(struct xe_hwmon *hwmon, enum xe_hwmon_reg hwmon_reg, + enum xe_hwmon_reg_operation operation, u64 *value, + u32 clr, u32 set) +{ + struct xe_reg reg; + + reg.raw = xe_hwmon_get_reg(hwmon, hwmon_reg); + + if (!reg.raw) + return; + + switch (operation) { + case REG_READ32: + *value = xe_mmio_read32(hwmon->gt, reg); + break; + case REG_RMW32: + *value = xe_mmio_rmw32(hwmon->gt, reg, clr, set); + break; + case REG_READ64: + *value = xe_mmio_read64_2x32(hwmon->gt, reg); + break; + default: + drm_warn(>_to_xe(hwmon->gt)->drm, "Invalid xe hwmon reg operation: %d\n", + operation); + break; + } +} + +#define PL1_DISABLE 0 + +/* + * HW allows arbitrary PL1 limits to be set but silently clamps these values to + * "typical but not guaranteed" min/max values in REG_PKG_POWER_SKU. Follow the + * same pattern for sysfs, allow arbitrary PL1 limits to be set but display + * clamped values when read. 
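+ *
+ * For instance (illustrative numbers only), writing 500 W on a SKU whose
+ * PKG_MIN_PWR..PKG_MAX_PWR range is 25..300 W is accepted, but a following
+ * read of power1_max reports the clamped 300 W.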
+ */ +static void xe_hwmon_power_max_read(struct xe_hwmon *hwmon, long *value) +{ + u64 reg_val, min, max; + + mutex_lock(&hwmon->hwmon_lock); + + xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_READ32, ®_val, 0, 0); + /* Check if PL1 limit is disabled */ + if (!(reg_val & PKG_PWR_LIM_1_EN)) { + *value = PL1_DISABLE; + goto unlock; + } + + reg_val = REG_FIELD_GET(PKG_PWR_LIM_1, reg_val); + *value = mul_u64_u32_shr(reg_val, SF_POWER, hwmon->scl_shift_power); + + xe_hwmon_process_reg(hwmon, REG_PKG_POWER_SKU, REG_READ64, ®_val, 0, 0); + min = REG_FIELD_GET(PKG_MIN_PWR, reg_val); + min = mul_u64_u32_shr(min, SF_POWER, hwmon->scl_shift_power); + max = REG_FIELD_GET(PKG_MAX_PWR, reg_val); + max = mul_u64_u32_shr(max, SF_POWER, hwmon->scl_shift_power); + + if (min && max) + *value = clamp_t(u64, *value, min, max); +unlock: + mutex_unlock(&hwmon->hwmon_lock); +} + +static int xe_hwmon_power_max_write(struct xe_hwmon *hwmon, long value) +{ + int ret = 0; + u64 reg_val; + + mutex_lock(&hwmon->hwmon_lock); + + /* Disable PL1 limit and verify, as limit cannot be disabled on all platforms */ + if (value == PL1_DISABLE) { + xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_RMW32, ®_val, + PKG_PWR_LIM_1_EN, 0); + xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_READ32, ®_val, + PKG_PWR_LIM_1_EN, 0); + + if (reg_val & PKG_PWR_LIM_1_EN) { + ret = -EOPNOTSUPP; + goto unlock; + } + } + + /* Computation in 64-bits to avoid overflow. Round to nearest. */ + reg_val = DIV_ROUND_CLOSEST_ULL((u64)value << hwmon->scl_shift_power, SF_POWER); + reg_val = PKG_PWR_LIM_1_EN | REG_FIELD_PREP(PKG_PWR_LIM_1, reg_val); + + xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_RMW32, ®_val, + PKG_PWR_LIM_1_EN | PKG_PWR_LIM_1, reg_val); +unlock: + mutex_unlock(&hwmon->hwmon_lock); + return ret; +} + +static void xe_hwmon_power_rated_max_read(struct xe_hwmon *hwmon, long *value) +{ + u64 reg_val; + + xe_hwmon_process_reg(hwmon, REG_PKG_POWER_SKU, REG_READ32, ®_val, 0, 0); + reg_val = REG_FIELD_GET(PKG_TDP, reg_val); + *value = mul_u64_u32_shr(reg_val, SF_POWER, hwmon->scl_shift_power); +} + +/* + * xe_hwmon_energy_get - Obtain energy value + * + * The underlying energy hardware register is 32-bits and is subject to + * overflow. How long before overflow? For example, with an example + * scaling bit shift of 14 bits (see register *PACKAGE_POWER_SKU_UNIT) and + * a power draw of 1000 watts, the 32-bit counter will overflow in + * approximately 4.36 minutes. + * + * Examples: + * 1 watt: (2^32 >> 14) / 1 W / (60 * 60 * 24) secs/day -> 3 days + * 1000 watts: (2^32 >> 14) / 1000 W / 60 secs/min -> 4.36 minutes + * + * The function significantly increases overflow duration (from 4.36 + * minutes) by accumulating the energy register into a 'long' as allowed by + * the hwmon API. Using x86_64 128 bit arithmetic (see mul_u64_u32_shr()), + * a 'long' of 63 bits, SF_ENERGY of 1e6 (~20 bits) and + * hwmon->scl_shift_energy of 14 bits we have 57 (63 - 20 + 14) bits before + * energy1_input overflows. This at 1000 W is an overflow duration of 278 years. 
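+ *
+ * Worked out: 2^57 hw units divided by 2^14 units per joule is ~2^43 J
+ * (~8.8e12 J); at 1000 W that is ~8.8e9 seconds, i.e. the ~278 years above.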
+ */ +static void +xe_hwmon_energy_get(struct xe_hwmon *hwmon, long *energy) +{ + struct xe_hwmon_energy_info *ei = &hwmon->ei; + u64 reg_val; + + xe_hwmon_process_reg(hwmon, REG_PKG_ENERGY_STATUS, REG_READ32, + ®_val, 0, 0); + + if (reg_val >= ei->reg_val_prev) + ei->accum_energy += reg_val - ei->reg_val_prev; + else + ei->accum_energy += UINT_MAX - ei->reg_val_prev + reg_val; + + ei->reg_val_prev = reg_val; + + *energy = mul_u64_u32_shr(ei->accum_energy, SF_ENERGY, + hwmon->scl_shift_energy); +} + +static ssize_t +xe_hwmon_power1_max_interval_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct xe_hwmon *hwmon = dev_get_drvdata(dev); + u32 x, y, x_w = 2; /* 2 bits */ + u64 r, tau4, out; + + xe_device_mem_access_get(gt_to_xe(hwmon->gt)); + + mutex_lock(&hwmon->hwmon_lock); + + xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, + REG_READ32, &r, 0, 0); + + mutex_unlock(&hwmon->hwmon_lock); + + xe_device_mem_access_put(gt_to_xe(hwmon->gt)); + + x = REG_FIELD_GET(PKG_PWR_LIM_1_TIME_X, r); + y = REG_FIELD_GET(PKG_PWR_LIM_1_TIME_Y, r); + + /* + * tau = 1.x * power(2,y), x = bits(23:22), y = bits(21:17) + * = (4 | x) << (y - 2) + * + * Here (y - 2) ensures a 1.x fixed point representation of 1.x + * As x is 2 bits so 1.x can be 1.0, 1.25, 1.50, 1.75 + * + * As y can be < 2, we compute tau4 = (4 | x) << y + * and then add 2 when doing the final right shift to account for units + */ + tau4 = ((1 << x_w) | x) << y; + + /* val in hwmon interface units (millisec) */ + out = mul_u64_u32_shr(tau4, SF_TIME, hwmon->scl_shift_time + x_w); + + return sysfs_emit(buf, "%llu\n", out); +} + +static ssize_t +xe_hwmon_power1_max_interval_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct xe_hwmon *hwmon = dev_get_drvdata(dev); + u32 x, y, rxy, x_w = 2; /* 2 bits */ + u64 tau4, r, max_win; + unsigned long val; + int ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + /* + * Max HW supported tau in '1.x * power(2,y)' format, x = 0, y = 0x12. + * The hwmon->scl_shift_time default of 0xa results in a max tau of 256 seconds. + * + * The ideal scenario is for PKG_MAX_WIN to be read from the PKG_PWR_SKU register. + * However, it is observed that existing discrete GPUs does not provide correct + * PKG_MAX_WIN value, therefore a using default constant value. For future discrete GPUs + * this may get resolved, in which case PKG_MAX_WIN should be obtained from PKG_PWR_SKU. + */ +#define PKG_MAX_WIN_DEFAULT 0x12ull + + /* + * val must be < max in hwmon interface units. 
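+	 * (With x = 0, y = 0x12 and the default scl_shift_time of 0xa noted
+	 * above, max_win works out to ((4 << 0x12) * SF_TIME) >> (0xa + 2) =
+	 * 256000 ms, i.e. the 256 seconds quoted above.)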
The steps below are + * explained in xe_hwmon_power1_max_interval_show() + */ + r = FIELD_PREP(PKG_MAX_WIN, PKG_MAX_WIN_DEFAULT); + x = REG_FIELD_GET(PKG_MAX_WIN_X, r); + y = REG_FIELD_GET(PKG_MAX_WIN_Y, r); + tau4 = ((1 << x_w) | x) << y; + max_win = mul_u64_u32_shr(tau4, SF_TIME, hwmon->scl_shift_time + x_w); + + if (val > max_win) + return -EINVAL; + + /* val in hw units */ + val = DIV_ROUND_CLOSEST_ULL((u64)val << hwmon->scl_shift_time, SF_TIME); + + /* + * Convert val to 1.x * power(2,y) + * y = ilog2(val) + * x = (val - (1 << y)) >> (y - 2) + */ + if (!val) { + y = 0; + x = 0; + } else { + y = ilog2(val); + x = (val - (1ul << y)) << x_w >> y; + } + + rxy = REG_FIELD_PREP(PKG_PWR_LIM_1_TIME_X, x) | REG_FIELD_PREP(PKG_PWR_LIM_1_TIME_Y, y); + + xe_device_mem_access_get(gt_to_xe(hwmon->gt)); + + mutex_lock(&hwmon->hwmon_lock); + + xe_hwmon_process_reg(hwmon, REG_PKG_RAPL_LIMIT, REG_RMW32, (u64 *)&r, + PKG_PWR_LIM_1_TIME, rxy); + + mutex_unlock(&hwmon->hwmon_lock); + + xe_device_mem_access_put(gt_to_xe(hwmon->gt)); + + return count; +} + +static SENSOR_DEVICE_ATTR(power1_max_interval, 0664, + xe_hwmon_power1_max_interval_show, + xe_hwmon_power1_max_interval_store, 0); + +static struct attribute *hwmon_attributes[] = { + &sensor_dev_attr_power1_max_interval.dev_attr.attr, + NULL +}; + +static umode_t xe_hwmon_attributes_visible(struct kobject *kobj, + struct attribute *attr, int index) +{ + struct device *dev = kobj_to_dev(kobj); + struct xe_hwmon *hwmon = dev_get_drvdata(dev); + int ret = 0; + + xe_device_mem_access_get(gt_to_xe(hwmon->gt)); + + if (attr == &sensor_dev_attr_power1_max_interval.dev_attr.attr) + ret = xe_hwmon_get_reg(hwmon, REG_PKG_RAPL_LIMIT) ? attr->mode : 0; + + xe_device_mem_access_put(gt_to_xe(hwmon->gt)); + + return ret; +} + +static const struct attribute_group hwmon_attrgroup = { + .attrs = hwmon_attributes, + .is_visible = xe_hwmon_attributes_visible, +}; + +static const struct attribute_group *hwmon_groups[] = { + &hwmon_attrgroup, + NULL +}; + +static const struct hwmon_channel_info *hwmon_info[] = { + HWMON_CHANNEL_INFO(power, HWMON_P_MAX | HWMON_P_RATED_MAX | HWMON_P_CRIT), + HWMON_CHANNEL_INFO(curr, HWMON_C_CRIT), + HWMON_CHANNEL_INFO(in, HWMON_I_INPUT), + HWMON_CHANNEL_INFO(energy, HWMON_E_INPUT), + NULL +}; + +/* I1 is exposed as power_crit or as curr_crit depending on bit 31 */ +static int xe_hwmon_pcode_read_i1(struct xe_gt *gt, u32 *uval) +{ + /* Avoid Illegal Subcommand error */ + if (gt_to_xe(gt)->info.platform == XE_DG2) + return -ENXIO; + + return xe_pcode_read(gt, PCODE_MBOX(PCODE_POWER_SETUP, + POWER_SETUP_SUBCOMMAND_READ_I1, 0), + uval, 0); +} + +static int xe_hwmon_pcode_write_i1(struct xe_gt *gt, u32 uval) +{ + return xe_pcode_write(gt, PCODE_MBOX(PCODE_POWER_SETUP, + POWER_SETUP_SUBCOMMAND_WRITE_I1, 0), + uval); +} + +static int xe_hwmon_power_curr_crit_read(struct xe_hwmon *hwmon, long *value, u32 scale_factor) +{ + int ret; + u32 uval; + + mutex_lock(&hwmon->hwmon_lock); + + ret = xe_hwmon_pcode_read_i1(hwmon->gt, &uval); + if (ret) + goto unlock; + + *value = mul_u64_u32_shr(REG_FIELD_GET(POWER_SETUP_I1_DATA_MASK, uval), + scale_factor, POWER_SETUP_I1_SHIFT); +unlock: + mutex_unlock(&hwmon->hwmon_lock); + return ret; +} + +static int xe_hwmon_power_curr_crit_write(struct xe_hwmon *hwmon, long value, u32 scale_factor) +{ + int ret; + u32 uval; + + mutex_lock(&hwmon->hwmon_lock); + + uval = DIV_ROUND_CLOSEST_ULL(value << POWER_SETUP_I1_SHIFT, scale_factor); + ret = xe_hwmon_pcode_write_i1(hwmon->gt, uval); + + mutex_unlock(&hwmon->hwmon_lock); 
+ return ret; +} + +static void xe_hwmon_get_voltage(struct xe_hwmon *hwmon, long *value) +{ + u64 reg_val; + + xe_hwmon_process_reg(hwmon, REG_GT_PERF_STATUS, + REG_READ32, ®_val, 0, 0); + /* HW register value in units of 2.5 millivolt */ + *value = DIV_ROUND_CLOSEST(REG_FIELD_GET(VOLTAGE_MASK, reg_val) * 2500, SF_VOLTAGE); +} + +static umode_t +xe_hwmon_power_is_visible(struct xe_hwmon *hwmon, u32 attr, int chan) +{ + u32 uval; + + switch (attr) { + case hwmon_power_max: + return xe_hwmon_get_reg(hwmon, REG_PKG_RAPL_LIMIT) ? 0664 : 0; + case hwmon_power_rated_max: + return xe_hwmon_get_reg(hwmon, REG_PKG_POWER_SKU) ? 0444 : 0; + case hwmon_power_crit: + return (xe_hwmon_pcode_read_i1(hwmon->gt, &uval) || + !(uval & POWER_SETUP_I1_WATTS)) ? 0 : 0644; + default: + return 0; + } +} + +static int +xe_hwmon_power_read(struct xe_hwmon *hwmon, u32 attr, int chan, long *val) +{ + switch (attr) { + case hwmon_power_max: + xe_hwmon_power_max_read(hwmon, val); + return 0; + case hwmon_power_rated_max: + xe_hwmon_power_rated_max_read(hwmon, val); + return 0; + case hwmon_power_crit: + return xe_hwmon_power_curr_crit_read(hwmon, val, SF_POWER); + default: + return -EOPNOTSUPP; + } +} + +static int +xe_hwmon_power_write(struct xe_hwmon *hwmon, u32 attr, int chan, long val) +{ + switch (attr) { + case hwmon_power_max: + return xe_hwmon_power_max_write(hwmon, val); + case hwmon_power_crit: + return xe_hwmon_power_curr_crit_write(hwmon, val, SF_POWER); + default: + return -EOPNOTSUPP; + } +} + +static umode_t +xe_hwmon_curr_is_visible(const struct xe_hwmon *hwmon, u32 attr) +{ + u32 uval; + + switch (attr) { + case hwmon_curr_crit: + return (xe_hwmon_pcode_read_i1(hwmon->gt, &uval) || + (uval & POWER_SETUP_I1_WATTS)) ? 0 : 0644; + default: + return 0; + } +} + +static int +xe_hwmon_curr_read(struct xe_hwmon *hwmon, u32 attr, long *val) +{ + switch (attr) { + case hwmon_curr_crit: + return xe_hwmon_power_curr_crit_read(hwmon, val, SF_CURR); + default: + return -EOPNOTSUPP; + } +} + +static int +xe_hwmon_curr_write(struct xe_hwmon *hwmon, u32 attr, long val) +{ + switch (attr) { + case hwmon_curr_crit: + return xe_hwmon_power_curr_crit_write(hwmon, val, SF_CURR); + default: + return -EOPNOTSUPP; + } +} + +static umode_t +xe_hwmon_in_is_visible(struct xe_hwmon *hwmon, u32 attr) +{ + switch (attr) { + case hwmon_in_input: + return xe_hwmon_get_reg(hwmon, REG_GT_PERF_STATUS) ? 0444 : 0; + default: + return 0; + } +} + +static int +xe_hwmon_in_read(struct xe_hwmon *hwmon, u32 attr, long *val) +{ + switch (attr) { + case hwmon_in_input: + xe_hwmon_get_voltage(hwmon, val); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static umode_t +xe_hwmon_energy_is_visible(struct xe_hwmon *hwmon, u32 attr) +{ + switch (attr) { + case hwmon_energy_input: + return xe_hwmon_get_reg(hwmon, REG_PKG_ENERGY_STATUS) ? 
0444 : 0; + default: + return 0; + } +} + +static int +xe_hwmon_energy_read(struct xe_hwmon *hwmon, u32 attr, long *val) +{ + switch (attr) { + case hwmon_energy_input: + xe_hwmon_energy_get(hwmon, val); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static umode_t +xe_hwmon_is_visible(const void *drvdata, enum hwmon_sensor_types type, + u32 attr, int channel) +{ + struct xe_hwmon *hwmon = (struct xe_hwmon *)drvdata; + int ret; + + xe_device_mem_access_get(gt_to_xe(hwmon->gt)); + + switch (type) { + case hwmon_power: + ret = xe_hwmon_power_is_visible(hwmon, attr, channel); + break; + case hwmon_curr: + ret = xe_hwmon_curr_is_visible(hwmon, attr); + break; + case hwmon_in: + ret = xe_hwmon_in_is_visible(hwmon, attr); + break; + case hwmon_energy: + ret = xe_hwmon_energy_is_visible(hwmon, attr); + break; + default: + ret = 0; + break; + } + + xe_device_mem_access_put(gt_to_xe(hwmon->gt)); + + return ret; +} + +static int +xe_hwmon_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, + int channel, long *val) +{ + struct xe_hwmon *hwmon = dev_get_drvdata(dev); + int ret; + + xe_device_mem_access_get(gt_to_xe(hwmon->gt)); + + switch (type) { + case hwmon_power: + ret = xe_hwmon_power_read(hwmon, attr, channel, val); + break; + case hwmon_curr: + ret = xe_hwmon_curr_read(hwmon, attr, val); + break; + case hwmon_in: + ret = xe_hwmon_in_read(hwmon, attr, val); + break; + case hwmon_energy: + ret = xe_hwmon_energy_read(hwmon, attr, val); + break; + default: + ret = -EOPNOTSUPP; + break; + } + + xe_device_mem_access_put(gt_to_xe(hwmon->gt)); + + return ret; +} + +static int +xe_hwmon_write(struct device *dev, enum hwmon_sensor_types type, u32 attr, + int channel, long val) +{ + struct xe_hwmon *hwmon = dev_get_drvdata(dev); + int ret; + + xe_device_mem_access_get(gt_to_xe(hwmon->gt)); + + switch (type) { + case hwmon_power: + ret = xe_hwmon_power_write(hwmon, attr, channel, val); + break; + case hwmon_curr: + ret = xe_hwmon_curr_write(hwmon, attr, val); + break; + default: + ret = -EOPNOTSUPP; + break; + } + + xe_device_mem_access_put(gt_to_xe(hwmon->gt)); + + return ret; +} + +static const struct hwmon_ops hwmon_ops = { + .is_visible = xe_hwmon_is_visible, + .read = xe_hwmon_read, + .write = xe_hwmon_write, +}; + +static const struct hwmon_chip_info hwmon_chip_info = { + .ops = &hwmon_ops, + .info = hwmon_info, +}; + +static void +xe_hwmon_get_preregistration_info(struct xe_device *xe) +{ + struct xe_hwmon *hwmon = xe->hwmon; + long energy; + u64 val_sku_unit = 0; + + /* + * The contents of register PKG_POWER_SKU_UNIT do not change, + * so read it once and store the shift values. + */ + if (xe_hwmon_get_reg(hwmon, REG_PKG_POWER_SKU_UNIT)) { + xe_hwmon_process_reg(hwmon, REG_PKG_POWER_SKU_UNIT, + REG_READ32, &val_sku_unit, 0, 0); + hwmon->scl_shift_power = REG_FIELD_GET(PKG_PWR_UNIT, val_sku_unit); + hwmon->scl_shift_energy = REG_FIELD_GET(PKG_ENERGY_UNIT, val_sku_unit); + hwmon->scl_shift_time = REG_FIELD_GET(PKG_TIME_UNIT, val_sku_unit); + } + + /* + * Initialize 'struct xe_hwmon_energy_info', i.e. 
set fields to the + * first value of the energy register read + */ + if (xe_hwmon_is_visible(hwmon, hwmon_energy, hwmon_energy_input, 0)) + xe_hwmon_energy_get(hwmon, &energy); +} + +static void xe_hwmon_mutex_destroy(void *arg) +{ + struct xe_hwmon *hwmon = arg; + + mutex_destroy(&hwmon->hwmon_lock); +} + +void xe_hwmon_register(struct xe_device *xe) +{ + struct device *dev = xe->drm.dev; + struct xe_hwmon *hwmon; + + /* hwmon is available only for dGfx */ + if (!IS_DGFX(xe)) + return; + + hwmon = devm_kzalloc(dev, sizeof(*hwmon), GFP_KERNEL); + if (!hwmon) + return; + + xe->hwmon = hwmon; + + mutex_init(&hwmon->hwmon_lock); + if (devm_add_action_or_reset(dev, xe_hwmon_mutex_destroy, hwmon)) + return; + + /* primary GT to access device level properties */ + hwmon->gt = xe->tiles[0].primary_gt; + + xe_hwmon_get_preregistration_info(xe); + + drm_dbg(&xe->drm, "Register xe hwmon interface\n"); + + /* hwmon_dev points to device hwmon<i> */ + hwmon->hwmon_dev = devm_hwmon_device_register_with_info(dev, "xe", hwmon, + &hwmon_chip_info, + hwmon_groups); + + if (IS_ERR(hwmon->hwmon_dev)) { + drm_warn(&xe->drm, "Failed to register xe hwmon (%pe)\n", hwmon->hwmon_dev); + xe->hwmon = NULL; + return; + } +} + diff --git a/drivers/gpu/drm/xe/xe_hwmon.h b/drivers/gpu/drm/xe/xe_hwmon.h new file mode 100644 index 000000000000..c42a1de2cd7a --- /dev/null +++ b/drivers/gpu/drm/xe/xe_hwmon.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_HWMON_H_ +#define _XE_HWMON_H_ + +#include <linux/types.h> + +struct xe_device; + +#if IS_REACHABLE(CONFIG_HWMON) +void xe_hwmon_register(struct xe_device *xe); +#else +static inline void xe_hwmon_register(struct xe_device *xe) { }; +#endif + +#endif /* _XE_HWMON_H_ */ diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c new file mode 100644 index 000000000000..d1f5ba4bb745 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_irq.c @@ -0,0 +1,666 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include "xe_irq.h" + +#include <linux/sched/clock.h> + +#include <drm/drm_managed.h> + +#include "regs/xe_gt_regs.h" +#include "regs/xe_regs.h" +#include "xe_device.h" +#include "xe_display.h" +#include "xe_drv.h" +#include "xe_gt.h" +#include "xe_guc.h" +#include "xe_hw_engine.h" +#include "xe_mmio.h" + +/* + * Interrupt registers for a unit are always consecutive and ordered + * ISR, IMR, IIR, IER. + */ +#define IMR(offset) XE_REG(offset + 0x4) +#define IIR(offset) XE_REG(offset + 0x8) +#define IER(offset) XE_REG(offset + 0xc) + +static void assert_iir_is_zero(struct xe_gt *mmio, struct xe_reg reg) +{ + u32 val = xe_mmio_read32(mmio, reg); + + if (val == 0) + return; + + drm_WARN(>_to_xe(mmio)->drm, 1, + "Interrupt register 0x%x is not zero: 0x%08x\n", + reg.addr, val); + xe_mmio_write32(mmio, reg, 0xffffffff); + xe_mmio_read32(mmio, reg); + xe_mmio_write32(mmio, reg, 0xffffffff); + xe_mmio_read32(mmio, reg); +} + +/* + * Unmask and enable the specified interrupts. Does not check current state, + * so any bits not specified here will become masked and disabled. + */ +static void unmask_and_enable(struct xe_tile *tile, u32 irqregs, u32 bits) +{ + struct xe_gt *mmio = tile->primary_gt; + + /* + * If we're just enabling an interrupt now, it shouldn't already + * be raised in the IIR. 
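+ * assert_iir_is_zero() below warns if a stale event is already latched and
+ * clears it (twice, since the IIR can queue up two events).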
+ */ + assert_iir_is_zero(mmio, IIR(irqregs)); + + xe_mmio_write32(mmio, IER(irqregs), bits); + xe_mmio_write32(mmio, IMR(irqregs), ~bits); + + /* Posting read */ + xe_mmio_read32(mmio, IMR(irqregs)); +} + +/* Mask and disable all interrupts. */ +static void mask_and_disable(struct xe_tile *tile, u32 irqregs) +{ + struct xe_gt *mmio = tile->primary_gt; + + xe_mmio_write32(mmio, IMR(irqregs), ~0); + /* Posting read */ + xe_mmio_read32(mmio, IMR(irqregs)); + + xe_mmio_write32(mmio, IER(irqregs), 0); + + /* IIR can theoretically queue up two events. Be paranoid. */ + xe_mmio_write32(mmio, IIR(irqregs), ~0); + xe_mmio_read32(mmio, IIR(irqregs)); + xe_mmio_write32(mmio, IIR(irqregs), ~0); + xe_mmio_read32(mmio, IIR(irqregs)); +} + +static u32 xelp_intr_disable(struct xe_device *xe) +{ + struct xe_gt *mmio = xe_root_mmio_gt(xe); + + xe_mmio_write32(mmio, GFX_MSTR_IRQ, 0); + + /* + * Now with master disabled, get a sample of level indications + * for this interrupt. Indications will be cleared on related acks. + * New indications can and will light up during processing, + * and will generate new interrupt after enabling master. + */ + return xe_mmio_read32(mmio, GFX_MSTR_IRQ); +} + +static u32 +gu_misc_irq_ack(struct xe_device *xe, const u32 master_ctl) +{ + struct xe_gt *mmio = xe_root_mmio_gt(xe); + u32 iir; + + if (!(master_ctl & GU_MISC_IRQ)) + return 0; + + iir = xe_mmio_read32(mmio, IIR(GU_MISC_IRQ_OFFSET)); + if (likely(iir)) + xe_mmio_write32(mmio, IIR(GU_MISC_IRQ_OFFSET), iir); + + return iir; +} + +static inline void xelp_intr_enable(struct xe_device *xe, bool stall) +{ + struct xe_gt *mmio = xe_root_mmio_gt(xe); + + xe_mmio_write32(mmio, GFX_MSTR_IRQ, MASTER_IRQ); + if (stall) + xe_mmio_read32(mmio, GFX_MSTR_IRQ); +} + +/* Enable/unmask the HWE interrupts for a specific GT's engines. 
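+ * Most of the enable/mask registers written below pack two engine classes or
+ * instances into one 32-bit register (one per 16-bit half), which is why both
+ * a dual mask (dmask, both halves) and an upper-half-only mask (smask) are
+ * built from the same irqs value.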
*/ +void xe_irq_enable_hwe(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + u32 ccs_mask, bcs_mask; + u32 irqs, dmask, smask; + u32 gsc_mask = 0; + + if (xe_device_uc_enabled(xe)) { + irqs = GT_RENDER_USER_INTERRUPT | + GT_RENDER_PIPECTL_NOTIFY_INTERRUPT; + } else { + irqs = GT_RENDER_USER_INTERRUPT | + GT_CS_MASTER_ERROR_INTERRUPT | + GT_CONTEXT_SWITCH_INTERRUPT | + GT_WAIT_SEMAPHORE_INTERRUPT; + } + + ccs_mask = xe_hw_engine_mask_per_class(gt, XE_ENGINE_CLASS_COMPUTE); + bcs_mask = xe_hw_engine_mask_per_class(gt, XE_ENGINE_CLASS_COPY); + + dmask = irqs << 16 | irqs; + smask = irqs << 16; + + if (!xe_gt_is_media_type(gt)) { + /* Enable interrupts for each engine class */ + xe_mmio_write32(gt, RENDER_COPY_INTR_ENABLE, dmask); + if (ccs_mask) + xe_mmio_write32(gt, CCS_RSVD_INTR_ENABLE, smask); + + /* Unmask interrupts for each engine instance */ + xe_mmio_write32(gt, RCS0_RSVD_INTR_MASK, ~smask); + xe_mmio_write32(gt, BCS_RSVD_INTR_MASK, ~smask); + if (bcs_mask & (BIT(1)|BIT(2))) + xe_mmio_write32(gt, XEHPC_BCS1_BCS2_INTR_MASK, ~dmask); + if (bcs_mask & (BIT(3)|BIT(4))) + xe_mmio_write32(gt, XEHPC_BCS3_BCS4_INTR_MASK, ~dmask); + if (bcs_mask & (BIT(5)|BIT(6))) + xe_mmio_write32(gt, XEHPC_BCS5_BCS6_INTR_MASK, ~dmask); + if (bcs_mask & (BIT(7)|BIT(8))) + xe_mmio_write32(gt, XEHPC_BCS7_BCS8_INTR_MASK, ~dmask); + if (ccs_mask & (BIT(0)|BIT(1))) + xe_mmio_write32(gt, CCS0_CCS1_INTR_MASK, ~dmask); + if (ccs_mask & (BIT(2)|BIT(3))) + xe_mmio_write32(gt, CCS2_CCS3_INTR_MASK, ~dmask); + } + + if (xe_gt_is_media_type(gt) || MEDIA_VER(xe) < 13) { + /* Enable interrupts for each engine class */ + xe_mmio_write32(gt, VCS_VECS_INTR_ENABLE, dmask); + + /* Unmask interrupts for each engine instance */ + xe_mmio_write32(gt, VCS0_VCS1_INTR_MASK, ~dmask); + xe_mmio_write32(gt, VCS2_VCS3_INTR_MASK, ~dmask); + xe_mmio_write32(gt, VECS0_VECS1_INTR_MASK, ~dmask); + + if (xe_hw_engine_mask_per_class(gt, XE_ENGINE_CLASS_OTHER)) + gsc_mask = irqs; + else if (HAS_HECI_GSCFI(xe)) + gsc_mask = GSC_IRQ_INTF(1); + if (gsc_mask) { + xe_mmio_write32(gt, GUNIT_GSC_INTR_ENABLE, gsc_mask); + xe_mmio_write32(gt, GUNIT_GSC_INTR_MASK, ~gsc_mask); + } + } +} + +static u32 +gt_engine_identity(struct xe_device *xe, + struct xe_gt *mmio, + const unsigned int bank, + const unsigned int bit) +{ + u32 timeout_ts; + u32 ident; + + lockdep_assert_held(&xe->irq.lock); + + xe_mmio_write32(mmio, IIR_REG_SELECTOR(bank), BIT(bit)); + + /* + * NB: Specs do not specify how long to spin wait, + * so we do ~100us as an educated guess. 
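+ * local_clock() returns nanoseconds; shifting right by 10 is a cheap
+ * approximation of microseconds, so timeout_ts below is roughly "now + 100us".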
+ */ + timeout_ts = (local_clock() >> 10) + 100; + do { + ident = xe_mmio_read32(mmio, INTR_IDENTITY_REG(bank)); + } while (!(ident & INTR_DATA_VALID) && + !time_after32(local_clock() >> 10, timeout_ts)); + + if (unlikely(!(ident & INTR_DATA_VALID))) { + drm_err(&xe->drm, "INTR_IDENTITY_REG%u:%u 0x%08x not valid!\n", + bank, bit, ident); + return 0; + } + + xe_mmio_write32(mmio, INTR_IDENTITY_REG(bank), ident); + + return ident; +} + +#define OTHER_MEDIA_GUC_INSTANCE 16 + +static void +gt_other_irq_handler(struct xe_gt *gt, const u8 instance, const u16 iir) +{ + if (instance == OTHER_GUC_INSTANCE && !xe_gt_is_media_type(gt)) + return xe_guc_irq_handler(&gt->uc.guc, iir); + if (instance == OTHER_MEDIA_GUC_INSTANCE && xe_gt_is_media_type(gt)) + return xe_guc_irq_handler(&gt->uc.guc, iir); + + if (instance != OTHER_GUC_INSTANCE && + instance != OTHER_MEDIA_GUC_INSTANCE) { + WARN_ONCE(1, "unhandled other interrupt instance=0x%x, iir=0x%x\n", + instance, iir); + } +} + +static struct xe_gt *pick_engine_gt(struct xe_tile *tile, + enum xe_engine_class class, + unsigned int instance) +{ + struct xe_device *xe = tile_to_xe(tile); + + if (MEDIA_VER(xe) < 13) + return tile->primary_gt; + + if (class == XE_ENGINE_CLASS_VIDEO_DECODE || + class == XE_ENGINE_CLASS_VIDEO_ENHANCE) + return tile->media_gt; + + if (class == XE_ENGINE_CLASS_OTHER && + (instance == OTHER_MEDIA_GUC_INSTANCE || instance == OTHER_GSC_INSTANCE)) + return tile->media_gt; + + return tile->primary_gt; +} + +static void gt_irq_handler(struct xe_tile *tile, + u32 master_ctl, unsigned long *intr_dw, + u32 *identity) +{ + struct xe_device *xe = tile_to_xe(tile); + struct xe_gt *mmio = tile->primary_gt; + unsigned int bank, bit; + u16 instance, intr_vec; + enum xe_engine_class class; + struct xe_hw_engine *hwe; + + spin_lock(&xe->irq.lock); + + for (bank = 0; bank < 2; bank++) { + if (!(master_ctl & GT_DW_IRQ(bank))) + continue; + + intr_dw[bank] = xe_mmio_read32(mmio, GT_INTR_DW(bank)); + for_each_set_bit(bit, intr_dw + bank, 32) + identity[bit] = gt_engine_identity(xe, mmio, bank, bit); + xe_mmio_write32(mmio, GT_INTR_DW(bank), intr_dw[bank]); + + for_each_set_bit(bit, intr_dw + bank, 32) { + struct xe_gt *engine_gt; + + class = INTR_ENGINE_CLASS(identity[bit]); + instance = INTR_ENGINE_INSTANCE(identity[bit]); + intr_vec = INTR_ENGINE_INTR(identity[bit]); + + engine_gt = pick_engine_gt(tile, class, instance); + + hwe = xe_gt_hw_engine(engine_gt, class, instance, false); + if (hwe) { + xe_hw_engine_handle_irq(hwe, intr_vec); + continue; + } + + if (class == XE_ENGINE_CLASS_OTHER) { + /* HECI GSCFI interrupts come from outside of GT */ + if (HAS_HECI_GSCFI(xe) && instance == OTHER_GSC_INSTANCE) + xe_heci_gsc_irq_handler(xe, intr_vec); + else + gt_other_irq_handler(engine_gt, instance, intr_vec); + continue; + } + } + } + + spin_unlock(&xe->irq.lock); +} + +/* + * Top-level interrupt handler for Xe_LP platforms (which did not have + * a "master tile" interrupt register).
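+ * On those parts GFX_MSTR_IRQ is the single top-level control: the handler
+ * disables it, services whatever GT and display sources it reported, and
+ * then re-enables it.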
+ */ +static irqreturn_t xelp_irq_handler(int irq, void *arg) +{ + struct xe_device *xe = arg; + struct xe_tile *tile = xe_device_get_root_tile(xe); + u32 master_ctl, gu_misc_iir; + unsigned long intr_dw[2]; + u32 identity[32]; + + spin_lock(&xe->irq.lock); + if (!xe->irq.enabled) { + spin_unlock(&xe->irq.lock); + return IRQ_NONE; + } + spin_unlock(&xe->irq.lock); + + master_ctl = xelp_intr_disable(xe); + if (!master_ctl) { + xelp_intr_enable(xe, false); + return IRQ_NONE; + } + + gt_irq_handler(tile, master_ctl, intr_dw, identity); + + xe_display_irq_handler(xe, master_ctl); + + gu_misc_iir = gu_misc_irq_ack(xe, master_ctl); + + xelp_intr_enable(xe, false); + + xe_display_irq_enable(xe, gu_misc_iir); + + return IRQ_HANDLED; +} + +static u32 dg1_intr_disable(struct xe_device *xe) +{ + struct xe_gt *mmio = xe_root_mmio_gt(xe); + u32 val; + + /* First disable interrupts */ + xe_mmio_write32(mmio, DG1_MSTR_TILE_INTR, 0); + + /* Get the indication levels and ack the master unit */ + val = xe_mmio_read32(mmio, DG1_MSTR_TILE_INTR); + if (unlikely(!val)) + return 0; + + xe_mmio_write32(mmio, DG1_MSTR_TILE_INTR, val); + + return val; +} + +static void dg1_intr_enable(struct xe_device *xe, bool stall) +{ + struct xe_gt *mmio = xe_root_mmio_gt(xe); + + xe_mmio_write32(mmio, DG1_MSTR_TILE_INTR, DG1_MSTR_IRQ); + if (stall) + xe_mmio_read32(mmio, DG1_MSTR_TILE_INTR); +} + +/* + * Top-level interrupt handler for Xe_LP+ and beyond. These platforms have + * a "master tile" interrupt register which must be consulted before the + * "graphics master" interrupt register. + */ +static irqreturn_t dg1_irq_handler(int irq, void *arg) +{ + struct xe_device *xe = arg; + struct xe_tile *tile; + u32 master_tile_ctl, master_ctl = 0, gu_misc_iir = 0; + unsigned long intr_dw[2]; + u32 identity[32]; + u8 id; + + /* TODO: This really shouldn't be copied+pasted */ + + spin_lock(&xe->irq.lock); + if (!xe->irq.enabled) { + spin_unlock(&xe->irq.lock); + return IRQ_NONE; + } + spin_unlock(&xe->irq.lock); + + master_tile_ctl = dg1_intr_disable(xe); + if (!master_tile_ctl) { + dg1_intr_enable(xe, false); + return IRQ_NONE; + } + + for_each_tile(tile, xe, id) { + struct xe_gt *mmio = tile->primary_gt; + + if ((master_tile_ctl & DG1_MSTR_TILE(tile->id)) == 0) + continue; + + master_ctl = xe_mmio_read32(mmio, GFX_MSTR_IRQ); + + /* + * We might be in irq handler just when PCIe DPC is initiated + * and all MMIO reads will be returned with all 1's. Ignore this + * irq as device is inaccessible. + */ + if (master_ctl == REG_GENMASK(31, 0)) { + dev_dbg(tile_to_xe(tile)->drm.dev, + "Ignore this IRQ as device might be in DPC containment.\n"); + return IRQ_HANDLED; + } + + xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl); + + gt_irq_handler(tile, master_ctl, intr_dw, identity); + + /* + * Display interrupts (including display backlight operations + * that get reported as Gunit GSE) would only be hooked up to + * the primary tile. + */ + if (id == 0) { + xe_display_irq_handler(xe, master_ctl); + gu_misc_iir = gu_misc_irq_ack(xe, master_ctl); + } + } + + dg1_intr_enable(xe, false); + xe_display_irq_enable(xe, gu_misc_iir); + + return IRQ_HANDLED; +} + +static void gt_irq_reset(struct xe_tile *tile) +{ + struct xe_gt *mmio = tile->primary_gt; + + u32 ccs_mask = xe_hw_engine_mask_per_class(tile->primary_gt, + XE_ENGINE_CLASS_COMPUTE); + u32 bcs_mask = xe_hw_engine_mask_per_class(tile->primary_gt, + XE_ENGINE_CLASS_COPY); + + /* Disable RCS, BCS, VCS and VECS class engines. 
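+ * (CCS class interrupts are disabled below as well when compute engines are
+ * present, and the GSC (where present), GPM/WGBOXPERF and GuC interrupts
+ * further down.)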
*/ + xe_mmio_write32(mmio, RENDER_COPY_INTR_ENABLE, 0); + xe_mmio_write32(mmio, VCS_VECS_INTR_ENABLE, 0); + if (ccs_mask) + xe_mmio_write32(mmio, CCS_RSVD_INTR_ENABLE, 0); + + /* Restore masks irqs on RCS, BCS, VCS and VECS engines. */ + xe_mmio_write32(mmio, RCS0_RSVD_INTR_MASK, ~0); + xe_mmio_write32(mmio, BCS_RSVD_INTR_MASK, ~0); + if (bcs_mask & (BIT(1)|BIT(2))) + xe_mmio_write32(mmio, XEHPC_BCS1_BCS2_INTR_MASK, ~0); + if (bcs_mask & (BIT(3)|BIT(4))) + xe_mmio_write32(mmio, XEHPC_BCS3_BCS4_INTR_MASK, ~0); + if (bcs_mask & (BIT(5)|BIT(6))) + xe_mmio_write32(mmio, XEHPC_BCS5_BCS6_INTR_MASK, ~0); + if (bcs_mask & (BIT(7)|BIT(8))) + xe_mmio_write32(mmio, XEHPC_BCS7_BCS8_INTR_MASK, ~0); + xe_mmio_write32(mmio, VCS0_VCS1_INTR_MASK, ~0); + xe_mmio_write32(mmio, VCS2_VCS3_INTR_MASK, ~0); + xe_mmio_write32(mmio, VECS0_VECS1_INTR_MASK, ~0); + if (ccs_mask & (BIT(0)|BIT(1))) + xe_mmio_write32(mmio, CCS0_CCS1_INTR_MASK, ~0); + if (ccs_mask & (BIT(2)|BIT(3))) + xe_mmio_write32(mmio, CCS2_CCS3_INTR_MASK, ~0); + + if ((tile->media_gt && + xe_hw_engine_mask_per_class(tile->media_gt, XE_ENGINE_CLASS_OTHER)) || + HAS_HECI_GSCFI(tile_to_xe(tile))) { + xe_mmio_write32(mmio, GUNIT_GSC_INTR_ENABLE, 0); + xe_mmio_write32(mmio, GUNIT_GSC_INTR_MASK, ~0); + } + + xe_mmio_write32(mmio, GPM_WGBOXPERF_INTR_ENABLE, 0); + xe_mmio_write32(mmio, GPM_WGBOXPERF_INTR_MASK, ~0); + xe_mmio_write32(mmio, GUC_SG_INTR_ENABLE, 0); + xe_mmio_write32(mmio, GUC_SG_INTR_MASK, ~0); +} + +static void xelp_irq_reset(struct xe_tile *tile) +{ + xelp_intr_disable(tile_to_xe(tile)); + + gt_irq_reset(tile); + + mask_and_disable(tile, PCU_IRQ_OFFSET); +} + +static void dg1_irq_reset(struct xe_tile *tile) +{ + if (tile->id == 0) + dg1_intr_disable(tile_to_xe(tile)); + + gt_irq_reset(tile); + + mask_and_disable(tile, PCU_IRQ_OFFSET); +} + +static void dg1_irq_reset_mstr(struct xe_tile *tile) +{ + struct xe_gt *mmio = tile->primary_gt; + + xe_mmio_write32(mmio, GFX_MSTR_IRQ, ~0); +} + +static void xe_irq_reset(struct xe_device *xe) +{ + struct xe_tile *tile; + u8 id; + + for_each_tile(tile, xe, id) { + if (GRAPHICS_VERx100(xe) >= 1210) + dg1_irq_reset(tile); + else + xelp_irq_reset(tile); + } + + tile = xe_device_get_root_tile(xe); + mask_and_disable(tile, GU_MISC_IRQ_OFFSET); + xe_display_irq_reset(xe); + + /* + * The tile's top-level status register should be the last one + * to be reset to avoid possible bit re-latching from lower + * level interrupts. + */ + if (GRAPHICS_VERx100(xe) >= 1210) { + for_each_tile(tile, xe, id) + dg1_irq_reset_mstr(tile); + } +} + +static void xe_irq_postinstall(struct xe_device *xe) +{ + xe_display_irq_postinstall(xe, xe_root_mmio_gt(xe)); + + /* + * ASLE backlight operations are reported via GUnit GSE interrupts + * on the root tile. 
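+ * That is why only GU_MISC_GSE is unmasked, and only on the root tile,
+ * before the top-level master interrupt is turned back on below.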
+ */ + unmask_and_enable(xe_device_get_root_tile(xe), + GU_MISC_IRQ_OFFSET, GU_MISC_GSE); + + /* Enable top-level interrupts */ + if (GRAPHICS_VERx100(xe) >= 1210) + dg1_intr_enable(xe, true); + else + xelp_intr_enable(xe, true); +} + +static irq_handler_t xe_irq_handler(struct xe_device *xe) +{ + if (GRAPHICS_VERx100(xe) >= 1210) + return dg1_irq_handler; + else + return xelp_irq_handler; +} + +static void irq_uninstall(struct drm_device *drm, void *arg) +{ + struct xe_device *xe = arg; + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + int irq; + + if (!xe->irq.enabled) + return; + + xe->irq.enabled = false; + xe_irq_reset(xe); + + irq = pci_irq_vector(pdev, 0); + free_irq(irq, xe); +} + +int xe_irq_install(struct xe_device *xe) +{ + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + irq_handler_t irq_handler; + int err, irq; + + irq_handler = xe_irq_handler(xe); + if (!irq_handler) { + drm_err(&xe->drm, "No supported interrupt handler"); + return -EINVAL; + } + + xe_irq_reset(xe); + + err = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI | PCI_IRQ_MSIX); + if (err < 0) { + drm_err(&xe->drm, "MSI/MSIX: Failed to enable support %d\n", err); + return err; + } + + irq = pci_irq_vector(pdev, 0); + err = request_irq(irq, irq_handler, IRQF_SHARED, DRIVER_NAME, xe); + if (err < 0) { + drm_err(&xe->drm, "Failed to request MSI/MSIX IRQ %d\n", err); + return err; + } + + xe->irq.enabled = true; + + xe_irq_postinstall(xe); + + err = drmm_add_action_or_reset(&xe->drm, irq_uninstall, xe); + if (err) + goto free_irq_handler; + + return 0; + +free_irq_handler: + free_irq(irq, xe); + + return err; +} + +void xe_irq_shutdown(struct xe_device *xe) +{ + irq_uninstall(&xe->drm, xe); +} + +void xe_irq_suspend(struct xe_device *xe) +{ + int irq = to_pci_dev(xe->drm.dev)->irq; + + spin_lock_irq(&xe->irq.lock); + xe->irq.enabled = false; /* no new irqs */ + spin_unlock_irq(&xe->irq.lock); + + synchronize_irq(irq); /* flush irqs */ + xe_irq_reset(xe); /* turn irqs off */ +} + +void xe_irq_resume(struct xe_device *xe) +{ + struct xe_gt *gt; + int id; + + /* + * lock not needed: + * 1. no irq will arrive before the postinstall + * 2. 
display is not yet resumed + */ + xe->irq.enabled = true; + xe_irq_reset(xe); + xe_irq_postinstall(xe); /* turn irqs on */ + + for_each_gt(gt, xe, id) + xe_irq_enable_hwe(gt); +} diff --git a/drivers/gpu/drm/xe/xe_irq.h b/drivers/gpu/drm/xe/xe_irq.h new file mode 100644 index 000000000000..bc42bc90d967 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_irq.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_IRQ_H_ +#define _XE_IRQ_H_ + +struct xe_device; +struct xe_tile; +struct xe_gt; + +int xe_irq_install(struct xe_device *xe); +void xe_irq_shutdown(struct xe_device *xe); +void xe_irq_suspend(struct xe_device *xe); +void xe_irq_resume(struct xe_device *xe); +void xe_irq_enable_hwe(struct xe_gt *gt); + +#endif diff --git a/drivers/gpu/drm/xe/xe_lmtt.c b/drivers/gpu/drm/xe/xe_lmtt.c new file mode 100644 index 000000000000..0d7c5514e092 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_lmtt.c @@ -0,0 +1,506 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include <linux/align.h> + +#include <drm/drm_managed.h> + +#include "regs/xe_sriov_regs.h" + +#include "xe_assert.h" +#include "xe_bo.h" +#include "xe_lmtt.h" +#include "xe_map.h" +#include "xe_mmio.h" +#include "xe_res_cursor.h" +#include "xe_sriov.h" +#include "xe_sriov_printk.h" + +/** + * DOC: Local Memory Translation Table + * + * The Local Memory Translation Table (LMTT) provides additional abstraction + * when Virtual Function (VF) is accessing device Local Memory (VRAM). + * + * The Root LMTT Page Directory contains one entry for each VF. Entries are + * indexed by the function number (1-based, index 0 is unused). + * + * See `Two-Level LMTT Structure`_ and `Multi-Level LMTT Structure`_. + */ + +#define lmtt_assert(lmtt, condition) xe_tile_assert(lmtt_to_tile(lmtt), condition) +#define lmtt_debug(lmtt, msg...) xe_sriov_dbg_verbose(lmtt_to_xe(lmtt), "LMTT: " msg) + +static bool xe_has_multi_level_lmtt(struct xe_device *xe) +{ + return xe->info.platform == XE_PVC; +} + +static struct xe_tile *lmtt_to_tile(struct xe_lmtt *lmtt) +{ + return container_of(lmtt, struct xe_tile, sriov.pf.lmtt); +} + +static struct xe_device *lmtt_to_xe(struct xe_lmtt *lmtt) +{ + return tile_to_xe(lmtt_to_tile(lmtt)); +} + +static u64 lmtt_page_size(struct xe_lmtt *lmtt) +{ + return BIT_ULL(lmtt->ops->lmtt_pte_shift(0)); +} + +static struct xe_lmtt_pt *lmtt_pt_alloc(struct xe_lmtt *lmtt, unsigned int level) +{ + unsigned int num_entries = level ? 
lmtt->ops->lmtt_pte_num(level) : 0; + struct xe_lmtt_pt *pt; + struct xe_bo *bo; + int err; + + pt = kzalloc(struct_size(pt, entries, num_entries), GFP_KERNEL); + if (!pt) { + err = -ENOMEM; + goto out; + } + + bo = xe_bo_create_pin_map(lmtt_to_xe(lmtt), lmtt_to_tile(lmtt), NULL, + PAGE_ALIGN(lmtt->ops->lmtt_pte_size(level) * + lmtt->ops->lmtt_pte_num(level)), + ttm_bo_type_kernel, + XE_BO_CREATE_VRAM_IF_DGFX(lmtt_to_tile(lmtt)) | + XE_BO_CREATE_PINNED_BIT); + if (IS_ERR(bo)) { + err = PTR_ERR(bo); + goto out_free_pt; + } + + lmtt_assert(lmtt, xe_bo_is_vram(bo)); + + pt->level = level; + pt->bo = bo; + return pt; + +out_free_pt: + kfree(pt); +out: + return ERR_PTR(err); +} + +static void lmtt_pt_free(struct xe_lmtt_pt *pt) +{ + xe_bo_unpin_map_no_vm(pt->bo); + kfree(pt); +} + +static int lmtt_init_pd(struct xe_lmtt *lmtt) +{ + struct xe_lmtt_pt *pd; + + lmtt_assert(lmtt, !lmtt->pd); + lmtt_assert(lmtt, lmtt->ops->lmtt_root_pd_level()); + + pd = lmtt_pt_alloc(lmtt, lmtt->ops->lmtt_root_pd_level()); + if (IS_ERR(pd)) + return PTR_ERR(pd); + + lmtt->pd = pd; + return 0; +} + +static void lmtt_fini_pd(struct xe_lmtt *lmtt) +{ + struct xe_lmtt_pt *pd = lmtt->pd; + unsigned int num_entries = lmtt->ops->lmtt_pte_num(pd->level); + unsigned int n = 0; + + /* make sure we don't leak */ + for (n = 0; n < num_entries; n++) + lmtt_assert(lmtt, !pd->entries[n]); + + lmtt->pd = NULL; + lmtt_pt_free(pd); +} + +static void fini_lmtt(struct drm_device *drm, void *arg) +{ + struct xe_lmtt *lmtt = arg; + + lmtt_assert(lmtt, !(!!lmtt->ops ^ !!lmtt->pd)); + + if (!lmtt->pd) + return; + + lmtt_fini_pd(lmtt); + lmtt->ops = NULL; +} + +/** + * xe_lmtt_init - LMTT software initialization. + * @lmtt: the &xe_lmtt to initialize + * + * The LMTT initialization requires two steps. + * + * The xe_lmtt_init() checks if LMTT is required on current device and selects + * and initialize proper variant of the LMTT Root Directory. Currently supported + * variants are `Two-Level LMTT Structure`_ and `Multi-Level LMTT Structure`_. + * + * In next step xe_lmtt_init_hw() will register this directory on the hardware. + * + * Notes: + * The LMTT allocations are managed and will be implicitly released on driver unload. + * This function shall be called only once and only when running as a PF driver. + * Any LMTT initialization failure should block VFs enabling. + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_lmtt_init(struct xe_lmtt *lmtt) +{ + struct xe_device *xe = lmtt_to_xe(lmtt); + int err; + + lmtt_assert(lmtt, IS_SRIOV_PF(xe)); + lmtt_assert(lmtt, !lmtt->ops); + + if (!IS_DGFX(xe)) + return 0; + + if (xe_has_multi_level_lmtt(xe)) + lmtt->ops = &lmtt_ml_ops; + else + lmtt->ops = &lmtt_2l_ops; + + err = lmtt_init_pd(lmtt); + if (unlikely(err)) + goto fail; + + return drmm_add_action_or_reset(&xe->drm, fini_lmtt, lmtt); + +fail: + lmtt->ops = NULL; + return err; +} + +static void lmtt_setup_dir_ptr(struct xe_lmtt *lmtt) +{ + struct xe_tile *tile = lmtt_to_tile(lmtt); + struct xe_device *xe = tile_to_xe(tile); + dma_addr_t offset = xe_bo_main_addr(lmtt->pd->bo, XE_PAGE_SIZE); + + lmtt_debug(lmtt, "DIR offset %pad\n", &offset); + lmtt_assert(lmtt, xe_bo_is_vram(lmtt->pd->bo)); + lmtt_assert(lmtt, IS_ALIGNED(offset, SZ_64K)); + + xe_mmio_write32(tile->primary_gt, + GRAPHICS_VER(xe) >= 20 ? XE2_LMEM_CFG : LMEM_CFG, + LMEM_EN | REG_FIELD_PREP(LMTT_DIR_PTR, offset / SZ_64K)); +} + +/** + * xe_lmtt_init_hw - Perform LMTT hardware initialization. 
+ * @lmtt: the &xe_lmtt to initialize + * + * This function is a second step of the LMTT initialization. + * This function registers LMTT Root Directory prepared in xe_lmtt_init(). + * + * This function shall be called after every hardware reset. + * This function shall be called only when running as a PF driver. + */ +void xe_lmtt_init_hw(struct xe_lmtt *lmtt) +{ + if (!lmtt->pd) + return; + + lmtt_setup_dir_ptr(lmtt); +} + +static void lmtt_write_pte(struct xe_lmtt *lmtt, struct xe_lmtt_pt *pt, + u64 pte, unsigned int idx) +{ + unsigned int level = pt->level; + + lmtt_assert(lmtt, idx <= lmtt->ops->lmtt_pte_num(level)); + lmtt_debug(lmtt, "WRITE level=%u index=%u pte=%#llx\n", level, idx, pte); + + switch (lmtt->ops->lmtt_pte_size(level)) { + case sizeof(u32): + xe_map_wr(lmtt_to_xe(lmtt), &pt->bo->vmap, idx * sizeof(u32), u32, pte); + break; + case sizeof(u64): + xe_map_wr(lmtt_to_xe(lmtt), &pt->bo->vmap, idx * sizeof(u64), u64, pte); + break; + default: + lmtt_assert(lmtt, !!!"invalid pte size"); + } +} + +static void lmtt_destroy_pt(struct xe_lmtt *lmtt, struct xe_lmtt_pt *pd) +{ + unsigned int num_entries = pd->level ? lmtt->ops->lmtt_pte_num(pd->level) : 0; + struct xe_lmtt_pt *pt; + unsigned int i; + + for (i = 0; i < num_entries; i++) { + pt = pd->entries[i]; + pd->entries[i] = NULL; + if (!pt) + continue; + + lmtt_destroy_pt(lmtt, pt); + } + + lmtt_pt_free(pd); +} + +static void lmtt_drop_pages(struct xe_lmtt *lmtt, unsigned int vfid) +{ + struct xe_lmtt_pt *pd = lmtt->pd; + struct xe_lmtt_pt *pt; + + pt = pd->entries[vfid]; + pd->entries[vfid] = NULL; + if (!pt) + return; + + lmtt_write_pte(lmtt, pd, LMTT_PTE_INVALID, vfid); + + lmtt_assert(lmtt, pd->level > 0); + lmtt_assert(lmtt, pt->level == pd->level - 1); + lmtt_destroy_pt(lmtt, pt); +} + +static int __lmtt_alloc_range(struct xe_lmtt *lmtt, struct xe_lmtt_pt *pd, + u64 start, u64 end) +{ + u64 pte_addr_shift = BIT_ULL(lmtt->ops->lmtt_pte_shift(pd->level)); + u64 offset; + int err; + + lmtt_assert(lmtt, pd->level > 0); + + offset = start; + while (offset < end) { + struct xe_lmtt_pt *pt; + u64 next, pde, pt_addr; + unsigned int idx; + + pt = lmtt_pt_alloc(lmtt, pd->level - 1); + if (IS_ERR(pt)) + return PTR_ERR(pt); + + pt_addr = xe_bo_main_addr(pt->bo, XE_PAGE_SIZE); + + idx = lmtt->ops->lmtt_pte_index(offset, pd->level); + pde = lmtt->ops->lmtt_pte_encode(pt_addr, pd->level); + + lmtt_write_pte(lmtt, pd, pde, idx); + + pd->entries[idx] = pt; + + next = min(end, round_up(offset + 1, pte_addr_shift)); + + if (pt->level != 0) { + err = __lmtt_alloc_range(lmtt, pt, offset, next); + if (err) + return err; + } + + offset = next; + } + + return 0; +} + +static int lmtt_alloc_range(struct xe_lmtt *lmtt, unsigned int vfid, u64 start, u64 end) +{ + struct xe_lmtt_pt *pd = lmtt->pd; + struct xe_lmtt_pt *pt; + u64 pt_addr; + u64 pde; + int err; + + lmtt_assert(lmtt, pd->level > 0); + lmtt_assert(lmtt, vfid <= lmtt->ops->lmtt_pte_num(pd->level)); + lmtt_assert(lmtt, IS_ALIGNED(start, lmtt_page_size(lmtt))); + lmtt_assert(lmtt, IS_ALIGNED(end, lmtt_page_size(lmtt))); + + if (pd->entries[vfid]) + return -ENOTEMPTY; + + pt = lmtt_pt_alloc(lmtt, pd->level - 1); + if (IS_ERR(pt)) + return PTR_ERR(pt); + + pt_addr = xe_bo_main_addr(pt->bo, XE_PAGE_SIZE); + + pde = lmtt->ops->lmtt_pte_encode(pt_addr, pd->level); + + lmtt_write_pte(lmtt, pd, pde, vfid); + + pd->entries[vfid] = pt; + + if (pt->level != 0) { + err = __lmtt_alloc_range(lmtt, pt, start, end); + if (err) + goto out_free_pt; + } + + return 0; + +out_free_pt: + lmtt_pt_free(pt); + 
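+ /* propagate the failure from __lmtt_alloc_range() */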
return err; +} + +static struct xe_lmtt_pt *lmtt_leaf_pt(struct xe_lmtt *lmtt, unsigned int vfid, u64 addr) +{ + struct xe_lmtt_pt *pd = lmtt->pd; + struct xe_lmtt_pt *pt; + + lmtt_assert(lmtt, vfid <= lmtt->ops->lmtt_pte_num(pd->level)); + pt = pd->entries[vfid]; + + while (pt->level) { + lmtt_assert(lmtt, lmtt->ops->lmtt_pte_index(addr, pt->level) <= + lmtt->ops->lmtt_pte_num(pt->level)); + + pt = pt->entries[lmtt->ops->lmtt_pte_index(addr, pt->level)]; + + addr >>= lmtt->ops->lmtt_pte_shift(pt->level); + } + + lmtt_assert(lmtt, lmtt->ops->lmtt_pte_index(addr, pt->level) <= + lmtt->ops->lmtt_pte_num(pt->level)); + lmtt_assert(lmtt, pt->level != pd->level); + lmtt_assert(lmtt, pt->level == 0); + return pt; +} + +static void lmtt_insert_bo(struct xe_lmtt *lmtt, unsigned int vfid, struct xe_bo *bo, u64 start) +{ + u64 page_size = lmtt_page_size(lmtt); + struct xe_res_cursor cur; + struct xe_lmtt_pt *pt; + u64 addr, vram_offset; + + lmtt_assert(lmtt, IS_ALIGNED(start, page_size)); + lmtt_assert(lmtt, IS_ALIGNED(bo->size, page_size)); + lmtt_assert(lmtt, xe_bo_is_vram(bo)); + + vram_offset = vram_region_gpu_offset(bo->ttm.resource); + xe_res_first(bo->ttm.resource, 0, bo->size, &cur); + while (cur.remaining) { + addr = xe_res_dma(&cur); + addr += vram_offset; /* XXX */ + + pt = lmtt_leaf_pt(lmtt, vfid, start); + + lmtt_write_pte(lmtt, pt, lmtt->ops->lmtt_pte_encode(addr, 0), + lmtt->ops->lmtt_pte_index(start, 0)); + + xe_res_next(&cur, page_size); + start += page_size; + } +} + +/** + * xe_lmtt_prepare_pages - Create VF's LMTT Page Tables. + * @lmtt: the &xe_lmtt to update + * @vfid: the VF identifier (1-based) + * @range: top range of LMEM offset to be supported + * + * This function creates empty LMTT page tables for given VF to support + * up to maximum #range LMEM offset. The LMTT page tables created by this + * function must be released using xe_lmtt_drop_pages() function. + * + * Notes: + * This function shall be called only after successful LMTT initialization. + * See xe_lmtt_init(). + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_lmtt_prepare_pages(struct xe_lmtt *lmtt, unsigned int vfid, u64 range) +{ + lmtt_assert(lmtt, lmtt->pd); + lmtt_assert(lmtt, vfid); + + return lmtt_alloc_range(lmtt, vfid, 0, range); +} + +/** + * xe_lmtt_populate_pages - Update VF's LMTT Page Table Entries. + * @lmtt: the &xe_lmtt to update + * @vfid: the VF identifier (1-based) + * @bo: the buffer object with LMEM allocation to be mapped + * @offset: the offset at which #bo should be mapped + * + * This function updates VF's LMTT entries to use given buffer object as a backstore. + * + * Notes: + * This function shall be called only after successful preparation of the + * VF's LMTT Page Tables. See xe_lmtt_prepare(). + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_lmtt_populate_pages(struct xe_lmtt *lmtt, unsigned int vfid, struct xe_bo *bo, u64 offset) +{ + lmtt_assert(lmtt, lmtt->pd); + lmtt_assert(lmtt, vfid); + + lmtt_insert_bo(lmtt, vfid, bo, offset); + return 0; +} + +/** + * xe_lmtt_drop_pages - Remove VF's LMTT Pages. + * @lmtt: the &xe_lmtt to update + * @vfid: the VF identifier (1-based) + * + * This function removes all LMTT Page Tables prepared by xe_lmtt_prepare_pages(). + * + * This function shall be called only after successful LMTT initialization. + * See xe_lmtt_init(). 
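+ *
+ * The VF's entry in the root directory is invalidated (LMTT_PTE_INVALID)
+ * before the VF's page tables themselves are freed.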
+ */ +void xe_lmtt_drop_pages(struct xe_lmtt *lmtt, unsigned int vfid) +{ + lmtt_assert(lmtt, lmtt->pd); + lmtt_assert(lmtt, vfid); + + lmtt_drop_pages(lmtt, vfid); +} + +/** + * xe_lmtt_estimate_pt_size - Estimate size of LMTT PT allocations. + * @lmtt: the &xe_lmtt + * @size: the size of the LMEM to be mapped over LMTT (including any offset) + * + * This function shall be called only by PF. + * + * Return: size of the PT allocation(s) needed to support given LMEM size. + */ +u64 xe_lmtt_estimate_pt_size(struct xe_lmtt *lmtt, u64 size) +{ + unsigned int level = 0; + u64 pt_size; + + lmtt_assert(lmtt, IS_SRIOV_PF(lmtt_to_xe(lmtt))); + lmtt_assert(lmtt, IS_DGFX(lmtt_to_xe(lmtt))); + lmtt_assert(lmtt, lmtt->ops); + + pt_size = PAGE_ALIGN(lmtt->ops->lmtt_pte_size(level) * + lmtt->ops->lmtt_pte_num(level)); + + while (++level < lmtt->ops->lmtt_root_pd_level()) { + pt_size *= lmtt->ops->lmtt_pte_index(size, level) + 1; + pt_size += PAGE_ALIGN(lmtt->ops->lmtt_pte_size(level) * + lmtt->ops->lmtt_pte_num(level)); + } + + return pt_size; +} + +#if IS_BUILTIN(CONFIG_DRM_XE_KUNIT_TEST) +#include "tests/xe_lmtt_test.c" +#endif diff --git a/drivers/gpu/drm/xe/xe_lmtt.h b/drivers/gpu/drm/xe/xe_lmtt.h new file mode 100644 index 000000000000..cb10ef994db6 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_lmtt.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_LMTT_H_ +#define _XE_LMTT_H_ + +#include <linux/types.h> + +struct xe_bo; +struct xe_lmtt; +struct xe_lmtt_ops; + +#ifdef CONFIG_PCI_IOV +int xe_lmtt_init(struct xe_lmtt *lmtt); +void xe_lmtt_init_hw(struct xe_lmtt *lmtt); +int xe_lmtt_prepare_pages(struct xe_lmtt *lmtt, unsigned int vfid, u64 range); +int xe_lmtt_populate_pages(struct xe_lmtt *lmtt, unsigned int vfid, struct xe_bo *bo, u64 offset); +void xe_lmtt_drop_pages(struct xe_lmtt *lmtt, unsigned int vfid); +u64 xe_lmtt_estimate_pt_size(struct xe_lmtt *lmtt, u64 size); +#else +static inline int xe_lmtt_init(struct xe_lmtt *lmtt) { return 0; } +static inline void xe_lmtt_init_hw(struct xe_lmtt *lmtt) { } +#endif + +#endif diff --git a/drivers/gpu/drm/xe/xe_lmtt_2l.c b/drivers/gpu/drm/xe/xe_lmtt_2l.c new file mode 100644 index 000000000000..84bc5c4212b5 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_lmtt_2l.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include <linux/align.h> +#include <linux/bitfield.h> +#include <linux/log2.h> +#include <linux/sizes.h> + +#include "xe_lmtt_types.h" +#include "xe_macros.h" + +/** + * DOC: Two-Level LMTT Structure + * + * LMHAW (Local Memory Host Address Width) is 37 bit (128GB) + * + * LMGAW (Local Memory Guest Address Width) is 37 bit (128GB) + * + * The following figure illustrates the structure and function of the 2L LMTT:: + * + * LMTT Directory + * (1 Entry per VF) + * +-----------+ LMTT (per VF) + * | | +-----------+ + * | | | | + * | | index: | | + * | | LMEM VF +===========+ + * | | offset --> | PTE | ==> LMEM PF offset + * | | +===========+ + * index: +===========+ | | + * VFID --> | PDE | -----------------> +-----------+ + * +===========+ / \. + * | | / \. + * | | / \. + * | | / \. + * +-----------+ <== [LMTT Directory Ptr] \. + * / \ / \. + * / \ +-----------+-----------------+------+---+ + * / \ | 31:HAW-16 | HAW-17:5 | 4:1 | 0 | + * / \ +===========+=================+======+===+ + * / \ | Reserved | LMEM Page (2MB) | Rsvd | V | + * / \ +-----------+-----------------+------+---+ + * / \. 
+ * +-----------+-----------------+------+---+ + * | 31:HAW-12 | HAW-13:4 | 3:1 | 0 | + * +===========+=================+======+===+ + * | Reserved | LMTT Ptr (64KB) | Rsvd | V | + * +-----------+-----------------+------+---+ + * + */ + +typedef u32 lmtt_2l_pde_t; +typedef u32 lmtt_2l_pte_t; + +#if IS_ENABLED(CONFIG_DRM_XE_LMTT_2L_128GB) +#define LMTT_2L_HAW 37 /* 128 GiB */ +#else +#define LMTT_2L_HAW 35 /* 32 GiB */ +#endif + +#define LMTT_2L_PDE_MAX_NUM 64 /* SRIOV with PF and 63 VFs, index 0 (PF) is unused */ +#define LMTT_2L_PDE_LMTT_PTR GENMASK(LMTT_2L_HAW - 13, 4) +#define LMTT_2L_PDE_VALID BIT(0) + +#define LMTT_2L_PTE_MAX_NUM BIT(LMTT_2L_HAW - ilog2(SZ_2M)) +#define LMTT_2L_PTE_LMEM_PAGE GENMASK(LMTT_2L_HAW - 17, 5) +#define LMTT_2L_PTE_VALID BIT(0) + +static unsigned int lmtt_2l_root_pd_level(void) +{ + return 1; /* implementation is 0-based */ +} + +static unsigned int lmtt_2l_pte_num(unsigned int level) +{ + switch (level) { + case 1: + return LMTT_2L_PDE_MAX_NUM; + case 0: + BUILD_BUG_ON(LMTT_2L_HAW == 37 && LMTT_2L_PTE_MAX_NUM != SZ_64K); + BUILD_BUG_ON(LMTT_2L_HAW == 35 && LMTT_2L_PTE_MAX_NUM != SZ_16K); + return LMTT_2L_PTE_MAX_NUM; + default: + return 0; + } +} + +static unsigned int lmtt_2l_pte_size(unsigned int level) +{ + switch (level) { + case 1: + return sizeof(lmtt_2l_pde_t); + case 0: + return sizeof(lmtt_2l_pte_t); + default: + return 0; + } +} + +static unsigned int lmtt_2l_pte_shift(unsigned int level) +{ + switch (level) { + case 0: + return ilog2(SZ_2M); + default: + return 0; + } +} + +static unsigned int lmtt_2l_pte_index(u64 addr, unsigned int level) +{ + addr >>= lmtt_2l_pte_shift(level); + + switch (level) { + case 0: + /* SZ_2M increments */ + BUILD_BUG_ON_NOT_POWER_OF_2(LMTT_2L_PTE_MAX_NUM); + return addr & (LMTT_2L_PTE_MAX_NUM - 1); + default: + return 0; + } +} + +static u64 lmtt_2l_pte_encode(unsigned long offset, unsigned int level) +{ + switch (level) { + case 0: + XE_WARN_ON(!IS_ALIGNED(offset, SZ_2M)); + XE_WARN_ON(!FIELD_FIT(LMTT_2L_PTE_LMEM_PAGE, offset / SZ_2M)); + return FIELD_PREP(LMTT_2L_PTE_LMEM_PAGE, offset / SZ_2M) | LMTT_2L_PTE_VALID; + case 1: + XE_WARN_ON(!IS_ALIGNED(offset, SZ_64K)); + XE_WARN_ON(!FIELD_FIT(LMTT_2L_PDE_LMTT_PTR, offset / SZ_64K)); + return FIELD_PREP(LMTT_2L_PDE_LMTT_PTR, offset / SZ_64K) | LMTT_2L_PDE_VALID; + default: + XE_WARN_ON(true); + return 0; + } +} + +const struct xe_lmtt_ops lmtt_2l_ops = { + .lmtt_root_pd_level = lmtt_2l_root_pd_level, + .lmtt_pte_num = lmtt_2l_pte_num, + .lmtt_pte_size = lmtt_2l_pte_size, + .lmtt_pte_shift = lmtt_2l_pte_shift, + .lmtt_pte_index = lmtt_2l_pte_index, + .lmtt_pte_encode = lmtt_2l_pte_encode, +}; diff --git a/drivers/gpu/drm/xe/xe_lmtt_ml.c b/drivers/gpu/drm/xe/xe_lmtt_ml.c new file mode 100644 index 000000000000..b21215a2edd6 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_lmtt_ml.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include <linux/align.h> +#include <linux/bitfield.h> +#include <linux/log2.h> +#include <linux/sizes.h> + +#include "xe_lmtt_types.h" +#include "xe_macros.h" + +/** + * DOC: Multi-Level LMTT Structure + * + * LMHAW (Local Memory Host Address Width) is 48 bit (256TB) + * + * LMGAW (Local Memory Guest Address Width) is 48 bit (256TB) + * + * The following figure illustrates the structure and function of the ML LMTT:: + * + * LMTT L3 Directory + * (1 Entry per VF) LMTT L1 Leaf + * +-----------+ +-----------+ + * | | LMTT L2 (per VF) | | + * | | +-----------+ | | + * | | | | index: +===========+ + * | | 
| | GDPA --> | PTE | => LMEM PF offset + * | | | | 34:21 +===========+ + * | | index: | | | | + * | | LMEM VF +===========+ | | + * | | offset -> | PTE | ----------> +-----------+ + * | | GAW-1:35 +===========+ / \. + * index: +===========+ | | / \. + * VFID --> | PDE | ---------> +-----------+ / \. + * +===========+ / / / \. + * | | / / / \. + * +-----------+ <== [LMTT Directory Ptr] / \. + * / \ / / / \. + * / \ / / +-----------+-----------------+------+---+ + * / /\ / | 31:HAW-16 | HAW-17:5 | 4:1 | 0 | + * / / \ / +===========+=================+======+===+ + * / / \ / | Reserved | LMEM Page (2MB) | Rsvd | V | + * / / +-----------+-----------------+------+---+ + * / / + * +-----------+-----------------+------+---+ + * | 63:HAW-12 | HAW-13:4 | 3:1 | 0 | + * +===========+=================+======+===+ + * | Reserved | LMTT Ptr (64KB) | Rsvd | V | + * +-----------+-----------------+------+---+ + * + */ + +typedef u64 lmtt_ml_pde_t; +typedef u32 lmtt_ml_pte_t; + +#define LMTT_ML_HAW 48 /* 256 TiB */ + +#define LMTT_ML_PDE_MAX_NUM 64 /* SRIOV with PF and 63 VFs, index 0 (PF) is unused */ +#define LMTT_ML_PDE_LMTT_PTR GENMASK_ULL(LMTT_ML_HAW - 13, 4) +#define LMTT_ML_PDE_VALID BIT(0) + +#define LMTT_ML_PDE_L2_SHIFT 35 +#define LMTT_ML_PDE_L2_MAX_NUM BIT_ULL(LMTT_ML_HAW - 35) + +#define LMTT_ML_PTE_MAX_NUM BIT(35 - ilog2(SZ_2M)) +#define LMTT_ML_PTE_LMEM_PAGE GENMASK(LMTT_ML_HAW - 17, 5) +#define LMTT_ML_PTE_VALID BIT(0) + +static unsigned int lmtt_ml_root_pd_level(void) +{ + return 2; /* implementation is 0-based */ +} + +static unsigned int lmtt_ml_pte_num(unsigned int level) +{ + switch (level) { + case 2: + return LMTT_ML_PDE_MAX_NUM; + case 1: + BUILD_BUG_ON(LMTT_ML_HAW == 48 && LMTT_ML_PDE_L2_MAX_NUM != SZ_8K); + return LMTT_ML_PDE_L2_MAX_NUM; + case 0: + BUILD_BUG_ON(LMTT_ML_PTE_MAX_NUM != SZ_16K); + return LMTT_ML_PTE_MAX_NUM; + default: + return 0; + } +} + +static unsigned int lmtt_ml_pte_size(unsigned int level) +{ + switch (level) { + case 2: + case 1: + return sizeof(lmtt_ml_pde_t); + case 0: + return sizeof(lmtt_ml_pte_t); + default: + return 0; + } +} + +static unsigned int lmtt_ml_pte_shift(unsigned int level) +{ + switch (level) { + case 1: + BUILD_BUG_ON(BIT_ULL(LMTT_ML_PDE_L2_SHIFT) != SZ_32G); + return ilog2(SZ_32G); + case 0: + return ilog2(SZ_2M); + default: + return 0; + } +} + +static unsigned int lmtt_ml_pte_index(u64 addr, unsigned int level) +{ + addr >>= lmtt_ml_pte_shift(level); + + switch (level) { + case 1: + /* SZ_32G increments */ + BUILD_BUG_ON_NOT_POWER_OF_2(LMTT_ML_PDE_L2_MAX_NUM); + return addr & (LMTT_ML_PDE_L2_MAX_NUM - 1); + case 0: + /* SZ_2M increments */ + BUILD_BUG_ON_NOT_POWER_OF_2(LMTT_ML_PTE_MAX_NUM); + return addr & (LMTT_ML_PTE_MAX_NUM - 1); + default: + return 0; + } +} + +static u64 lmtt_ml_pte_encode(unsigned long offset, unsigned int level) +{ + switch (level) { + case 0: + XE_WARN_ON(!IS_ALIGNED(offset, SZ_2M)); + XE_WARN_ON(!FIELD_FIT(LMTT_ML_PTE_LMEM_PAGE, offset / SZ_2M)); + return FIELD_PREP(LMTT_ML_PTE_LMEM_PAGE, offset / SZ_2M) | LMTT_ML_PTE_VALID; + case 1: + case 2: + XE_WARN_ON(!IS_ALIGNED(offset, SZ_64K)); + XE_WARN_ON(!FIELD_FIT(LMTT_ML_PDE_LMTT_PTR, offset / SZ_64K)); + return FIELD_PREP(LMTT_ML_PDE_LMTT_PTR, offset / SZ_64K) | LMTT_ML_PDE_VALID; + default: + XE_WARN_ON(true); + return 0; + } +} + +const struct xe_lmtt_ops lmtt_ml_ops = { + .lmtt_root_pd_level = lmtt_ml_root_pd_level, + .lmtt_pte_num = lmtt_ml_pte_num, + .lmtt_pte_size = lmtt_ml_pte_size, + .lmtt_pte_shift = lmtt_ml_pte_shift, + .lmtt_pte_index = 
lmtt_ml_pte_index, + .lmtt_pte_encode = lmtt_ml_pte_encode, +}; diff --git a/drivers/gpu/drm/xe/xe_lmtt_types.h b/drivers/gpu/drm/xe/xe_lmtt_types.h new file mode 100644 index 000000000000..b37abad23416 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_lmtt_types.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_LMTT_TYPES_H_ +#define _XE_LMTT_TYPES_H_ + +#include <linux/types.h> + +struct xe_bo; +struct xe_lmtt; +struct xe_lmtt_pt; +struct xe_lmtt_ops; + +#define LMTT_PTE_INVALID ULL(0) + +/** + * struct xe_lmtt - Local Memory Translation Table Manager + */ +struct xe_lmtt { + /** @pd: root LMTT Directory */ + struct xe_lmtt_pt *pd; + + /** @ops: LMTT functions */ + const struct xe_lmtt_ops *ops; +}; + +/** + * struct xe_lmtt_pt - Local Memory Translation Table Page Table + * + * Represents single level of the LMTT. + */ +struct xe_lmtt_pt { + /** @level: page table level, 0 is leaf */ + unsigned int level; + + /** @bo: buffer object with actual LMTT PTE values */ + struct xe_bo *bo; + + /** @entries: leaf page tables, exist only for root/non-leaf */ + struct xe_lmtt_pt *entries[]; +}; + +/** + * struct xe_lmtt_ops - Local Memory Translation Table Operations + * + * Provides abstraction of the LMTT variants. + */ +struct xe_lmtt_ops { + /* private: */ + unsigned int (*lmtt_root_pd_level)(void); + unsigned int (*lmtt_pte_num)(unsigned int level); + unsigned int (*lmtt_pte_size)(unsigned int level); + unsigned int (*lmtt_pte_shift)(unsigned int level); + unsigned int (*lmtt_pte_index)(u64 addr, unsigned int level); + u64 (*lmtt_pte_encode)(unsigned long offset, unsigned int level); +}; + +extern const struct xe_lmtt_ops lmtt_2l_ops; +extern const struct xe_lmtt_ops lmtt_ml_ops; + +#endif diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c new file mode 100644 index 000000000000..b7fa3831b684 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_lrc.c @@ -0,0 +1,1272 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include "xe_lrc.h" + +#include "instructions/xe_mi_commands.h" +#include "instructions/xe_gfxpipe_commands.h" +#include "regs/xe_engine_regs.h" +#include "regs/xe_gpu_commands.h" +#include "regs/xe_lrc_layout.h" +#include "xe_bb.h" +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_drm_client.h" +#include "xe_exec_queue_types.h" +#include "xe_gt.h" +#include "xe_gt_printk.h" +#include "xe_hw_fence.h" +#include "xe_map.h" +#include "xe_vm.h" + +#define CTX_VALID (1 << 0) +#define CTX_PRIVILEGE (1 << 8) +#define CTX_ADDRESSING_MODE_SHIFT 3 +#define LEGACY_64B_CONTEXT 3 + +#define ENGINE_CLASS_SHIFT 61 +#define ENGINE_INSTANCE_SHIFT 48 + +static struct xe_device * +lrc_to_xe(struct xe_lrc *lrc) +{ + return gt_to_xe(lrc->fence_ctx.gt); +} + +size_t xe_lrc_size(struct xe_device *xe, enum xe_engine_class class) +{ + switch (class) { + case XE_ENGINE_CLASS_RENDER: + if (GRAPHICS_VER(xe) >= 20) + return 4 * SZ_4K; + else + return 14 * SZ_4K; + case XE_ENGINE_CLASS_COMPUTE: + /* 14 pages since graphics_ver == 11 */ + if (GRAPHICS_VER(xe) >= 20) + return 3 * SZ_4K; + else + return 14 * SZ_4K; + default: + WARN(1, "Unknown engine class: %d", class); + fallthrough; + case XE_ENGINE_CLASS_COPY: + case XE_ENGINE_CLASS_VIDEO_DECODE: + case XE_ENGINE_CLASS_VIDEO_ENHANCE: + case XE_ENGINE_CLASS_OTHER: + return 2 * SZ_4K; + } +} + +/* + * The per-platform tables are u8-encoded in @data. Decode @data and set the + * addresses' offset and commands in @regs. 
The following encoding is used + * for each byte. There are 2 steps: decoding commands and decoding addresses. + * + * Commands: + * [7]: create NOPs - number of NOPs are set in lower bits + * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set + * MI_LRI_FORCE_POSTED + * [5:0]: Number of NOPs or registers to set values to in case of + * MI_LOAD_REGISTER_IMM + * + * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count" + * number of registers. They are set by using the REG/REG16 macros: the former + * is used for offsets smaller than 0x200 while the latter is for values bigger + * than that. Those macros already set all the bits documented below correctly: + * + * [7]: When a register offset needs more than 6 bits, use additional bytes, to + * follow, for the lower bits + * [6:0]: Register offset, without considering the engine base. + * + * This function only tweaks the commands and register offsets. Values are not + * filled out. + */ +static void set_offsets(u32 *regs, + const u8 *data, + const struct xe_hw_engine *hwe) +#define NOP(x) (BIT(7) | (x)) +#define LRI(count, flags) ((flags) << 6 | (count) | \ + BUILD_BUG_ON_ZERO(count >= BIT(6))) +#define POSTED BIT(0) +#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) +#define REG16(x) \ + (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ + (((x) >> 2) & 0x7f) +#define END 0 +{ + const u32 base = hwe->mmio_base; + + while (*data) { + u8 count, flags; + + if (*data & BIT(7)) { /* skip */ + count = *data++ & ~BIT(7); + regs += count; + continue; + } + + count = *data & 0x3f; + flags = *data >> 6; + data++; + + *regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count); + if (flags & POSTED) + *regs |= MI_LRI_FORCE_POSTED; + *regs |= MI_LRI_LRM_CS_MMIO; + regs++; + + xe_gt_assert(hwe->gt, count); + do { + u32 offset = 0; + u8 v; + + do { + v = *data++; + offset <<= 7; + offset |= v & ~BIT(7); + } while (v & BIT(7)); + + regs[0] = base + (offset << 2); + regs += 2; + } while (--count); + } + + *regs = MI_BATCH_BUFFER_END | BIT(0); +} + +static const u8 gen12_xcs_offsets[] = { + NOP(1), + LRI(13, POSTED), + REG16(0x244), + REG(0x034), + REG(0x030), + REG(0x038), + REG(0x03c), + REG(0x168), + REG(0x140), + REG(0x110), + REG(0x1c0), + REG(0x1c4), + REG(0x1c8), + REG(0x180), + REG16(0x2b4), + + NOP(5), + LRI(9, POSTED), + REG16(0x3a8), + REG16(0x28c), + REG16(0x288), + REG16(0x284), + REG16(0x280), + REG16(0x27c), + REG16(0x278), + REG16(0x274), + REG16(0x270), + + END +}; + +static const u8 dg2_xcs_offsets[] = { + NOP(1), + LRI(15, POSTED), + REG16(0x244), + REG(0x034), + REG(0x030), + REG(0x038), + REG(0x03c), + REG(0x168), + REG(0x140), + REG(0x110), + REG(0x1c0), + REG(0x1c4), + REG(0x1c8), + REG(0x180), + REG16(0x2b4), + REG(0x120), + REG(0x124), + + NOP(1), + LRI(9, POSTED), + REG16(0x3a8), + REG16(0x28c), + REG16(0x288), + REG16(0x284), + REG16(0x280), + REG16(0x27c), + REG16(0x278), + REG16(0x274), + REG16(0x270), + + END +}; + +static const u8 gen12_rcs_offsets[] = { + NOP(1), + LRI(13, POSTED), + REG16(0x244), + REG(0x034), + REG(0x030), + REG(0x038), + REG(0x03c), + REG(0x168), + REG(0x140), + REG(0x110), + REG(0x1c0), + REG(0x1c4), + REG(0x1c8), + REG(0x180), + REG16(0x2b4), + + NOP(5), + LRI(9, POSTED), + REG16(0x3a8), + REG16(0x28c), + REG16(0x288), + REG16(0x284), + REG16(0x280), + REG16(0x27c), + REG16(0x278), + REG16(0x274), + REG16(0x270), + + LRI(3, POSTED), + REG(0x1b0), + REG16(0x5a8), + REG16(0x5ac), + + NOP(6), + LRI(1, 0), + REG(0x0c8), + NOP(3 + 9 + 1), + + LRI(51, POSTED), + 
REG16(0x588), + REG16(0x588), + REG16(0x588), + REG16(0x588), + REG16(0x588), + REG16(0x588), + REG(0x028), + REG(0x09c), + REG(0x0c0), + REG(0x178), + REG(0x17c), + REG16(0x358), + REG(0x170), + REG(0x150), + REG(0x154), + REG(0x158), + REG16(0x41c), + REG16(0x600), + REG16(0x604), + REG16(0x608), + REG16(0x60c), + REG16(0x610), + REG16(0x614), + REG16(0x618), + REG16(0x61c), + REG16(0x620), + REG16(0x624), + REG16(0x628), + REG16(0x62c), + REG16(0x630), + REG16(0x634), + REG16(0x638), + REG16(0x63c), + REG16(0x640), + REG16(0x644), + REG16(0x648), + REG16(0x64c), + REG16(0x650), + REG16(0x654), + REG16(0x658), + REG16(0x65c), + REG16(0x660), + REG16(0x664), + REG16(0x668), + REG16(0x66c), + REG16(0x670), + REG16(0x674), + REG16(0x678), + REG16(0x67c), + REG(0x068), + REG(0x084), + NOP(1), + + END +}; + +static const u8 xehp_rcs_offsets[] = { + NOP(1), + LRI(13, POSTED), + REG16(0x244), + REG(0x034), + REG(0x030), + REG(0x038), + REG(0x03c), + REG(0x168), + REG(0x140), + REG(0x110), + REG(0x1c0), + REG(0x1c4), + REG(0x1c8), + REG(0x180), + REG16(0x2b4), + + NOP(5), + LRI(9, POSTED), + REG16(0x3a8), + REG16(0x28c), + REG16(0x288), + REG16(0x284), + REG16(0x280), + REG16(0x27c), + REG16(0x278), + REG16(0x274), + REG16(0x270), + + LRI(3, POSTED), + REG(0x1b0), + REG16(0x5a8), + REG16(0x5ac), + + NOP(6), + LRI(1, 0), + REG(0x0c8), + + END +}; + +static const u8 dg2_rcs_offsets[] = { + NOP(1), + LRI(15, POSTED), + REG16(0x244), + REG(0x034), + REG(0x030), + REG(0x038), + REG(0x03c), + REG(0x168), + REG(0x140), + REG(0x110), + REG(0x1c0), + REG(0x1c4), + REG(0x1c8), + REG(0x180), + REG16(0x2b4), + REG(0x120), + REG(0x124), + + NOP(1), + LRI(9, POSTED), + REG16(0x3a8), + REG16(0x28c), + REG16(0x288), + REG16(0x284), + REG16(0x280), + REG16(0x27c), + REG16(0x278), + REG16(0x274), + REG16(0x270), + + LRI(3, POSTED), + REG(0x1b0), + REG16(0x5a8), + REG16(0x5ac), + + NOP(6), + LRI(1, 0), + REG(0x0c8), + + END +}; + +static const u8 mtl_rcs_offsets[] = { + NOP(1), + LRI(15, POSTED), + REG16(0x244), + REG(0x034), + REG(0x030), + REG(0x038), + REG(0x03c), + REG(0x168), + REG(0x140), + REG(0x110), + REG(0x1c0), + REG(0x1c4), + REG(0x1c8), + REG(0x180), + REG16(0x2b4), + REG(0x120), + REG(0x124), + + NOP(1), + LRI(9, POSTED), + REG16(0x3a8), + REG16(0x28c), + REG16(0x288), + REG16(0x284), + REG16(0x280), + REG16(0x27c), + REG16(0x278), + REG16(0x274), + REG16(0x270), + + NOP(2), + LRI(2, POSTED), + REG16(0x5a8), + REG16(0x5ac), + + NOP(6), + LRI(1, 0), + REG(0x0c8), + + END +}; + +#define XE2_CTX_COMMON \ + NOP(1), /* [0x00] */ \ + LRI(15, POSTED), /* [0x01] */ \ + REG16(0x244), /* [0x02] CTXT_SR_CTL */ \ + REG(0x034), /* [0x04] RING_BUFFER_HEAD */ \ + REG(0x030), /* [0x06] RING_BUFFER_TAIL */ \ + REG(0x038), /* [0x08] RING_BUFFER_START */ \ + REG(0x03c), /* [0x0a] RING_BUFFER_CONTROL */ \ + REG(0x168), /* [0x0c] BB_ADDR_UDW */ \ + REG(0x140), /* [0x0e] BB_ADDR */ \ + REG(0x110), /* [0x10] BB_STATE */ \ + REG(0x1c0), /* [0x12] BB_PER_CTX_PTR */ \ + REG(0x1c4), /* [0x14] RCS_INDIRECT_CTX */ \ + REG(0x1c8), /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \ + REG(0x180), /* [0x18] CCID */ \ + REG16(0x2b4), /* [0x1a] SEMAPHORE_TOKEN */ \ + REG(0x120), /* [0x1c] PRT_BB_STATE */ \ + REG(0x124), /* [0x1e] PRT_BB_STATE_UDW */ \ + \ + NOP(1), /* [0x20] */ \ + LRI(9, POSTED), /* [0x21] */ \ + REG16(0x3a8), /* [0x22] CTX_TIMESTAMP */ \ + REG16(0x3ac), /* [0x24] CTX_TIMESTAMP_UDW */ \ + REG(0x108), /* [0x26] INDIRECT_RING_STATE */ \ + REG16(0x284), /* [0x28] dummy reg */ \ + REG16(0x280), /* [0x2a] CS_ACC_CTR_THOLD */ \ + 
REG16(0x27c), /* [0x2c] CS_CTX_SYS_PASID */ \ + REG16(0x278), /* [0x2e] CS_CTX_ASID */ \ + REG16(0x274), /* [0x30] PTBP_UDW */ \ + REG16(0x270) /* [0x32] PTBP_LDW */ + +static const u8 xe2_rcs_offsets[] = { + XE2_CTX_COMMON, + + NOP(2), /* [0x34] */ + LRI(2, POSTED), /* [0x36] */ + REG16(0x5a8), /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */ + REG16(0x5ac), /* [0x39] PREEMPTION_STATUS */ + + NOP(6), /* [0x41] */ + LRI(1, 0), /* [0x47] */ + REG(0x0c8), /* [0x48] R_PWR_CLK_STATE */ + + END +}; + +static const u8 xe2_bcs_offsets[] = { + XE2_CTX_COMMON, + + NOP(4 + 8 + 1), /* [0x34] */ + LRI(2, POSTED), /* [0x41] */ + REG16(0x200), /* [0x42] BCS_SWCTRL */ + REG16(0x204), /* [0x44] BLIT_CCTL */ + + END +}; + +static const u8 xe2_xcs_offsets[] = { + XE2_CTX_COMMON, + + END +}; + +#undef END +#undef REG16 +#undef REG +#undef LRI +#undef NOP + +static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class) +{ + if (class == XE_ENGINE_CLASS_RENDER) { + if (GRAPHICS_VER(xe) >= 20) + return xe2_rcs_offsets; + else if (GRAPHICS_VERx100(xe) >= 1270) + return mtl_rcs_offsets; + else if (GRAPHICS_VERx100(xe) >= 1255) + return dg2_rcs_offsets; + else if (GRAPHICS_VERx100(xe) >= 1250) + return xehp_rcs_offsets; + else + return gen12_rcs_offsets; + } else if (class == XE_ENGINE_CLASS_COPY) { + if (GRAPHICS_VER(xe) >= 20) + return xe2_bcs_offsets; + else + return gen12_xcs_offsets; + } else { + if (GRAPHICS_VER(xe) >= 20) + return xe2_xcs_offsets; + else if (GRAPHICS_VERx100(xe) >= 1255) + return dg2_xcs_offsets; + else + return gen12_xcs_offsets; + } +} + +static void set_context_control(u32 *regs, struct xe_hw_engine *hwe) +{ + regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH) | + _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) | + CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; + + /* TODO: Timestamp */ +} + +static int lrc_ring_mi_mode(struct xe_hw_engine *hwe) +{ + struct xe_device *xe = gt_to_xe(hwe->gt); + + if (GRAPHICS_VERx100(xe) >= 1250) + return 0x70; + else + return 0x60; +} + +static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe) +{ + int x; + + x = lrc_ring_mi_mode(hwe); + regs[x + 1] &= ~STOP_RING; + regs[x + 1] |= STOP_RING << 16; +} + +static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc) +{ + return 0; +} + +u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc) +{ + return lrc->ring.size; +} + +/* Make the magic macros work */ +#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset + +#define LRC_SEQNO_PPHWSP_OFFSET 512 +#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8) +#define LRC_PARALLEL_PPHWSP_OFFSET 2048 +#define LRC_PPHWSP_SIZE SZ_4K + +static size_t lrc_reg_size(struct xe_device *xe) +{ + if (GRAPHICS_VERx100(xe) >= 1250) + return 96 * sizeof(u32); + else + return 80 * sizeof(u32); +} + +size_t xe_lrc_skip_size(struct xe_device *xe) +{ + return LRC_PPHWSP_SIZE + lrc_reg_size(xe); +} + +static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc) +{ + /* The seqno is stored in the driver-defined portion of PPHWSP */ + return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET; +} + +static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc) +{ + /* The start seqno is stored in the driver-defined portion of PPHWSP */ + return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET; +} + +static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc) +{ + /* The parallel is stored in the driver-defined portion of PPHWSP */ + return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET; +} + +static inline u32 
__xe_lrc_regs_offset(struct xe_lrc *lrc) +{ + return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE; +} + +#define DECL_MAP_ADDR_HELPERS(elem) \ +static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \ +{ \ + struct iosys_map map = lrc->bo->vmap; \ +\ + xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map)); \ + iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \ + return map; \ +} \ +static inline u32 __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \ +{ \ + return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \ +} \ + +DECL_MAP_ADDR_HELPERS(ring) +DECL_MAP_ADDR_HELPERS(pphwsp) +DECL_MAP_ADDR_HELPERS(seqno) +DECL_MAP_ADDR_HELPERS(regs) +DECL_MAP_ADDR_HELPERS(start_seqno) +DECL_MAP_ADDR_HELPERS(parallel) + +#undef DECL_MAP_ADDR_HELPERS + +u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc) +{ + return __xe_lrc_pphwsp_ggtt_addr(lrc); +} + +u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr) +{ + struct xe_device *xe = lrc_to_xe(lrc); + struct iosys_map map; + + map = __xe_lrc_regs_map(lrc); + iosys_map_incr(&map, reg_nr * sizeof(u32)); + return xe_map_read32(xe, &map); +} + +void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val) +{ + struct xe_device *xe = lrc_to_xe(lrc); + struct iosys_map map; + + map = __xe_lrc_regs_map(lrc); + iosys_map_incr(&map, reg_nr * sizeof(u32)); + xe_map_write32(xe, &map, val); +} + +static void *empty_lrc_data(struct xe_hw_engine *hwe) +{ + struct xe_device *xe = gt_to_xe(hwe->gt); + void *data; + u32 *regs; + + data = kzalloc(xe_lrc_size(xe, hwe->class), GFP_KERNEL); + if (!data) + return NULL; + + /* 1st page: Per-Process of HW status Page */ + regs = data + LRC_PPHWSP_SIZE; + set_offsets(regs, reg_offsets(xe, hwe->class), hwe); + set_context_control(regs, hwe); + reset_stop_ring(regs, hwe); + + return data; +} + +static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm) +{ + u64 desc = xe_vm_pdp4_descriptor(vm, lrc->tile); + + xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc)); + xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc)); +} + +#define PVC_CTX_ASID (0x2e + 1) +#define PVC_CTX_ACC_CTR_THOLD (0x2a + 1) +#define ACC_GRANULARITY_S 20 +#define ACC_NOTIFY_S 16 + +int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, + struct xe_exec_queue *q, struct xe_vm *vm, u32 ring_size) +{ + struct xe_gt *gt = hwe->gt; + struct xe_tile *tile = gt_to_tile(gt); + struct xe_device *xe = gt_to_xe(gt); + struct iosys_map map; + void *init_data = NULL; + u32 arb_enable; + int err; + + lrc->flags = 0; + + /* + * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address + * via VM bind calls. 
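+ * Until that is supported, the BO below is created pinned and mapped for the
+ * LRC's whole lifetime, placed in VRAM on discrete parts and always given a
+ * GGTT address.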
+ */ + lrc->bo = xe_bo_create_pin_map(xe, tile, vm, + ring_size + xe_lrc_size(xe, hwe->class), + ttm_bo_type_kernel, + XE_BO_CREATE_VRAM_IF_DGFX(tile) | + XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(lrc->bo)) + return PTR_ERR(lrc->bo); + + lrc->tile = gt_to_tile(hwe->gt); + lrc->ring.size = ring_size; + lrc->ring.tail = 0; + + xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt, + hwe->fence_irq, hwe->name); + + if (!gt->default_lrc[hwe->class]) { + init_data = empty_lrc_data(hwe); + if (!init_data) { + err = -ENOMEM; + goto err_lrc_finish; + } + } + + /* + * Init Per-Process of HW status Page, LRC / context state to known + * values + */ + map = __xe_lrc_pphwsp_map(lrc); + if (!init_data) { + xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE); /* PPHWSP */ + xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE, + gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE, + xe_lrc_size(xe, hwe->class) - LRC_PPHWSP_SIZE); + } else { + xe_map_memcpy_to(xe, &map, 0, init_data, + xe_lrc_size(xe, hwe->class)); + kfree(init_data); + } + + if (vm) { + xe_lrc_set_ppgtt(lrc, vm); + + if (vm->xef) + xe_drm_client_add_bo(vm->xef->client, lrc->bo); + } + + xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc)); + xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0); + xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail); + xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL, + RING_CTL_SIZE(lrc->ring.size) | RING_VALID); + if (xe->info.has_asid && vm) + xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, + (q->usm.acc_granularity << + ACC_GRANULARITY_S) | vm->usm.asid); + if (xe->info.has_usm && vm) + xe_lrc_write_ctx_reg(lrc, PVC_CTX_ACC_CTR_THOLD, + (q->usm.acc_notify << ACC_NOTIFY_S) | + q->usm.acc_trigger); + + lrc->desc = CTX_VALID; + lrc->desc |= LEGACY_64B_CONTEXT << CTX_ADDRESSING_MODE_SHIFT; + /* TODO: Priority */ + + /* While this appears to have something about privileged batches or + * some such, it really just means PPGTT mode. 
+ */ + if (vm) + lrc->desc |= CTX_PRIVILEGE; + + if (GRAPHICS_VERx100(xe) < 1250) { + lrc->desc |= (u64)hwe->instance << ENGINE_INSTANCE_SHIFT; + lrc->desc |= (u64)hwe->class << ENGINE_CLASS_SHIFT; + } + + arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE; + xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable)); + + map = __xe_lrc_seqno_map(lrc); + xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1); + + map = __xe_lrc_start_seqno_map(lrc); + xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1); + + return 0; + +err_lrc_finish: + xe_lrc_finish(lrc); + return err; +} + +void xe_lrc_finish(struct xe_lrc *lrc) +{ + xe_hw_fence_ctx_finish(&lrc->fence_ctx); + xe_bo_lock(lrc->bo, false); + xe_bo_unpin(lrc->bo); + xe_bo_unlock(lrc->bo); + xe_bo_put(lrc->bo); +} + +void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head) +{ + xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head); +} + +u32 xe_lrc_ring_head(struct xe_lrc *lrc) +{ + return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR; +} + +u32 xe_lrc_ring_space(struct xe_lrc *lrc) +{ + const u32 head = xe_lrc_ring_head(lrc); + const u32 tail = lrc->ring.tail; + const u32 size = lrc->ring.size; + + return ((head - tail - 1) & (size - 1)) + 1; +} + +static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring, + const void *data, size_t size) +{ + struct xe_device *xe = lrc_to_xe(lrc); + + iosys_map_incr(&ring, lrc->ring.tail); + xe_map_memcpy_to(xe, &ring, 0, data, size); + lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1); +} + +void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size) +{ + struct xe_device *xe = lrc_to_xe(lrc); + struct iosys_map ring; + u32 rhs; + size_t aligned_size; + + xe_assert(xe, IS_ALIGNED(size, 4)); + aligned_size = ALIGN(size, 8); + + ring = __xe_lrc_ring_map(lrc); + + xe_assert(xe, lrc->ring.tail < lrc->ring.size); + rhs = lrc->ring.size - lrc->ring.tail; + if (size > rhs) { + __xe_lrc_write_ring(lrc, ring, data, rhs); + __xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs); + } else { + __xe_lrc_write_ring(lrc, ring, data, size); + } + + if (aligned_size > size) { + u32 noop = MI_NOOP; + + __xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop)); + } +} + +u64 xe_lrc_descriptor(struct xe_lrc *lrc) +{ + return lrc->desc | xe_lrc_ggtt_addr(lrc); +} + +u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc) +{ + return __xe_lrc_seqno_ggtt_addr(lrc); +} + +struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc) +{ + return &xe_hw_fence_create(&lrc->fence_ctx, + __xe_lrc_seqno_map(lrc))->dma; +} + +s32 xe_lrc_seqno(struct xe_lrc *lrc) +{ + struct iosys_map map = __xe_lrc_seqno_map(lrc); + + return xe_map_read32(lrc_to_xe(lrc), &map); +} + +s32 xe_lrc_start_seqno(struct xe_lrc *lrc) +{ + struct iosys_map map = __xe_lrc_start_seqno_map(lrc); + + return xe_map_read32(lrc_to_xe(lrc), &map); +} + +u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc) +{ + return __xe_lrc_start_seqno_ggtt_addr(lrc); +} + +u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc) +{ + return __xe_lrc_parallel_ggtt_addr(lrc); +} + +struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc) +{ + return __xe_lrc_parallel_map(lrc); +} + +static int instr_dw(u32 cmd_header) +{ + /* GFXPIPE "SINGLE_DW" opcodes are a single dword */ + if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) == + GFXPIPE_SINGLE_DW_CMD(0, 0)) + return 1; + + /* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */ + if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST) + return 
REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2; + + /* Most instructions have the # of dwords (minus 2) in 7:0 */ + return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2; +} + +static int dump_mi_command(struct drm_printer *p, + struct xe_gt *gt, + u32 *dw, + int remaining_dw) +{ + u32 inst_header = *dw; + u32 numdw = instr_dw(inst_header); + u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header); + int num_noop; + + /* First check for commands that don't have/use a '# DW' field */ + switch (inst_header & MI_OPCODE) { + case MI_NOOP: + num_noop = 1; + while (num_noop < remaining_dw && + (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP) + num_noop++; + drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop); + return num_noop; + + case MI_TOPOLOGY_FILTER: + drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header); + return 1; + + case MI_BATCH_BUFFER_END: + drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header); + /* Return 'remaining_dw' to consume the rest of the LRC */ + return remaining_dw; + } + + /* + * Any remaining commands include a # of dwords. We should make sure + * it doesn't exceed the remaining size of the LRC. + */ + if (xe_gt_WARN_ON(gt, numdw > remaining_dw)) + numdw = remaining_dw; + + switch (inst_header & MI_OPCODE) { + case MI_LOAD_REGISTER_IMM: + drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n", + inst_header, (numdw - 1) / 2); + for (int i = 1; i < numdw; i += 2) + drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]); + return numdw; + + case MI_FORCE_WAKEUP: + drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header); + return numdw; + + default: + drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n", + inst_header, opcode, numdw); + return numdw; + } +} + +static int dump_gfxpipe_command(struct drm_printer *p, + struct xe_gt *gt, + u32 *dw, + int remaining_dw) +{ + u32 numdw = instr_dw(*dw); + u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw); + u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw); + u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw); + + /* + * Make sure we haven't mis-parsed a number of dwords that exceeds the + * remaining size of the LRC. 
+ */ + if (xe_gt_WARN_ON(gt, numdw > remaining_dw)) + numdw = remaining_dw; + + switch (*dw & GFXPIPE_MATCH_MASK) { +#define MATCH(cmd) \ + case cmd: \ + drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \ + return numdw +#define MATCH3D(cmd) \ + case CMD_##cmd: \ + drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \ + return numdw + + MATCH(STATE_BASE_ADDRESS); + MATCH(STATE_SIP); + MATCH(GPGPU_CSR_BASE_ADDRESS); + MATCH(STATE_COMPUTE_MODE); + MATCH3D(3DSTATE_BTD); + + MATCH3D(3DSTATE_VF_STATISTICS); + + MATCH(PIPELINE_SELECT); + + MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST); + MATCH3D(3DSTATE_CLEAR_PARAMS); + MATCH3D(3DSTATE_DEPTH_BUFFER); + MATCH3D(3DSTATE_STENCIL_BUFFER); + MATCH3D(3DSTATE_HIER_DEPTH_BUFFER); + MATCH3D(3DSTATE_VERTEX_BUFFERS); + MATCH3D(3DSTATE_VERTEX_ELEMENTS); + MATCH3D(3DSTATE_INDEX_BUFFER); + MATCH3D(3DSTATE_VF); + MATCH3D(3DSTATE_MULTISAMPLE); + MATCH3D(3DSTATE_CC_STATE_POINTERS); + MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS); + MATCH3D(3DSTATE_VS); + MATCH3D(3DSTATE_GS); + MATCH3D(3DSTATE_CLIP); + MATCH3D(3DSTATE_SF); + MATCH3D(3DSTATE_WM); + MATCH3D(3DSTATE_CONSTANT_VS); + MATCH3D(3DSTATE_CONSTANT_GS); + MATCH3D(3DSTATE_SAMPLE_MASK); + MATCH3D(3DSTATE_CONSTANT_HS); + MATCH3D(3DSTATE_CONSTANT_DS); + MATCH3D(3DSTATE_HS); + MATCH3D(3DSTATE_TE); + MATCH3D(3DSTATE_DS); + MATCH3D(3DSTATE_STREAMOUT); + MATCH3D(3DSTATE_SBE); + MATCH3D(3DSTATE_PS); + MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP); + MATCH3D(3DSTATE_CPS_POINTERS); + MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC); + MATCH3D(3DSTATE_BLEND_STATE_POINTERS); + MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS); + MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS); + MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS); + MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS); + MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS); + MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS); + MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS); + MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS); + MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS); + MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS); + MATCH3D(3DSTATE_VF_INSTANCING); + MATCH3D(3DSTATE_VF_SGVS); + MATCH3D(3DSTATE_VF_TOPOLOGY); + MATCH3D(3DSTATE_WM_CHROMAKEY); + MATCH3D(3DSTATE_PS_BLEND); + MATCH3D(3DSTATE_WM_DEPTH_STENCIL); + MATCH3D(3DSTATE_PS_EXTRA); + MATCH3D(3DSTATE_RASTER); + MATCH3D(3DSTATE_SBE_SWIZ); + MATCH3D(3DSTATE_WM_HZ_OP); + MATCH3D(3DSTATE_VF_COMPONENT_PACKING); + MATCH3D(3DSTATE_VF_SGVS_2); + MATCH3D(3DSTATE_VFG); + MATCH3D(3DSTATE_URB_ALLOC_VS); + MATCH3D(3DSTATE_URB_ALLOC_HS); + MATCH3D(3DSTATE_URB_ALLOC_DS); + MATCH3D(3DSTATE_URB_ALLOC_GS); + MATCH3D(3DSTATE_SO_BUFFER_INDEX_0); + MATCH3D(3DSTATE_SO_BUFFER_INDEX_1); + MATCH3D(3DSTATE_SO_BUFFER_INDEX_2); + MATCH3D(3DSTATE_SO_BUFFER_INDEX_3); + MATCH3D(3DSTATE_PRIMITIVE_REPLICATION); + MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO); + MATCH3D(3DSTATE_AMFS); + MATCH3D(3DSTATE_DEPTH_BOUNDS); + MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS); + MATCH3D(3DSTATE_CONSTANT_TS_POINTER); + MATCH3D(3DSTATE_MESH_CONTROL); + MATCH3D(3DSTATE_MESH_DISTRIB); + MATCH3D(3DSTATE_TASK_REDISTRIB); + MATCH3D(3DSTATE_MESH_SHADER); + MATCH3D(3DSTATE_MESH_SHADER_DATA); + MATCH3D(3DSTATE_TASK_CONTROL); + MATCH3D(3DSTATE_TASK_SHADER); + MATCH3D(3DSTATE_TASK_SHADER_DATA); + MATCH3D(3DSTATE_URB_ALLOC_MESH); + MATCH3D(3DSTATE_URB_ALLOC_TASK); + MATCH3D(3DSTATE_CLIP_MESH); + MATCH3D(3DSTATE_SBE_MESH); + MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER); + + MATCH3D(3DSTATE_DRAWING_RECTANGLE); + MATCH3D(3DSTATE_CHROMA_KEY); + MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET); + MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN); + 
MATCH3D(3DSTATE_LINE_STIPPLE); + MATCH3D(3DSTATE_AA_LINE_PARAMETERS); + MATCH3D(3DSTATE_MONOFILTER_SIZE); + MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS); + MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS); + MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS); + MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS); + MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS); + MATCH3D(3DSTATE_SO_DECL_LIST); + MATCH3D(3DSTATE_SO_BUFFER); + MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC); + MATCH3D(3DSTATE_SAMPLE_PATTERN); + MATCH3D(3DSTATE_3D_MODE); + MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE); + MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS); + MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO); + + default: + drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n", + *dw, pipeline, opcode, subopcode, numdw); + return numdw; + } +} + +void xe_lrc_dump_default(struct drm_printer *p, + struct xe_gt *gt, + enum xe_engine_class hwe_class) +{ + u32 *dw; + int remaining_dw, num_dw; + + if (!gt->default_lrc[hwe_class]) { + drm_printf(p, "No default LRC for class %d\n", hwe_class); + return; + } + + /* + * Skip the beginning of the LRC since it contains the per-process + * hardware status page. + */ + dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE; + remaining_dw = (xe_lrc_size(gt_to_xe(gt), hwe_class) - LRC_PPHWSP_SIZE) / 4; + + while (remaining_dw > 0) { + if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) { + num_dw = dump_mi_command(p, gt, dw, remaining_dw); + } else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) { + num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw); + } else { + num_dw = min(instr_dw(*dw), remaining_dw); + drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n", + *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw), + num_dw); + } + + dw += num_dw; + remaining_dw -= num_dw; + } +} + +struct instr_state { + u32 instr; + u16 num_dw; +}; + +static const struct instr_state xe_hpg_svg_state[] = { + { .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 }, + { .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 }, + { .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 }, + { .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 }, + { .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 }, + { .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 }, + { .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 }, + { .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 }, + { .instr = CMD_3DSTATE_VS, .num_dw = 9 }, + { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 }, + { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 }, + { .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 }, + { .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 }, + { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 }, + { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 }, + { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 }, + { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 }, + { .instr = CMD_3DSTATE_CLIP, .num_dw = 4 }, + { .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 }, + { .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 }, + { .instr = CMD_3DSTATE_SF, .num_dw = 4 }, + { .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 }, + { .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 }, + { .instr = CMD_3DSTATE_RASTER, .num_dw = 5 }, + { .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 }, + { .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 }, + { .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 }, + { .instr = CMD_3DSTATE_HS, .num_dw = 9 }, + { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 }, + { 
.instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 }, + { .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 }, + { .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 }, + { .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 }, + { .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 }, + { .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 }, + { .instr = CMD_3DSTATE_TE, .num_dw = 5 }, + { .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 }, + { .instr = CMD_3DSTATE_DS, .num_dw = 11 }, + { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 }, + { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 }, + { .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 }, + { .instr = CMD_3DSTATE_GS, .num_dw = 10 }, + { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 }, + { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 }, + { .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 }, + { .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 }, + { .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 }, + { .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 }, + { .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 }, + { .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 }, +}; + +void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb) +{ + struct xe_gt *gt = q->hwe->gt; + struct xe_device *xe = gt_to_xe(gt); + const struct instr_state *state_table = NULL; + int state_table_size = 0; + + /* + * At the moment we only need to emit non-register state for the RCS + * engine. + */ + if (q->hwe->class != XE_ENGINE_CLASS_RENDER) + return; + + switch (GRAPHICS_VERx100(xe)) { + case 1255: + case 1270 ... 2004: + state_table = xe_hpg_svg_state; + state_table_size = ARRAY_SIZE(xe_hpg_svg_state); + break; + default: + xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n", + GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100); + return; + } + + for (int i = 0; i < state_table_size; i++) { + u32 instr = state_table[i].instr; + u16 num_dw = state_table[i].num_dw; + bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW); + + xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE); + xe_gt_assert(gt, num_dw != 0); + xe_gt_assert(gt, is_single_dw ^ (num_dw > 1)); + + /* + * Xe2's SVG context is the same as the one on DG2 / MTL + * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has + * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined). + * Just make the replacement here rather than defining a + * whole separate table for the single trivial change. 
+ */ + if (GRAPHICS_VER(xe) >= 20 && + instr == CMD_3DSTATE_DRAWING_RECTANGLE) + instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST; + + bb->cs[bb->len] = instr; + if (!is_single_dw) + bb->cs[bb->len] |= (num_dw - 2); + + bb->len += num_dw; + } +} diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h new file mode 100644 index 000000000000..28b1d3f404d4 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_lrc.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ +#ifndef _XE_LRC_H_ +#define _XE_LRC_H_ + +#include "xe_lrc_types.h" + +struct drm_printer; +struct xe_bb; +struct xe_device; +struct xe_exec_queue; +enum xe_engine_class; +struct xe_hw_engine; +struct xe_vm; + +#define LRC_PPHWSP_SCRATCH_ADDR (0x34 * 4) + +int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, + struct xe_exec_queue *q, struct xe_vm *vm, u32 ring_size); +void xe_lrc_finish(struct xe_lrc *lrc); + +size_t xe_lrc_size(struct xe_device *xe, enum xe_engine_class class); +u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc); + +void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head); +u32 xe_lrc_ring_head(struct xe_lrc *lrc); +u32 xe_lrc_ring_space(struct xe_lrc *lrc); +void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size); + +u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc); +u32 *xe_lrc_regs(struct xe_lrc *lrc); + +u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr); +void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val); + +u64 xe_lrc_descriptor(struct xe_lrc *lrc); + +u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc); +struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc); +s32 xe_lrc_seqno(struct xe_lrc *lrc); + +u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc); +s32 xe_lrc_start_seqno(struct xe_lrc *lrc); + +u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc); +struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc); + +size_t xe_lrc_skip_size(struct xe_device *xe); + +void xe_lrc_dump_default(struct drm_printer *p, + struct xe_gt *gt, + enum xe_engine_class); + +void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb); + +#endif diff --git a/drivers/gpu/drm/xe/xe_lrc_types.h b/drivers/gpu/drm/xe/xe_lrc_types.h new file mode 100644 index 000000000000..78220336062c --- /dev/null +++ b/drivers/gpu/drm/xe/xe_lrc_types.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_LRC_TYPES_H_ +#define _XE_LRC_TYPES_H_ + +#include "xe_hw_fence_types.h" + +struct xe_bo; + +/** + * struct xe_lrc - Logical ring context (LRC) and submission ring object + */ +struct xe_lrc { + /** + * @bo: buffer object (memory) for logical ring context, per process HW + * status page, and submission ring. 
+ */ + struct xe_bo *bo; + + /** @tile: tile which this LRC belongs to */ + struct xe_tile *tile; + + /** @flags: LRC flags */ + u32 flags; + + /** @ring: submission ring state */ + struct { + /** @size: size of submission ring */ + u32 size; + /** @tail: tail of submission ring */ + u32 tail; + /** @old_tail: shadow of tail */ + u32 old_tail; + } ring; + + /** @desc: LRC descriptor */ + u64 desc; + + /** @fence_ctx: context for hw fence */ + struct xe_hw_fence_ctx fence_ctx; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_macros.h b/drivers/gpu/drm/xe/xe_macros.h new file mode 100644 index 000000000000..daf56c846d03 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_macros.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef _XE_MACROS_H_ +#define _XE_MACROS_H_ + +#include <linux/bug.h> + +#define XE_WARN_ON WARN_ON + +#define XE_IOCTL_DBG(xe, cond) \ + ((cond) && (drm_dbg(&(xe)->drm, \ + "Ioctl argument check failed at %s:%d: %s", \ + __FILE__, __LINE__, #cond), 1)) + +#endif diff --git a/drivers/gpu/drm/xe/xe_map.h b/drivers/gpu/drm/xe/xe_map.h new file mode 100644 index 000000000000..f62e0c8b67ab --- /dev/null +++ b/drivers/gpu/drm/xe/xe_map.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_MAP_H_ +#define _XE_MAP_H_ + +#include <linux/iosys-map.h> + +#include <xe_device.h> + +/** + * DOC: Map layer + * + * All access to any memory shared with a device (both sysmem and vram) in the + * XE driver should go through this layer (xe_map). This layer is built on top + * of :ref:`driver-api/device-io:Generalizing Access to System and I/O Memory` + * and with extra hooks into the XE driver that allows adding asserts to memory + * accesses (e.g. for blocking runtime_pm D3Cold on Discrete Graphics). 
+ */ + +static inline void xe_map_memcpy_to(struct xe_device *xe, struct iosys_map *dst, + size_t dst_offset, const void *src, + size_t len) +{ + xe_device_assert_mem_access(xe); + iosys_map_memcpy_to(dst, dst_offset, src, len); +} + +static inline void xe_map_memcpy_from(struct xe_device *xe, void *dst, + const struct iosys_map *src, + size_t src_offset, size_t len) +{ + xe_device_assert_mem_access(xe); + iosys_map_memcpy_from(dst, src, src_offset, len); +} + +static inline void xe_map_memset(struct xe_device *xe, + struct iosys_map *dst, size_t offset, + int value, size_t len) +{ + xe_device_assert_mem_access(xe); + iosys_map_memset(dst, offset, value, len); +} + +/* FIXME: We likely should kill these two functions sooner or later */ +static inline u32 xe_map_read32(struct xe_device *xe, struct iosys_map *map) +{ + xe_device_assert_mem_access(xe); + + if (map->is_iomem) + return readl(map->vaddr_iomem); + else + return READ_ONCE(*(u32 *)map->vaddr); +} + +static inline void xe_map_write32(struct xe_device *xe, struct iosys_map *map, + u32 val) +{ + xe_device_assert_mem_access(xe); + + if (map->is_iomem) + writel(val, map->vaddr_iomem); + else + *(u32 *)map->vaddr = val; +} + +#define xe_map_rd(xe__, map__, offset__, type__) ({ \ + struct xe_device *__xe = xe__; \ + xe_device_assert_mem_access(__xe); \ + iosys_map_rd(map__, offset__, type__); \ +}) + +#define xe_map_wr(xe__, map__, offset__, type__, val__) ({ \ + struct xe_device *__xe = xe__; \ + xe_device_assert_mem_access(__xe); \ + iosys_map_wr(map__, offset__, type__, val__); \ +}) + +#define xe_map_rd_field(xe__, map__, struct_offset__, struct_type__, field__) ({ \ + struct xe_device *__xe = xe__; \ + xe_device_assert_mem_access(__xe); \ + iosys_map_rd_field(map__, struct_offset__, struct_type__, field__); \ +}) + +#define xe_map_wr_field(xe__, map__, struct_offset__, struct_type__, field__, val__) ({ \ + struct xe_device *__xe = xe__; \ + xe_device_assert_mem_access(__xe); \ + iosys_map_wr_field(map__, struct_offset__, struct_type__, field__, val__); \ +}) + +#endif diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c new file mode 100644 index 000000000000..adf1dab5eba2 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -0,0 +1,1410 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +#include "xe_migrate.h" + +#include <linux/bitfield.h> +#include <linux/sizes.h> + +#include <drm/drm_managed.h> +#include <drm/ttm/ttm_tt.h> +#include <drm/xe_drm.h> + +#include "generated/xe_wa_oob.h" +#include "instructions/xe_mi_commands.h" +#include "regs/xe_gpu_commands.h" +#include "tests/xe_test.h" +#include "xe_assert.h" +#include "xe_bb.h" +#include "xe_bo.h" +#include "xe_exec_queue.h" +#include "xe_ggtt.h" +#include "xe_gt.h" +#include "xe_hw_engine.h" +#include "xe_lrc.h" +#include "xe_map.h" +#include "xe_mocs.h" +#include "xe_pt.h" +#include "xe_res_cursor.h" +#include "xe_sched_job.h" +#include "xe_sync.h" +#include "xe_trace.h" +#include "xe_vm.h" +#include "xe_wa.h" + +/** + * struct xe_migrate - migrate context. + */ +struct xe_migrate { + /** @q: Default exec queue used for migration */ + struct xe_exec_queue *q; + /** @tile: Backpointer to the tile this struct xe_migrate belongs to. */ + struct xe_tile *tile; + /** @job_mutex: Timeline mutex for @eng. */ + struct mutex job_mutex; + /** @pt_bo: Page-table buffer object. 
*/ + struct xe_bo *pt_bo; + /** @batch_base_ofs: VM offset of the migration batch buffer */ + u64 batch_base_ofs; + /** @usm_batch_base_ofs: VM offset of the usm batch buffer */ + u64 usm_batch_base_ofs; + /** @cleared_mem_ofs: VM offset of @cleared_bo. */ + u64 cleared_mem_ofs; + /** + * @fence: dma-fence representing the last migration job batch. + * Protected by @job_mutex. + */ + struct dma_fence *fence; + /** + * @vm_update_sa: For integrated, used to suballocate page-tables + * out of the pt_bo. + */ + struct drm_suballoc_manager vm_update_sa; +}; + +#define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */ +#define MAX_CCS_LIMITED_TRANSFER SZ_4M /* XE_PAGE_SIZE * (FIELD_MAX(XE2_CCS_SIZE_MASK) + 1) */ +#define NUM_KERNEL_PDE 17 +#define NUM_PT_SLOTS 32 +#define LEVEL0_PAGE_TABLE_ENCODE_SIZE SZ_2M + +/** + * xe_tile_migrate_engine() - Get this tile's migrate engine. + * @tile: The tile. + * + * Returns the default migrate engine of this tile. + * TODO: Perhaps this function is slightly misplaced, and even unneeded? + * + * Return: The default migrate engine + */ +struct xe_exec_queue *xe_tile_migrate_engine(struct xe_tile *tile) +{ + return tile->migrate->q; +} + +static void xe_migrate_fini(struct drm_device *dev, void *arg) +{ + struct xe_migrate *m = arg; + + xe_vm_lock(m->q->vm, false); + xe_bo_unpin(m->pt_bo); + xe_vm_unlock(m->q->vm); + + dma_fence_put(m->fence); + xe_bo_put(m->pt_bo); + drm_suballoc_manager_fini(&m->vm_update_sa); + mutex_destroy(&m->job_mutex); + xe_vm_close_and_put(m->q->vm); + xe_exec_queue_put(m->q); +} + +static u64 xe_migrate_vm_addr(u64 slot, u32 level) +{ + XE_WARN_ON(slot >= NUM_PT_SLOTS); + + /* First slot is reserved for mapping of PT bo and bb, start from 1 */ + return (slot + 1ULL) << xe_pt_shift(level + 1); +} + +static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr) +{ + /* + * Remove the DPA to get a correct offset into identity table for the + * migrate offset + */ + addr -= xe->mem.vram.dpa_base; + return addr + (256ULL << xe_pt_shift(2)); +} + +static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, + struct xe_vm *vm) +{ + struct xe_device *xe = tile_to_xe(tile); + u16 pat_index = xe->pat.idx[XE_CACHE_WB]; + u8 id = tile->id; + u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level; + u32 map_ofs, level, i; + struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo; + u64 entry; + + /* Can't bump NUM_PT_SLOTS too high */ + BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE); + /* Must be a multiple of 64K to support all platforms */ + BUILD_BUG_ON(NUM_PT_SLOTS * XE_PAGE_SIZE % SZ_64K); + /* And one slot reserved for the 4KiB page table updates */ + BUILD_BUG_ON(!(NUM_KERNEL_PDE & 1)); + + /* Need to be sure everything fits in the first PT, or create more */ + xe_tile_assert(tile, m->batch_base_ofs + batch->size < SZ_2M); + + bo = xe_bo_create_pin_map(vm->xe, tile, vm, + num_entries * XE_PAGE_SIZE, + ttm_bo_type_kernel, + XE_BO_CREATE_VRAM_IF_DGFX(tile) | + XE_BO_CREATE_PINNED_BIT); + if (IS_ERR(bo)) + return PTR_ERR(bo); + + entry = vm->pt_ops->pde_encode_bo(bo, bo->size - XE_PAGE_SIZE, pat_index); + xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry); + + map_ofs = (num_entries - num_level) * XE_PAGE_SIZE; + + /* Map the entire BO in our level 0 pt */ + for (i = 0, level = 0; i < num_entries; level++) { + entry = vm->pt_ops->pte_encode_bo(bo, i * XE_PAGE_SIZE, + pat_index, 0); + + xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, entry); + + if (vm->flags & XE_VM_FLAG_64K) + i += 16; + else + i += 1; 
+ } + + if (!IS_DGFX(xe)) { + /* Write out batch too */ + m->batch_base_ofs = NUM_PT_SLOTS * XE_PAGE_SIZE; + if (xe->info.has_usm) { + batch = tile->primary_gt->usm.bb_pool->bo; + m->usm_batch_base_ofs = m->batch_base_ofs; + } + + for (i = 0; i < batch->size; + i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE : + XE_PAGE_SIZE) { + entry = vm->pt_ops->pte_encode_bo(batch, i, + pat_index, 0); + + xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, + entry); + level++; + } + } else { + u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE); + + m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr); + + if (xe->info.has_usm) { + batch = tile->primary_gt->usm.bb_pool->bo; + batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE); + m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr); + } + } + + for (level = 1; level < num_level; level++) { + u32 flags = 0; + + if (vm->flags & XE_VM_FLAG_64K && level == 1) + flags = XE_PDE_64K; + + entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (level - 1) * + XE_PAGE_SIZE, pat_index); + xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level, u64, + entry | flags); + } + + /* Write PDE's that point to our BO. */ + for (i = 0; i < num_entries - num_level; i++) { + entry = vm->pt_ops->pde_encode_bo(bo, i * XE_PAGE_SIZE, + pat_index); + + xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE + + (i + 1) * 8, u64, entry); + } + + /* Set up a 1GiB NULL mapping at 255GiB offset. */ + level = 2; + xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level + 255 * 8, u64, + vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, IS_DGFX(xe), 0) + | XE_PTE_NULL); + m->cleared_mem_ofs = (255ULL << xe_pt_shift(level)); + + /* Identity map the entire vram at 256GiB offset */ + if (IS_DGFX(xe)) { + u64 pos, ofs, flags; + + level = 2; + ofs = map_ofs + XE_PAGE_SIZE * level + 256 * 8; + flags = vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, + true, 0); + + /* + * Use 1GB pages, it shouldn't matter the physical amount of + * vram is less, when we don't access it. + */ + for (pos = xe->mem.vram.dpa_base; + pos < xe->mem.vram.actual_physical_size + xe->mem.vram.dpa_base; + pos += SZ_1G, ofs += 8) + xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags); + } + + /* + * Example layout created above, with root level = 3: + * [PT0...PT7]: kernel PT's for copy/clear; 64 or 4KiB PTE's + * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's + * [PT9...PT28]: Userspace PT's for VM_BIND, 4 KiB PTE's + * [PT29 = PDE 0] [PT30 = PDE 1] [PT31 = PDE 2] + * + * This makes the lowest part of the VM point to the pagetables. + * Hence the lowest 2M in the vm should point to itself, with a few writes + * and flushes, other parts of the VM can be used either for copying and + * clearing. + * + * For performance, the kernel reserves PDE's, so about 20 are left + * for async VM updates. + * + * To make it easier to work, each scratch PT is put in slot (1 + PT #) + * everywhere, this allows lockless updates to scratch pages by using + * the different addresses in VM. + */ +#define NUM_VMUSA_UNIT_PER_PAGE 32 +#define VM_SA_UPDATE_UNIT_SIZE (XE_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE) +#define NUM_VMUSA_WRITES_PER_UNIT (VM_SA_UPDATE_UNIT_SIZE / sizeof(u64)) + drm_suballoc_manager_init(&m->vm_update_sa, + (map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) * + NUM_VMUSA_UNIT_PER_PAGE, 0); + + m->pt_bo = bo; + return 0; +} + +/* + * Due to workaround 16017236439, odd instance hardware copy engines are + * faster than even instance ones. 
+ * This function returns the mask involving all fast copy engines and the + * reserved copy engine to be used as logical mask for migrate engine. + * Including the reserved copy engine is required to avoid deadlocks due to + * migrate jobs servicing the faults gets stuck behind the job that faulted. + */ +static u32 xe_migrate_usm_logical_mask(struct xe_gt *gt) +{ + u32 logical_mask = 0; + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + + for_each_hw_engine(hwe, gt, id) { + if (hwe->class != XE_ENGINE_CLASS_COPY) + continue; + + if (!XE_WA(gt, 16017236439) || + xe_gt_is_usm_hwe(gt, hwe) || hwe->instance & 1) + logical_mask |= BIT(hwe->logical_instance); + } + + return logical_mask; +} + +/** + * xe_migrate_init() - Initialize a migrate context + * @tile: Back-pointer to the tile we're initializing for. + * + * Return: Pointer to a migrate context on success. Error pointer on error. + */ +struct xe_migrate *xe_migrate_init(struct xe_tile *tile) +{ + struct xe_device *xe = tile_to_xe(tile); + struct xe_gt *primary_gt = tile->primary_gt; + struct xe_migrate *m; + struct xe_vm *vm; + int err; + + m = drmm_kzalloc(&xe->drm, sizeof(*m), GFP_KERNEL); + if (!m) + return ERR_PTR(-ENOMEM); + + m->tile = tile; + + /* Special layout, prepared below.. */ + vm = xe_vm_create(xe, XE_VM_FLAG_MIGRATION | + XE_VM_FLAG_SET_TILE_ID(tile)); + if (IS_ERR(vm)) + return ERR_CAST(vm); + + xe_vm_lock(vm, false); + err = xe_migrate_prepare_vm(tile, m, vm); + xe_vm_unlock(vm); + if (err) { + xe_vm_close_and_put(vm); + return ERR_PTR(err); + } + + if (xe->info.has_usm) { + struct xe_hw_engine *hwe = xe_gt_hw_engine(primary_gt, + XE_ENGINE_CLASS_COPY, + primary_gt->usm.reserved_bcs_instance, + false); + u32 logical_mask = xe_migrate_usm_logical_mask(primary_gt); + + if (!hwe || !logical_mask) + return ERR_PTR(-EINVAL); + + m->q = xe_exec_queue_create(xe, vm, logical_mask, 1, hwe, + EXEC_QUEUE_FLAG_KERNEL | + EXEC_QUEUE_FLAG_PERMANENT); + } else { + m->q = xe_exec_queue_create_class(xe, primary_gt, vm, + XE_ENGINE_CLASS_COPY, + EXEC_QUEUE_FLAG_KERNEL | + EXEC_QUEUE_FLAG_PERMANENT); + } + if (IS_ERR(m->q)) { + xe_vm_close_and_put(vm); + return ERR_CAST(m->q); + } + if (xe->info.has_usm) + m->q->priority = XE_EXEC_QUEUE_PRIORITY_KERNEL; + + mutex_init(&m->job_mutex); + + err = drmm_add_action_or_reset(&xe->drm, xe_migrate_fini, m); + if (err) + return ERR_PTR(err); + + return m; +} + +static u64 max_mem_transfer_per_pass(struct xe_device *xe) +{ + if (!IS_DGFX(xe) && xe_device_has_flat_ccs(xe)) + return MAX_CCS_LIMITED_TRANSFER; + + return MAX_PREEMPTDISABLE_TRANSFER; +} + +static u64 xe_migrate_res_sizes(struct xe_device *xe, struct xe_res_cursor *cur) +{ + /* + * For VRAM we use identity mapped pages so we are limited to current + * cursor size. For system we program the pages ourselves so we have no + * such limitation. + */ + return min_t(u64, max_mem_transfer_per_pass(xe), + mem_type_is_vram(cur->mem_type) ? 
cur->size : + cur->remaining); +} + +static u32 pte_update_size(struct xe_migrate *m, + bool is_vram, + struct ttm_resource *res, + struct xe_res_cursor *cur, + u64 *L0, u64 *L0_ofs, u32 *L0_pt, + u32 cmd_size, u32 pt_ofs, u32 avail_pts) +{ + u32 cmds = 0; + + *L0_pt = pt_ofs; + if (!is_vram) { + /* Clip L0 to available size */ + u64 size = min(*L0, (u64)avail_pts * SZ_2M); + u64 num_4k_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE); + + *L0 = size; + *L0_ofs = xe_migrate_vm_addr(pt_ofs, 0); + + /* MI_STORE_DATA_IMM */ + cmds += 3 * DIV_ROUND_UP(num_4k_pages, 0x1ff); + + /* PDE qwords */ + cmds += num_4k_pages * 2; + + /* Each chunk has a single blit command */ + cmds += cmd_size; + } else { + /* Offset into identity map. */ + *L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile), + cur->start + vram_region_gpu_offset(res)); + cmds += cmd_size; + } + + return cmds; +} + +static void emit_pte(struct xe_migrate *m, + struct xe_bb *bb, u32 at_pt, + bool is_vram, bool is_comp_pte, + struct xe_res_cursor *cur, + u32 size, struct xe_bo *bo) +{ + struct xe_device *xe = tile_to_xe(m->tile); + + u16 pat_index; + u32 ptes; + u64 ofs = at_pt * XE_PAGE_SIZE; + u64 cur_ofs; + + /* Indirect access needs compression enabled uncached PAT index */ + if (GRAPHICS_VERx100(xe) >= 2000) + pat_index = is_comp_pte ? xe->pat.idx[XE_CACHE_NONE_COMPRESSION] : + xe->pat.idx[XE_CACHE_NONE]; + else + pat_index = xe->pat.idx[XE_CACHE_WB]; + + /* + * FIXME: Emitting VRAM PTEs to L0 PTs is forbidden. Currently + * we're only emitting VRAM PTEs during sanity tests, so when + * that's moved to a Kunit test, we should condition VRAM PTEs + * on running tests. + */ + + ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE); + + while (ptes) { + u32 chunk = min(0x1ffU, ptes); + + bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk); + bb->cs[bb->len++] = ofs; + bb->cs[bb->len++] = 0; + + cur_ofs = ofs; + ofs += chunk * 8; + ptes -= chunk; + + while (chunk--) { + u64 addr, flags = 0; + bool devmem = false; + + addr = xe_res_dma(cur) & PAGE_MASK; + if (is_vram) { + /* Is this a 64K PTE entry? 
*/ + if ((m->q->vm->flags & XE_VM_FLAG_64K) && + !(cur_ofs & (16 * 8 - 1))) { + xe_tile_assert(m->tile, IS_ALIGNED(addr, SZ_64K)); + flags |= XE_PTE_PS64; + } + + addr += vram_region_gpu_offset(bo->ttm.resource); + devmem = true; + } + + addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe, + addr, pat_index, + 0, devmem, flags); + bb->cs[bb->len++] = lower_32_bits(addr); + bb->cs[bb->len++] = upper_32_bits(addr); + + xe_res_next(cur, min_t(u32, size, PAGE_SIZE)); + cur_ofs += 8; + } + } +} + +#define EMIT_COPY_CCS_DW 5 +static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb, + u64 dst_ofs, bool dst_is_indirect, + u64 src_ofs, bool src_is_indirect, + u32 size) +{ + struct xe_device *xe = gt_to_xe(gt); + u32 *cs = bb->cs + bb->len; + u32 num_ccs_blks; + u32 num_pages; + u32 ccs_copy_size; + u32 mocs; + + if (GRAPHICS_VERx100(xe) >= 2000) { + num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE); + xe_gt_assert(gt, FIELD_FIT(XE2_CCS_SIZE_MASK, num_pages - 1)); + + ccs_copy_size = REG_FIELD_PREP(XE2_CCS_SIZE_MASK, num_pages - 1); + mocs = FIELD_PREP(XE2_XY_CTRL_SURF_MOCS_INDEX_MASK, gt->mocs.uc_index); + + } else { + num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size), + NUM_CCS_BYTES_PER_BLOCK); + xe_gt_assert(gt, FIELD_FIT(CCS_SIZE_MASK, num_ccs_blks - 1)); + + ccs_copy_size = REG_FIELD_PREP(CCS_SIZE_MASK, num_ccs_blks - 1); + mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index); + } + + *cs++ = XY_CTRL_SURF_COPY_BLT | + (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT | + (dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT | + ccs_copy_size; + *cs++ = lower_32_bits(src_ofs); + *cs++ = upper_32_bits(src_ofs) | mocs; + *cs++ = lower_32_bits(dst_ofs); + *cs++ = upper_32_bits(dst_ofs) | mocs; + + bb->len = cs - bb->cs; +} + +#define EMIT_COPY_DW 10 +static void emit_copy(struct xe_gt *gt, struct xe_bb *bb, + u64 src_ofs, u64 dst_ofs, unsigned int size, + unsigned int pitch) +{ + struct xe_device *xe = gt_to_xe(gt); + u32 mocs = 0; + u32 tile_y = 0; + + xe_gt_assert(gt, size / pitch <= S16_MAX); + xe_gt_assert(gt, pitch / 4 <= S16_MAX); + xe_gt_assert(gt, pitch <= U16_MAX); + + if (GRAPHICS_VER(xe) >= 20) + mocs = FIELD_PREP(XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index); + + if (GRAPHICS_VERx100(xe) >= 1250) + tile_y = XY_FAST_COPY_BLT_D1_SRC_TILE4 | XY_FAST_COPY_BLT_D1_DST_TILE4; + + bb->cs[bb->len++] = XY_FAST_COPY_BLT_CMD | (10 - 2); + bb->cs[bb->len++] = XY_FAST_COPY_BLT_DEPTH_32 | pitch | tile_y | mocs; + bb->cs[bb->len++] = 0; + bb->cs[bb->len++] = (size / pitch) << 16 | pitch / 4; + bb->cs[bb->len++] = lower_32_bits(dst_ofs); + bb->cs[bb->len++] = upper_32_bits(dst_ofs); + bb->cs[bb->len++] = 0; + bb->cs[bb->len++] = pitch | mocs; + bb->cs[bb->len++] = lower_32_bits(src_ofs); + bb->cs[bb->len++] = upper_32_bits(src_ofs); +} + +static int job_add_deps(struct xe_sched_job *job, struct dma_resv *resv, + enum dma_resv_usage usage) +{ + return drm_sched_job_add_resv_dependencies(&job->drm, resv, usage); +} + +static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm) +{ + return usm ? 
m->usm_batch_base_ofs : m->batch_base_ofs; +} + +static u32 xe_migrate_ccs_copy(struct xe_migrate *m, + struct xe_bb *bb, + u64 src_ofs, bool src_is_indirect, + u64 dst_ofs, bool dst_is_indirect, u32 dst_size, + u64 ccs_ofs, bool copy_ccs) +{ + struct xe_gt *gt = m->tile->primary_gt; + u32 flush_flags = 0; + + if (xe_device_has_flat_ccs(gt_to_xe(gt)) && !copy_ccs && dst_is_indirect) { + /* + * If the src is already in vram, then it should already + * have been cleared by us, or has been populated by the + * user. Make sure we copy the CCS aux state as-is. + * + * Otherwise if the bo doesn't have any CCS metadata attached, + * we still need to clear it for security reasons. + */ + u64 ccs_src_ofs = src_is_indirect ? src_ofs : m->cleared_mem_ofs; + + emit_copy_ccs(gt, bb, + dst_ofs, true, + ccs_src_ofs, src_is_indirect, dst_size); + + flush_flags = MI_FLUSH_DW_CCS; + } else if (copy_ccs) { + if (!src_is_indirect) + src_ofs = ccs_ofs; + else if (!dst_is_indirect) + dst_ofs = ccs_ofs; + + xe_gt_assert(gt, src_is_indirect || dst_is_indirect); + + emit_copy_ccs(gt, bb, dst_ofs, dst_is_indirect, src_ofs, + src_is_indirect, dst_size); + if (dst_is_indirect) + flush_flags = MI_FLUSH_DW_CCS; + } + + return flush_flags; +} + +/** + * xe_migrate_copy() - Copy content of TTM resources. + * @m: The migration context. + * @src_bo: The buffer object @src is currently bound to. + * @dst_bo: If copying between resources created for the same bo, set this to + * the same value as @src_bo. If copying between buffer objects, set it to + * the buffer object @dst is currently bound to. + * @src: The source TTM resource. + * @dst: The dst TTM resource. + * @copy_only_ccs: If true copy only CCS metadata + * + * Copies the contents of @src to @dst: On flat CCS devices, + * the CCS metadata is copied as well if needed, or if not present, + * the CCS metadata of @dst is cleared for security reasons. + * + * Return: Pointer to a dma_fence representing the last copy batch, or + * an error pointer on failure. If there is a failure, any copy operation + * started by the function call has been synced. + */ +struct dma_fence *xe_migrate_copy(struct xe_migrate *m, + struct xe_bo *src_bo, + struct xe_bo *dst_bo, + struct ttm_resource *src, + struct ttm_resource *dst, + bool copy_only_ccs) +{ + struct xe_gt *gt = m->tile->primary_gt; + struct xe_device *xe = gt_to_xe(gt); + struct dma_fence *fence = NULL; + u64 size = src_bo->size; + struct xe_res_cursor src_it, dst_it, ccs_it; + u64 src_L0_ofs, dst_L0_ofs; + u32 src_L0_pt, dst_L0_pt; + u64 src_L0, dst_L0; + int pass = 0; + int err; + bool src_is_pltt = src->mem_type == XE_PL_TT; + bool dst_is_pltt = dst->mem_type == XE_PL_TT; + bool src_is_vram = mem_type_is_vram(src->mem_type); + bool dst_is_vram = mem_type_is_vram(dst->mem_type); + bool copy_ccs = xe_device_has_flat_ccs(xe) && + xe_bo_needs_ccs_pages(src_bo) && xe_bo_needs_ccs_pages(dst_bo); + bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram); + + /* Copying CCS between two different BOs is not supported yet. 
*/ + if (XE_WARN_ON(copy_ccs && src_bo != dst_bo)) + return ERR_PTR(-EINVAL); + + if (src_bo != dst_bo && XE_WARN_ON(src_bo->size != dst_bo->size)) + return ERR_PTR(-EINVAL); + + if (!src_is_vram) + xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it); + else + xe_res_first(src, 0, size, &src_it); + if (!dst_is_vram) + xe_res_first_sg(xe_bo_sg(dst_bo), 0, size, &dst_it); + else + xe_res_first(dst, 0, size, &dst_it); + + if (copy_system_ccs) + xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo), + PAGE_ALIGN(xe_device_ccs_bytes(xe, size)), + &ccs_it); + + while (size) { + u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */ + struct xe_sched_job *job; + struct xe_bb *bb; + u32 flush_flags; + u32 update_idx; + u64 ccs_ofs, ccs_size; + u32 ccs_pt; + + bool usm = xe->info.has_usm; + u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE; + + src_L0 = xe_migrate_res_sizes(xe, &src_it); + dst_L0 = xe_migrate_res_sizes(xe, &dst_it); + + drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n", + pass++, src_L0, dst_L0); + + src_L0 = min(src_L0, dst_L0); + + batch_size += pte_update_size(m, src_is_vram, src, &src_it, &src_L0, + &src_L0_ofs, &src_L0_pt, 0, 0, + avail_pts); + + batch_size += pte_update_size(m, dst_is_vram, dst, &dst_it, &src_L0, + &dst_L0_ofs, &dst_L0_pt, 0, + avail_pts, avail_pts); + + if (copy_system_ccs) { + ccs_size = xe_device_ccs_bytes(xe, src_L0); + batch_size += pte_update_size(m, false, NULL, &ccs_it, &ccs_size, + &ccs_ofs, &ccs_pt, 0, + 2 * avail_pts, + avail_pts); + } + + /* Add copy commands size here */ + batch_size += ((copy_only_ccs) ? 0 : EMIT_COPY_DW) + + ((xe_device_has_flat_ccs(xe) ? EMIT_COPY_CCS_DW : 0)); + + bb = xe_bb_new(gt, batch_size, usm); + if (IS_ERR(bb)) { + err = PTR_ERR(bb); + goto err_sync; + } + + if (!src_is_vram) + emit_pte(m, bb, src_L0_pt, src_is_vram, true, &src_it, src_L0, + src_bo); + else + xe_res_next(&src_it, src_L0); + + if (!dst_is_vram) + emit_pte(m, bb, dst_L0_pt, dst_is_vram, true, &dst_it, src_L0, + dst_bo); + else + xe_res_next(&dst_it, src_L0); + + if (copy_system_ccs) + emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src_bo); + + bb->cs[bb->len++] = MI_BATCH_BUFFER_END; + update_idx = bb->len; + + if (!copy_only_ccs) + emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0, XE_PAGE_SIZE); + + flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, + IS_DGFX(xe) ? src_is_vram : src_is_pltt, + dst_L0_ofs, + IS_DGFX(xe) ? dst_is_vram : dst_is_pltt, + src_L0, ccs_ofs, copy_ccs); + + mutex_lock(&m->job_mutex); + job = xe_bb_create_migration_job(m->q, bb, + xe_migrate_batch_base(m, usm), + update_idx); + if (IS_ERR(job)) { + err = PTR_ERR(job); + goto err; + } + + xe_sched_job_add_migrate_flush(job, flush_flags); + if (!fence) { + err = job_add_deps(job, src_bo->ttm.base.resv, + DMA_RESV_USAGE_BOOKKEEP); + if (!err && src_bo != dst_bo) + err = job_add_deps(job, dst_bo->ttm.base.resv, + DMA_RESV_USAGE_BOOKKEEP); + if (err) + goto err_job; + } + + xe_sched_job_arm(job); + dma_fence_put(fence); + fence = dma_fence_get(&job->drm.s_fence->finished); + xe_sched_job_push(job); + + dma_fence_put(m->fence); + m->fence = dma_fence_get(fence); + + mutex_unlock(&m->job_mutex); + + xe_bb_free(bb, fence); + size -= src_L0; + continue; + +err_job: + xe_sched_job_put(job); +err: + mutex_unlock(&m->job_mutex); + xe_bb_free(bb, NULL); + +err_sync: + /* Sync partial copy if any. FIXME: under job_mutex? 
*/ + if (fence) { + dma_fence_wait(fence, false); + dma_fence_put(fence); + } + + return ERR_PTR(err); + } + + return fence; +} + +static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs, + u32 size, u32 pitch) +{ + struct xe_device *xe = gt_to_xe(gt); + u32 *cs = bb->cs + bb->len; + u32 len = PVC_MEM_SET_CMD_LEN_DW; + + *cs++ = PVC_MEM_SET_CMD | PVC_MEM_SET_MATRIX | (len - 2); + *cs++ = pitch - 1; + *cs++ = (size / pitch) - 1; + *cs++ = pitch - 1; + *cs++ = lower_32_bits(src_ofs); + *cs++ = upper_32_bits(src_ofs); + if (GRAPHICS_VERx100(xe) >= 2000) + *cs++ = FIELD_PREP(XE2_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index); + else + *cs++ = FIELD_PREP(PVC_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index); + + xe_gt_assert(gt, cs - bb->cs == len + bb->len); + + bb->len += len; +} + +static void emit_clear_main_copy(struct xe_gt *gt, struct xe_bb *bb, + u64 src_ofs, u32 size, u32 pitch, bool is_vram) +{ + struct xe_device *xe = gt_to_xe(gt); + u32 *cs = bb->cs + bb->len; + u32 len = XY_FAST_COLOR_BLT_DW; + + if (GRAPHICS_VERx100(xe) < 1250) + len = 11; + + *cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 | + (len - 2); + if (GRAPHICS_VERx100(xe) >= 2000) + *cs++ = FIELD_PREP(XE2_XY_FAST_COLOR_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index) | + (pitch - 1); + else + *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, gt->mocs.uc_index) | + (pitch - 1); + *cs++ = 0; + *cs++ = (size / pitch) << 16 | pitch / 4; + *cs++ = lower_32_bits(src_ofs); + *cs++ = upper_32_bits(src_ofs); + *cs++ = (is_vram ? 0x0 : 0x1) << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT; + *cs++ = 0; + *cs++ = 0; + *cs++ = 0; + *cs++ = 0; + + if (len > 11) { + *cs++ = 0; + *cs++ = 0; + *cs++ = 0; + *cs++ = 0; + *cs++ = 0; + } + + xe_gt_assert(gt, cs - bb->cs == len + bb->len); + + bb->len += len; +} + +static bool has_service_copy_support(struct xe_gt *gt) +{ + /* + * What we care about is whether the architecture was designed with + * service copy functionality (specifically the new MEM_SET / MEM_COPY + * instructions) so check the architectural engine list rather than the + * actual list since these instructions are usable on BCS0 even if + * all of the actual service copy engines (BCS1-BCS8) have been fused + * off. + */ + return gt->info.__engine_mask & GENMASK(XE_HW_ENGINE_BCS8, + XE_HW_ENGINE_BCS1); +} + +static u32 emit_clear_cmd_len(struct xe_gt *gt) +{ + if (has_service_copy_support(gt)) + return PVC_MEM_SET_CMD_LEN_DW; + else + return XY_FAST_COLOR_BLT_DW; +} + +static void emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs, + u32 size, u32 pitch, bool is_vram) +{ + if (has_service_copy_support(gt)) + emit_clear_link_copy(gt, bb, src_ofs, size, pitch); + else + emit_clear_main_copy(gt, bb, src_ofs, size, pitch, + is_vram); +} + +/** + * xe_migrate_clear() - Clear content of a TTM resource. + * @m: The migration context. + * @bo: The buffer object @dst is currently bound to. + * @dst: The dst TTM resource to be cleared. + * + * Clear the contents of @dst to zero. On flat CCS devices, + * the CCS metadata is cleared to zero as well on VRAM destinations. + * TODO: Eliminate the @bo argument. + * + * Return: Pointer to a dma_fence representing the last clear batch, or + * an error pointer on failure. If there is a failure, any clear operation + * started by the function call has been synced. 
+ */ +struct dma_fence *xe_migrate_clear(struct xe_migrate *m, + struct xe_bo *bo, + struct ttm_resource *dst) +{ + bool clear_vram = mem_type_is_vram(dst->mem_type); + struct xe_gt *gt = m->tile->primary_gt; + struct xe_device *xe = gt_to_xe(gt); + bool clear_system_ccs = (xe_bo_needs_ccs_pages(bo) && !IS_DGFX(xe)) ? true : false; + struct dma_fence *fence = NULL; + u64 size = bo->size; + struct xe_res_cursor src_it; + struct ttm_resource *src = dst; + int err; + int pass = 0; + + if (!clear_vram) + xe_res_first_sg(xe_bo_sg(bo), 0, bo->size, &src_it); + else + xe_res_first(src, 0, bo->size, &src_it); + + while (size) { + u64 clear_L0_ofs; + u32 clear_L0_pt; + u32 flush_flags = 0; + u64 clear_L0; + struct xe_sched_job *job; + struct xe_bb *bb; + u32 batch_size, update_idx; + + bool usm = xe->info.has_usm; + u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE; + + clear_L0 = xe_migrate_res_sizes(xe, &src_it); + + drm_dbg(&xe->drm, "Pass %u, size: %llu\n", pass++, clear_L0); + + /* Calculate final sizes and batch size.. */ + batch_size = 2 + + pte_update_size(m, clear_vram, src, &src_it, + &clear_L0, &clear_L0_ofs, &clear_L0_pt, + clear_system_ccs ? 0 : emit_clear_cmd_len(gt), 0, + avail_pts); + + if (xe_device_has_flat_ccs(xe)) + batch_size += EMIT_COPY_CCS_DW; + + /* Clear commands */ + + if (WARN_ON_ONCE(!clear_L0)) + break; + + bb = xe_bb_new(gt, batch_size, usm); + if (IS_ERR(bb)) { + err = PTR_ERR(bb); + goto err_sync; + } + + size -= clear_L0; + /* Preemption is enabled again by the ring ops. */ + if (!clear_vram) { + emit_pte(m, bb, clear_L0_pt, clear_vram, true, &src_it, clear_L0, + bo); + } else { + xe_res_next(&src_it, clear_L0); + } + bb->cs[bb->len++] = MI_BATCH_BUFFER_END; + update_idx = bb->len; + + if (!clear_system_ccs) + emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE, clear_vram); + + if (xe_device_has_flat_ccs(xe)) { + emit_copy_ccs(gt, bb, clear_L0_ofs, true, + m->cleared_mem_ofs, false, clear_L0); + flush_flags = MI_FLUSH_DW_CCS; + } + + mutex_lock(&m->job_mutex); + job = xe_bb_create_migration_job(m->q, bb, + xe_migrate_batch_base(m, usm), + update_idx); + if (IS_ERR(job)) { + err = PTR_ERR(job); + goto err; + } + + xe_sched_job_add_migrate_flush(job, flush_flags); + if (!fence) { + /* + * There can't be anything userspace related at this + * point, so we just need to respect any potential move + * fences, which are always tracked as + * DMA_RESV_USAGE_KERNEL. + */ + err = job_add_deps(job, bo->ttm.base.resv, + DMA_RESV_USAGE_KERNEL); + if (err) + goto err_job; + } + + xe_sched_job_arm(job); + dma_fence_put(fence); + fence = dma_fence_get(&job->drm.s_fence->finished); + xe_sched_job_push(job); + + dma_fence_put(m->fence); + m->fence = dma_fence_get(fence); + + mutex_unlock(&m->job_mutex); + + xe_bb_free(bb, fence); + continue; + +err_job: + xe_sched_job_put(job); +err: + mutex_unlock(&m->job_mutex); + xe_bb_free(bb, NULL); +err_sync: + /* Sync partial copies if any. FIXME: job_mutex? 
*/ + if (fence) { + dma_fence_wait(m->fence, false); + dma_fence_put(fence); + } + + return ERR_PTR(err); + } + + if (clear_system_ccs) + bo->ccs_cleared = true; + + return fence; +} + +static void write_pgtable(struct xe_tile *tile, struct xe_bb *bb, u64 ppgtt_ofs, + const struct xe_vm_pgtable_update *update, + struct xe_migrate_pt_update *pt_update) +{ + const struct xe_migrate_pt_update_ops *ops = pt_update->ops; + u32 chunk; + u32 ofs = update->ofs, size = update->qwords; + + /* + * If we have 512 entries (max), we would populate it ourselves, + * and update the PDE above it to the new pointer. + * The only time this can only happen if we have to update the top + * PDE. This requires a BO that is almost vm->size big. + * + * This shouldn't be possible in practice.. might change when 16K + * pages are used. Hence the assert. + */ + xe_tile_assert(tile, update->qwords <= 0x1ff); + if (!ppgtt_ofs) + ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile), + xe_bo_addr(update->pt_bo, 0, + XE_PAGE_SIZE)); + + do { + u64 addr = ppgtt_ofs + ofs * 8; + + chunk = min(update->qwords, 0x1ffU); + + /* Ensure populatefn can do memset64 by aligning bb->cs */ + if (!(bb->len & 1)) + bb->cs[bb->len++] = MI_NOOP; + + bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk); + bb->cs[bb->len++] = lower_32_bits(addr); + bb->cs[bb->len++] = upper_32_bits(addr); + ops->populate(pt_update, tile, NULL, bb->cs + bb->len, ofs, chunk, + update); + + bb->len += chunk * 2; + ofs += chunk; + size -= chunk; + } while (size); +} + +struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m) +{ + return xe_vm_get(m->q->vm); +} + +#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) +struct migrate_test_params { + struct xe_test_priv base; + bool force_gpu; +}; + +#define to_migrate_test_params(_priv) \ + container_of(_priv, struct migrate_test_params, base) +#endif + +static struct dma_fence * +xe_migrate_update_pgtables_cpu(struct xe_migrate *m, + struct xe_vm *vm, struct xe_bo *bo, + const struct xe_vm_pgtable_update *updates, + u32 num_updates, bool wait_vm, + struct xe_migrate_pt_update *pt_update) +{ + XE_TEST_DECLARE(struct migrate_test_params *test = + to_migrate_test_params + (xe_cur_kunit_priv(XE_TEST_LIVE_MIGRATE));) + const struct xe_migrate_pt_update_ops *ops = pt_update->ops; + struct dma_fence *fence; + int err; + u32 i; + + if (XE_TEST_ONLY(test && test->force_gpu)) + return ERR_PTR(-ETIME); + + if (bo && !dma_resv_test_signaled(bo->ttm.base.resv, + DMA_RESV_USAGE_KERNEL)) + return ERR_PTR(-ETIME); + + if (wait_vm && !dma_resv_test_signaled(xe_vm_resv(vm), + DMA_RESV_USAGE_BOOKKEEP)) + return ERR_PTR(-ETIME); + + if (ops->pre_commit) { + pt_update->job = NULL; + err = ops->pre_commit(pt_update); + if (err) + return ERR_PTR(err); + } + for (i = 0; i < num_updates; i++) { + const struct xe_vm_pgtable_update *update = &updates[i]; + + ops->populate(pt_update, m->tile, &update->pt_bo->vmap, NULL, + update->ofs, update->qwords, update); + } + + if (vm) { + trace_xe_vm_cpu_bind(vm); + xe_device_wmb(vm->xe); + } + + fence = dma_fence_get_stub(); + + return fence; +} + +static bool no_in_syncs(struct xe_vm *vm, struct xe_exec_queue *q, + struct xe_sync_entry *syncs, u32 num_syncs) +{ + struct dma_fence *fence; + int i; + + for (i = 0; i < num_syncs; i++) { + fence = syncs[i].fence; + + if (fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, + &fence->flags)) + return false; + } + if (q) { + fence = xe_exec_queue_last_fence_get(q, vm); + if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) + return false; + } + + return true; +} 
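+
+/*
+ * Illustrative sketch (compiled out, not driver code): a minimal user of the
+ * struct xe_migrate_pt_update_ops interface consumed by
+ * xe_migrate_update_pgtables() below. The populate() hook here just zeroes
+ * the requested qwords; the real page-table code writes encoded PTEs
+ * instead. The "example_" names are hypothetical.
+ */
+#if 0
+static void example_populate(struct xe_migrate_pt_update *pt_update,
+			     struct xe_tile *tile, struct iosys_map *map,
+			     void *pos, u32 ofs, u32 num_qwords,
+			     const struct xe_vm_pgtable_update *update)
+{
+	if (map)
+		/* CPU path: @ofs is a qword offset into the PT bo mapping. */
+		xe_map_memset(tile_to_xe(tile), map, ofs * sizeof(u64), 0,
+			      num_qwords * sizeof(u64));
+	else
+		/* GPU path: @pos points at the MI_STORE_DATA_IMM payload. */
+		memset64(pos, 0, num_qwords);
+}
+
+static const struct xe_migrate_pt_update_ops example_pt_update_ops = {
+	.populate = example_populate,
+	/* .pre_commit is optional and only called when non-NULL. */
+};
+#endif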
+ +/** + * xe_migrate_update_pgtables() - Pipelined page-table update + * @m: The migrate context. + * @vm: The vm we'll be updating. + * @bo: The bo whose dma-resv we will await before updating, or NULL if userptr. + * @q: The exec queue to be used for the update or NULL if the default + * migration engine is to be used. + * @updates: An array of update descriptors. + * @num_updates: Number of descriptors in @updates. + * @syncs: Array of xe_sync_entry to await before updating. Note that waits + * will block the engine timeline. + * @num_syncs: Number of entries in @syncs. + * @pt_update: Pointer to a struct xe_migrate_pt_update, which contains + * pointers to callback functions and, if subclassed, private arguments to + * those. + * + * Perform a pipelined page-table update. The update descriptors are typically + * built under the same lock critical section as a call to this function. If + * using the default engine for the updates, they will be performed in the + * order they grab the job_mutex. If different engines are used, external + * synchronization is needed for overlapping updates to maintain page-table + * consistency. Note that the meaning of "overlapping" is that the updates + * touch the same page-table, which might be a higher-level page-directory. + * If no pipelining is needed, then updates may be performed by the cpu. + * + * Return: A dma_fence that, when signaled, indicates the update completion. + */ +struct dma_fence * +xe_migrate_update_pgtables(struct xe_migrate *m, + struct xe_vm *vm, + struct xe_bo *bo, + struct xe_exec_queue *q, + const struct xe_vm_pgtable_update *updates, + u32 num_updates, + struct xe_sync_entry *syncs, u32 num_syncs, + struct xe_migrate_pt_update *pt_update) +{ + const struct xe_migrate_pt_update_ops *ops = pt_update->ops; + struct xe_tile *tile = m->tile; + struct xe_gt *gt = tile->primary_gt; + struct xe_device *xe = tile_to_xe(tile); + struct xe_sched_job *job; + struct dma_fence *fence; + struct drm_suballoc *sa_bo = NULL; + struct xe_vma *vma = pt_update->vma; + struct xe_bb *bb; + u32 i, batch_size, ppgtt_ofs, update_idx, page_ofs = 0; + u64 addr; + int err = 0; + bool usm = !q && xe->info.has_usm; + bool first_munmap_rebind = vma && + vma->gpuva.flags & XE_VMA_FIRST_REBIND; + struct xe_exec_queue *q_override = !q ? m->q : q; + u16 pat_index = xe->pat.idx[XE_CACHE_WB]; + + /* Use the CPU if no in syncs and engine is idle */ + if (no_in_syncs(vm, q, syncs, num_syncs) && xe_exec_queue_is_idle(q_override)) { + fence = xe_migrate_update_pgtables_cpu(m, vm, bo, updates, + num_updates, + first_munmap_rebind, + pt_update); + if (!IS_ERR(fence) || fence == ERR_PTR(-EAGAIN)) + return fence; + } + + /* fixed + PTE entries */ + if (IS_DGFX(xe)) + batch_size = 2; + else + batch_size = 6 + num_updates * 2; + + for (i = 0; i < num_updates; i++) { + u32 num_cmds = DIV_ROUND_UP(updates[i].qwords, 0x1ff); + + /* align noop + MI_STORE_DATA_IMM cmd prefix */ + batch_size += 4 * num_cmds + updates[i].qwords * 2; + } + + /* + * XXX: Create temp bo to copy from, if batch_size becomes too big? + * + * Worst case: Sum(2 * (each lower level page size) + (top level page size)) + * Should be reasonably bound.. + */ + xe_tile_assert(tile, batch_size < SZ_128K); + + bb = xe_bb_new(gt, batch_size, !q && xe->info.has_usm); + if (IS_ERR(bb)) + return ERR_CAST(bb); + + /* For sysmem PTE's, need to map them in our hole.. 
*/ + if (!IS_DGFX(xe)) { + ppgtt_ofs = NUM_KERNEL_PDE - 1; + if (q) { + xe_tile_assert(tile, num_updates <= NUM_VMUSA_WRITES_PER_UNIT); + + sa_bo = drm_suballoc_new(&m->vm_update_sa, 1, + GFP_KERNEL, true, 0); + if (IS_ERR(sa_bo)) { + err = PTR_ERR(sa_bo); + goto err; + } + + ppgtt_ofs = NUM_KERNEL_PDE + + (drm_suballoc_soffset(sa_bo) / + NUM_VMUSA_UNIT_PER_PAGE); + page_ofs = (drm_suballoc_soffset(sa_bo) % + NUM_VMUSA_UNIT_PER_PAGE) * + VM_SA_UPDATE_UNIT_SIZE; + } + + /* Map our PT's to gtt */ + bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(num_updates); + bb->cs[bb->len++] = ppgtt_ofs * XE_PAGE_SIZE + page_ofs; + bb->cs[bb->len++] = 0; /* upper_32_bits */ + + for (i = 0; i < num_updates; i++) { + struct xe_bo *pt_bo = updates[i].pt_bo; + + xe_tile_assert(tile, pt_bo->size == SZ_4K); + + addr = vm->pt_ops->pte_encode_bo(pt_bo, 0, pat_index, 0); + bb->cs[bb->len++] = lower_32_bits(addr); + bb->cs[bb->len++] = upper_32_bits(addr); + } + + bb->cs[bb->len++] = MI_BATCH_BUFFER_END; + update_idx = bb->len; + + addr = xe_migrate_vm_addr(ppgtt_ofs, 0) + + (page_ofs / sizeof(u64)) * XE_PAGE_SIZE; + for (i = 0; i < num_updates; i++) + write_pgtable(tile, bb, addr + i * XE_PAGE_SIZE, + &updates[i], pt_update); + } else { + /* phys pages, no preamble required */ + bb->cs[bb->len++] = MI_BATCH_BUFFER_END; + update_idx = bb->len; + + for (i = 0; i < num_updates; i++) + write_pgtable(tile, bb, 0, &updates[i], pt_update); + } + + if (!q) + mutex_lock(&m->job_mutex); + + job = xe_bb_create_migration_job(q ?: m->q, bb, + xe_migrate_batch_base(m, usm), + update_idx); + if (IS_ERR(job)) { + err = PTR_ERR(job); + goto err_bb; + } + + /* Wait on BO move */ + if (bo) { + err = job_add_deps(job, bo->ttm.base.resv, + DMA_RESV_USAGE_KERNEL); + if (err) + goto err_job; + } + + /* + * Munmap style VM unbind, need to wait for all jobs to be complete / + * trigger preempts before moving forward + */ + if (first_munmap_rebind) { + err = job_add_deps(job, xe_vm_resv(vm), + DMA_RESV_USAGE_BOOKKEEP); + if (err) + goto err_job; + } + + err = xe_sched_job_last_fence_add_dep(job, vm); + for (i = 0; !err && i < num_syncs; i++) + err = xe_sync_entry_add_deps(&syncs[i], job); + + if (err) + goto err_job; + + if (ops->pre_commit) { + pt_update->job = job; + err = ops->pre_commit(pt_update); + if (err) + goto err_job; + } + xe_sched_job_arm(job); + fence = dma_fence_get(&job->drm.s_fence->finished); + xe_sched_job_push(job); + + if (!q) + mutex_unlock(&m->job_mutex); + + xe_bb_free(bb, fence); + drm_suballoc_free(sa_bo, fence); + + return fence; + +err_job: + xe_sched_job_put(job); +err_bb: + if (!q) + mutex_unlock(&m->job_mutex); + xe_bb_free(bb, NULL); +err: + drm_suballoc_free(sa_bo, NULL); + return ERR_PTR(err); +} + +/** + * xe_migrate_wait() - Complete all operations using the xe_migrate context + * @m: Migrate context to wait for. + * + * Waits until the GPU no longer uses the migrate context's default engine + * or its page-table objects. FIXME: What about separate page-table update + * engines? 
+ */ +void xe_migrate_wait(struct xe_migrate *m) +{ + if (m->fence) + dma_fence_wait(m->fence, false); +} + +#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) +#include "tests/xe_migrate.c" +#endif diff --git a/drivers/gpu/drm/xe/xe_migrate.h b/drivers/gpu/drm/xe/xe_migrate.h new file mode 100644 index 000000000000..951f19318ea4 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_migrate.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2020 Intel Corporation + */ + +#ifndef _XE_MIGRATE_ +#define _XE_MIGRATE_ + +#include <drm/drm_mm.h> + +struct dma_fence; +struct iosys_map; +struct ttm_resource; + +struct xe_bo; +struct xe_gt; +struct xe_exec_queue; +struct xe_migrate; +struct xe_migrate_pt_update; +struct xe_sync_entry; +struct xe_pt; +struct xe_tile; +struct xe_vm; +struct xe_vm_pgtable_update; +struct xe_vma; + +/** + * struct xe_migrate_pt_update_ops - Callbacks for the + * xe_migrate_update_pgtables() function. + */ +struct xe_migrate_pt_update_ops { + /** + * @populate: Populate a command buffer or page-table with ptes. + * @pt_update: Embeddable callback argument. + * @tile: The tile for the current operation. + * @map: struct iosys_map into the memory to be populated. + * @pos: If @map is NULL, map into the memory to be populated. + * @ofs: qword offset into @map, unused if @map is NULL. + * @num_qwords: Number of qwords to write. + * @update: Information about the PTEs to be inserted. + * + * This interface is intended to be used as a callback into the + * page-table system to populate command buffers or shared + * page-tables with PTEs. + */ + void (*populate)(struct xe_migrate_pt_update *pt_update, + struct xe_tile *tile, struct iosys_map *map, + void *pos, u32 ofs, u32 num_qwords, + const struct xe_vm_pgtable_update *update); + + /** + * @pre_commit: Callback to be called just before arming the + * sched_job. + * @pt_update: Pointer to embeddable callback argument. + * + * Return: 0 on success, negative error code on error. + */ + int (*pre_commit)(struct xe_migrate_pt_update *pt_update); +}; + +/** + * struct xe_migrate_pt_update - Argument to the + * struct xe_migrate_pt_update_ops callbacks. + * + * Intended to be subclassed to support additional arguments if necessary. + */ +struct xe_migrate_pt_update { + /** @ops: Pointer to the struct xe_migrate_pt_update_ops callbacks */ + const struct xe_migrate_pt_update_ops *ops; + /** @vma: The vma we're updating the pagetable for. */ + struct xe_vma *vma; + /** @job: The job if a GPU page-table update. 
NULL otherwise */ + struct xe_sched_job *job; + /** @start: Start of update for the range fence */ + u64 start; + /** @last: Last of update for the range fence */ + u64 last; + /** @tile_id: Tile ID of the update */ + u8 tile_id; +}; + +struct xe_migrate *xe_migrate_init(struct xe_tile *tile); + +struct dma_fence *xe_migrate_copy(struct xe_migrate *m, + struct xe_bo *src_bo, + struct xe_bo *dst_bo, + struct ttm_resource *src, + struct ttm_resource *dst, + bool copy_only_ccs); + +struct dma_fence *xe_migrate_clear(struct xe_migrate *m, + struct xe_bo *bo, + struct ttm_resource *dst); + +struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m); + +struct dma_fence * +xe_migrate_update_pgtables(struct xe_migrate *m, + struct xe_vm *vm, + struct xe_bo *bo, + struct xe_exec_queue *q, + const struct xe_vm_pgtable_update *updates, + u32 num_updates, + struct xe_sync_entry *syncs, u32 num_syncs, + struct xe_migrate_pt_update *pt_update); + +void xe_migrate_wait(struct xe_migrate *m); + +struct xe_exec_queue *xe_tile_migrate_engine(struct xe_tile *tile); +#endif diff --git a/drivers/gpu/drm/xe/xe_migrate_doc.h b/drivers/gpu/drm/xe/xe_migrate_doc.h new file mode 100644 index 000000000000..63c7d67b5b62 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_migrate_doc.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_MIGRATE_DOC_H_ +#define _XE_MIGRATE_DOC_H_ + +/** + * DOC: Migrate Layer + * + * The XE migrate layer is used generate jobs which can copy memory (eviction), + * clear memory, or program tables (binds). This layer exists in every GT, has + * a migrate engine, and uses a special VM for all generated jobs. + * + * Special VM details + * ================== + * + * The special VM is configured with a page structure where we can dynamically + * map BOs which need to be copied and cleared, dynamically map other VM's page + * table BOs for updates, and identity map the entire device's VRAM with 1 GB + * pages. + * + * Currently the page structure consists of 32 physical pages with 16 being + * reserved for BO mapping during copies and clear, 1 reserved for kernel binds, + * several pages are needed to setup the identity mappings (exact number based + * on how many bits of address space the device has), and the rest are reserved + * user bind operations. + * + * TODO: Diagram of layout + * + * Bind jobs + * ========= + * + * A bind job consist of two batches and runs either on the migrate engine + * (kernel binds) or the bind engine passed in (user binds). In both cases the + * VM of the engine is the migrate VM. + * + * The first batch is used to update the migration VM page structure to point to + * the bind VM page table BOs which need to be updated. A physical page is + * required for this. If it is a user bind, the page is allocated from pool of + * pages reserved user bind operations with drm_suballoc managing this pool. If + * it is a kernel bind, the page reserved for kernel binds is used. + * + * The first batch is only required for devices without VRAM as when the device + * has VRAM the bind VM page table BOs are in VRAM and the identity mapping can + * be used. + * + * The second batch is used to program page table updated in the bind VM. Why + * not just one batch? Well the TLBs need to be invalidated between these two + * batches and that only can be done from the ring. + * + * When the bind job complete, the page allocated is returned the pool of pages + * reserved for user bind operations if a user bind. 
No need do this for kernel + * binds as the reserved kernel page is serially used by each job. + * + * Copy / clear jobs + * ================= + * + * A copy or clear job consist of two batches and runs on the migrate engine. + * + * Like binds, the first batch is used update the migration VM page structure. + * In copy jobs, we need to map the source and destination of the BO into page + * the structure. In clear jobs, we just need to add 1 mapping of BO into the + * page structure. We use the 16 reserved pages in migration VM for mappings, + * this gives us a maximum copy size of 16 MB and maximum clear size of 32 MB. + * + * The second batch is used do either do the copy or clear. Again similar to + * binds, two batches are required as the TLBs need to be invalidated from the + * ring between the batches. + * + * More than one job will be generated if the BO is larger than maximum copy / + * clear size. + * + * Future work + * =========== + * + * Update copy and clear code to use identity mapped VRAM. + * + * Can we rework the use of the pages async binds to use all the entries in each + * page? + * + * Using large pages for sysmem mappings. + * + * Is it possible to identity map the sysmem? We should explore this. + */ + +#endif diff --git a/drivers/gpu/drm/xe/xe_mmio.c b/drivers/gpu/drm/xe/xe_mmio.c new file mode 100644 index 000000000000..f660cfb79f50 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_mmio.c @@ -0,0 +1,524 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021-2023 Intel Corporation + */ + +#include <linux/minmax.h> + +#include "xe_mmio.h" + +#include <drm/drm_managed.h> +#include <drm/xe_drm.h> + +#include "regs/xe_engine_regs.h" +#include "regs/xe_gt_regs.h" +#include "regs/xe_regs.h" +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_ggtt.h" +#include "xe_gt.h" +#include "xe_gt_mcr.h" +#include "xe_macros.h" +#include "xe_module.h" +#include "xe_tile.h" + +#define XEHP_MTCFG_ADDR XE_REG(0x101800) +#define TILE_COUNT REG_GENMASK(15, 8) + +#define BAR_SIZE_SHIFT 20 + +static void +_resize_bar(struct xe_device *xe, int resno, resource_size_t size) +{ + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + int bar_size = pci_rebar_bytes_to_size(size); + int ret; + + if (pci_resource_len(pdev, resno)) + pci_release_resource(pdev, resno); + + ret = pci_resize_resource(pdev, resno, bar_size); + if (ret) { + drm_info(&xe->drm, "Failed to resize BAR%d to %dM (%pe). Consider enabling 'Resizable BAR' support in your BIOS\n", + resno, 1 << bar_size, ERR_PTR(ret)); + return; + } + + drm_info(&xe->drm, "BAR%d resized to %dM\n", resno, 1 << bar_size); +} + +/* + * if force_vram_bar_size is set, attempt to set to the requested size + * else set to maximum possible size + */ +static void xe_resize_vram_bar(struct xe_device *xe) +{ + u64 force_vram_bar_size = xe_modparam.force_vram_bar_size; + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + struct pci_bus *root = pdev->bus; + resource_size_t current_size; + resource_size_t rebar_size; + struct resource *root_res; + u32 bar_size_mask; + u32 pci_cmd; + int i; + + /* gather some relevant info */ + current_size = pci_resource_len(pdev, LMEM_BAR); + bar_size_mask = pci_rebar_get_possible_sizes(pdev, LMEM_BAR); + + if (!bar_size_mask) + return; + + /* set to a specific size? 
*/ + if (force_vram_bar_size) { + u32 bar_size_bit; + + rebar_size = force_vram_bar_size * (resource_size_t)SZ_1M; + + bar_size_bit = bar_size_mask & BIT(pci_rebar_bytes_to_size(rebar_size)); + + if (!bar_size_bit) { + drm_info(&xe->drm, + "Requested size: %lluMiB is not supported by rebar sizes: 0x%x. Leaving default: %lluMiB\n", + (u64)rebar_size >> 20, bar_size_mask, (u64)current_size >> 20); + return; + } + + rebar_size = 1ULL << (__fls(bar_size_bit) + BAR_SIZE_SHIFT); + + if (rebar_size == current_size) + return; + } else { + rebar_size = 1ULL << (__fls(bar_size_mask) + BAR_SIZE_SHIFT); + + /* only resize if larger than current */ + if (rebar_size <= current_size) + return; + } + + drm_info(&xe->drm, "Attempting to resize bar from %lluMiB -> %lluMiB\n", + (u64)current_size >> 20, (u64)rebar_size >> 20); + + while (root->parent) + root = root->parent; + + pci_bus_for_each_resource(root, root_res, i) { + if (root_res && root_res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && + root_res->start > 0x100000000ull) + break; + } + + if (!root_res) { + drm_info(&xe->drm, "Can't resize VRAM BAR - platform support is missing. Consider enabling 'Resizable BAR' support in your BIOS\n"); + return; + } + + pci_read_config_dword(pdev, PCI_COMMAND, &pci_cmd); + pci_write_config_dword(pdev, PCI_COMMAND, pci_cmd & ~PCI_COMMAND_MEMORY); + + _resize_bar(xe, LMEM_BAR, rebar_size); + + pci_assign_unassigned_bus_resources(pdev->bus); + pci_write_config_dword(pdev, PCI_COMMAND, pci_cmd); +} + +static bool xe_pci_resource_valid(struct pci_dev *pdev, int bar) +{ + if (!pci_resource_flags(pdev, bar)) + return false; + + if (pci_resource_flags(pdev, bar) & IORESOURCE_UNSET) + return false; + + if (!pci_resource_len(pdev, bar)) + return false; + + return true; +} + +static int xe_determine_lmem_bar_size(struct xe_device *xe) +{ + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + + if (!xe_pci_resource_valid(pdev, LMEM_BAR)) { + drm_err(&xe->drm, "pci resource is not valid\n"); + return -ENXIO; + } + + xe_resize_vram_bar(xe); + + xe->mem.vram.io_start = pci_resource_start(pdev, LMEM_BAR); + xe->mem.vram.io_size = pci_resource_len(pdev, LMEM_BAR); + if (!xe->mem.vram.io_size) + return -EIO; + + /* XXX: Need to change when xe link code is ready */ + xe->mem.vram.dpa_base = 0; + + /* set up a map to the total memory area. */ + xe->mem.vram.mapping = ioremap_wc(xe->mem.vram.io_start, xe->mem.vram.io_size); + + return 0; +} + +/** + * xe_mmio_tile_vram_size() - Collect vram size and offset information + * @tile: tile to get info for + * @vram_size: available vram (size - device reserved portions) + * @tile_size: actual vram size + * @tile_offset: physical start point in the vram address space + * + * There are 4 places for size information: + * - io size (from pci_resource_len of LMEM bar) (only used for small bar and DG1) + * - TILEx size (actual vram size) + * - GSMBASE offset (TILEx - "stolen") + * - CSSBASE offset (TILEx - CSS space necessary) + * + * CSSBASE is always a lower/smaller offset then GSMBASE. + * + * The actual available size of memory is to the CCS or GSM base. + * NOTE: multi-tile bases will include the tile offset. 
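+ *
+ * For example (register value made up purely for illustration): a
+ * TILE_ADDR_RANGE readback of 0x1020 decodes as bits [14:8] = 0x10, i.e. a
+ * 16 GiB tile size, and bits [7:1] = 0x10, i.e. a 16 GiB tile offset, so the
+ * tile's DPA range is [16 GiB, 32 GiB) before the GSMBASE/CCS reservation is
+ * subtracted to get the usable size.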
+ * + */ +static int xe_mmio_tile_vram_size(struct xe_tile *tile, u64 *vram_size, + u64 *tile_size, u64 *tile_offset) +{ + struct xe_device *xe = tile_to_xe(tile); + struct xe_gt *gt = tile->primary_gt; + u64 offset; + int err; + u32 reg; + + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + if (err) + return err; + + /* actual size */ + if (unlikely(xe->info.platform == XE_DG1)) { + *tile_size = pci_resource_len(to_pci_dev(xe->drm.dev), LMEM_BAR); + *tile_offset = 0; + } else { + reg = xe_gt_mcr_unicast_read_any(gt, XEHP_TILE_ADDR_RANGE(gt->info.id)); + *tile_size = (u64)REG_FIELD_GET(GENMASK(14, 8), reg) * SZ_1G; + *tile_offset = (u64)REG_FIELD_GET(GENMASK(7, 1), reg) * SZ_1G; + } + + /* minus device usage */ + if (xe->info.has_flat_ccs) { + reg = xe_gt_mcr_unicast_read_any(gt, XEHP_FLAT_CCS_BASE_ADDR); + offset = (u64)REG_FIELD_GET(GENMASK(31, 8), reg) * SZ_64K; + } else { + offset = xe_mmio_read64_2x32(gt, GSMBASE); + } + + /* remove the tile offset so we have just the available size */ + *vram_size = offset - *tile_offset; + + return xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); +} + +int xe_mmio_probe_vram(struct xe_device *xe) +{ + struct xe_tile *tile; + resource_size_t io_size; + u64 available_size = 0; + u64 total_size = 0; + u64 tile_offset; + u64 tile_size; + u64 vram_size; + int err; + u8 id; + + if (!IS_DGFX(xe)) + return 0; + + /* Get the size of the root tile's vram for later accessibility comparison */ + tile = xe_device_get_root_tile(xe); + err = xe_mmio_tile_vram_size(tile, &vram_size, &tile_size, &tile_offset); + if (err) + return err; + + err = xe_determine_lmem_bar_size(xe); + if (err) + return err; + + drm_info(&xe->drm, "VISIBLE VRAM: %pa, %pa\n", &xe->mem.vram.io_start, + &xe->mem.vram.io_size); + + io_size = xe->mem.vram.io_size; + + /* tile specific ranges */ + for_each_tile(tile, xe, id) { + err = xe_mmio_tile_vram_size(tile, &vram_size, &tile_size, &tile_offset); + if (err) + return err; + + tile->mem.vram.actual_physical_size = tile_size; + tile->mem.vram.io_start = xe->mem.vram.io_start + tile_offset; + tile->mem.vram.io_size = min_t(u64, vram_size, io_size); + + if (!tile->mem.vram.io_size) { + drm_err(&xe->drm, "Tile without any CPU visible VRAM. 
Aborting.\n"); + return -ENODEV; + } + + tile->mem.vram.dpa_base = xe->mem.vram.dpa_base + tile_offset; + tile->mem.vram.usable_size = vram_size; + tile->mem.vram.mapping = xe->mem.vram.mapping + tile_offset; + + if (tile->mem.vram.io_size < tile->mem.vram.usable_size) + drm_info(&xe->drm, "Small BAR device\n"); + drm_info(&xe->drm, "VRAM[%u, %u]: Actual physical size %pa, usable size exclude stolen %pa, CPU accessible size %pa\n", id, + tile->id, &tile->mem.vram.actual_physical_size, &tile->mem.vram.usable_size, &tile->mem.vram.io_size); + drm_info(&xe->drm, "VRAM[%u, %u]: DPA range: [%pa-%llx], io range: [%pa-%llx]\n", id, tile->id, + &tile->mem.vram.dpa_base, tile->mem.vram.dpa_base + tile->mem.vram.actual_physical_size, + &tile->mem.vram.io_start, tile->mem.vram.io_start + tile->mem.vram.io_size); + + /* calculate total size using tile size to get the correct HW sizing */ + total_size += tile_size; + available_size += vram_size; + + if (total_size > xe->mem.vram.io_size) { + drm_info(&xe->drm, "VRAM: %pa is larger than resource %pa\n", + &total_size, &xe->mem.vram.io_size); + } + + io_size -= min_t(u64, tile_size, io_size); + } + + xe->mem.vram.actual_physical_size = total_size; + + drm_info(&xe->drm, "Total VRAM: %pa, %pa\n", &xe->mem.vram.io_start, + &xe->mem.vram.actual_physical_size); + drm_info(&xe->drm, "Available VRAM: %pa, %pa\n", &xe->mem.vram.io_start, + &available_size); + + return 0; +} + +void xe_mmio_probe_tiles(struct xe_device *xe) +{ + size_t tile_mmio_size = SZ_16M, tile_mmio_ext_size = xe->info.tile_mmio_ext_size; + u8 id, tile_count = xe->info.tile_count; + struct xe_gt *gt = xe_root_mmio_gt(xe); + struct xe_tile *tile; + void *regs; + u32 mtcfg; + + if (tile_count == 1) + goto add_mmio_ext; + + if (!xe->info.skip_mtcfg) { + mtcfg = xe_mmio_read64_2x32(gt, XEHP_MTCFG_ADDR); + tile_count = REG_FIELD_GET(TILE_COUNT, mtcfg) + 1; + if (tile_count < xe->info.tile_count) { + drm_info(&xe->drm, "tile_count: %d, reduced_tile_count %d\n", + xe->info.tile_count, tile_count); + xe->info.tile_count = tile_count; + + /* + * FIXME: Needs some work for standalone media, but should be impossible + * with multi-tile for now. + */ + xe->info.gt_count = xe->info.tile_count; + } + } + + regs = xe->mmio.regs; + for_each_tile(tile, xe, id) { + tile->mmio.size = tile_mmio_size; + tile->mmio.regs = regs; + regs += tile_mmio_size; + } + +add_mmio_ext: + /* + * By design, there's a contiguous multi-tile MMIO space (16MB hard coded per tile). + * When supported, there could be an additional contiguous multi-tile MMIO extension + * space ON TOP of it, and hence the necessity for distinguished MMIO spaces. + */ + if (xe->info.has_mmio_ext) { + regs = xe->mmio.regs + tile_mmio_size * tile_count; + + for_each_tile(tile, xe, id) { + tile->mmio_ext.size = tile_mmio_ext_size; + tile->mmio_ext.regs = regs; + + regs += tile_mmio_ext_size; + } + } +} + +static void mmio_fini(struct drm_device *drm, void *arg) +{ + struct xe_device *xe = arg; + + pci_iounmap(to_pci_dev(xe->drm.dev), xe->mmio.regs); + if (xe->mem.vram.mapping) + iounmap(xe->mem.vram.mapping); +} + +static int xe_verify_lmem_ready(struct xe_device *xe) +{ + struct xe_gt *gt = xe_root_mmio_gt(xe); + + /* + * The boot firmware initializes local memory and assesses its health. + * If memory training fails, the punit will have been instructed to + * keep the GT powered down; we won't be able to communicate with it + * and we should not continue with driver initialization. 
+ */ + if (IS_DGFX(xe) && !(xe_mmio_read32(gt, GU_CNTL) & LMEM_INIT)) { + drm_err(&xe->drm, "VRAM not initialized by firmware\n"); + return -ENODEV; + } + + return 0; +} + +int xe_mmio_init(struct xe_device *xe) +{ + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + const int mmio_bar = 0; + + /* + * Map the entire BAR. + * The first 16MB of the BAR, belong to the root tile, and include: + * registers (0-4MB), reserved space (4MB-8MB) and GGTT (8MB-16MB). + */ + xe->mmio.size = pci_resource_len(pdev, mmio_bar); + xe->mmio.regs = pci_iomap(pdev, mmio_bar, 0); + if (xe->mmio.regs == NULL) { + drm_err(&xe->drm, "failed to map registers\n"); + return -EIO; + } + + return drmm_add_action_or_reset(&xe->drm, mmio_fini, xe); +} + +int xe_mmio_root_tile_init(struct xe_device *xe) +{ + struct xe_tile *root_tile = xe_device_get_root_tile(xe); + int err; + + /* Setup first tile; other tiles (if present) will be setup later. */ + root_tile->mmio.size = SZ_16M; + root_tile->mmio.regs = xe->mmio.regs; + + err = xe_verify_lmem_ready(xe); + if (err) + return err; + + return 0; +} + +/** + * xe_mmio_read64_2x32() - Read a 64-bit register as two 32-bit reads + * @gt: MMIO target GT + * @reg: register to read value from + * + * Although Intel GPUs have some 64-bit registers, the hardware officially + * only supports GTTMMADR register reads of 32 bits or smaller. Even if + * a readq operation may return a reasonable value, that violation of the + * spec shouldn't be relied upon and all 64-bit register reads should be + * performed as two 32-bit reads of the upper and lower dwords. + * + * When reading registers that may be changing (such as + * counters), a rollover of the lower dword between the two 32-bit reads + * can be problematic. This function attempts to ensure the upper dword has + * stabilized before returning the 64-bit value. + * + * Note that because this function may re-read the register multiple times + * while waiting for the value to stabilize it should not be used to read + * any registers where read operations have side effects. + * + * Returns the value of the 64-bit register. + */ +u64 xe_mmio_read64_2x32(struct xe_gt *gt, struct xe_reg reg) +{ + struct xe_reg reg_udw = { .addr = reg.addr + 0x4 }; + u32 ldw, udw, oldudw, retries; + + if (reg.addr < gt->mmio.adj_limit) { + reg.addr += gt->mmio.adj_offset; + reg_udw.addr += gt->mmio.adj_offset; + } + + oldudw = xe_mmio_read32(gt, reg_udw); + for (retries = 5; retries; --retries) { + ldw = xe_mmio_read32(gt, reg); + udw = xe_mmio_read32(gt, reg_udw); + + if (udw == oldudw) + break; + + oldudw = udw; + } + + xe_gt_WARN(gt, retries == 0, + "64-bit read of %#x did not stabilize\n", reg.addr); + + return (u64)udw << 32 | ldw; +} + +/** + * xe_mmio_wait32() - Wait for a register to match the desired masked value + * @gt: MMIO target GT + * @reg: register to read value from + * @mask: mask to be applied to the value read from the register + * @val: desired value after applying the mask + * @timeout_us: time out after this period of time. Wait logic tries to be + * smart, applying an exponential backoff until @timeout_us is reached. + * @out_val: if not NULL, points where to store the last unmasked value + * @atomic: needs to be true if calling from an atomic context + * + * This function polls for the desired masked value and returns zero on success + * or -ETIMEDOUT if timed out. + * + * Note that @timeout_us represents the minimum amount of time to wait before + * giving up. 
The actual time taken by this function can be a little more than + * @timeout_us for different reasons, specially in non-atomic contexts. Thus, + * it is possible that this function succeeds even after @timeout_us has passed. + */ +int xe_mmio_wait32(struct xe_gt *gt, struct xe_reg reg, u32 mask, u32 val, u32 timeout_us, + u32 *out_val, bool atomic) +{ + ktime_t cur = ktime_get_raw(); + const ktime_t end = ktime_add_us(cur, timeout_us); + int ret = -ETIMEDOUT; + s64 wait = 10; + u32 read; + + for (;;) { + read = xe_mmio_read32(gt, reg); + if ((read & mask) == val) { + ret = 0; + break; + } + + cur = ktime_get_raw(); + if (!ktime_before(cur, end)) + break; + + if (ktime_after(ktime_add_us(cur, wait), end)) + wait = ktime_us_delta(end, cur); + + if (atomic) + udelay(wait); + else + usleep_range(wait, wait << 1); + wait <<= 1; + } + + if (ret != 0) { + read = xe_mmio_read32(gt, reg); + if ((read & mask) == val) + ret = 0; + } + + if (out_val) + *out_val = read; + + return ret; +} diff --git a/drivers/gpu/drm/xe/xe_mmio.h b/drivers/gpu/drm/xe/xe_mmio.h new file mode 100644 index 000000000000..98de5c13c89b --- /dev/null +++ b/drivers/gpu/drm/xe/xe_mmio.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021-2023 Intel Corporation + */ + +#ifndef _XE_MMIO_H_ +#define _XE_MMIO_H_ + +#include <linux/delay.h> +#include <linux/io-64-nonatomic-lo-hi.h> + +#include "regs/xe_reg_defs.h" +#include "xe_device_types.h" +#include "xe_gt_printk.h" +#include "xe_gt_types.h" + +struct drm_device; +struct drm_file; +struct xe_device; + +#define LMEM_BAR 2 + +int xe_mmio_init(struct xe_device *xe); +int xe_mmio_root_tile_init(struct xe_device *xe); +void xe_mmio_probe_tiles(struct xe_device *xe); + +static inline u8 xe_mmio_read8(struct xe_gt *gt, struct xe_reg reg) +{ + struct xe_tile *tile = gt_to_tile(gt); + + if (reg.addr < gt->mmio.adj_limit) + reg.addr += gt->mmio.adj_offset; + + return readb((reg.ext ? tile->mmio_ext.regs : tile->mmio.regs) + reg.addr); +} + +static inline u16 xe_mmio_read16(struct xe_gt *gt, struct xe_reg reg) +{ + struct xe_tile *tile = gt_to_tile(gt); + + if (reg.addr < gt->mmio.adj_limit) + reg.addr += gt->mmio.adj_offset; + + return readw((reg.ext ? tile->mmio_ext.regs : tile->mmio.regs) + reg.addr); +} + +static inline void xe_mmio_write32(struct xe_gt *gt, + struct xe_reg reg, u32 val) +{ + struct xe_tile *tile = gt_to_tile(gt); + + if (reg.addr < gt->mmio.adj_limit) + reg.addr += gt->mmio.adj_offset; + + writel(val, (reg.ext ? tile->mmio_ext.regs : tile->mmio.regs) + reg.addr); +} + +static inline u32 xe_mmio_read32(struct xe_gt *gt, struct xe_reg reg) +{ + struct xe_tile *tile = gt_to_tile(gt); + + if (reg.addr < gt->mmio.adj_limit) + reg.addr += gt->mmio.adj_offset; + + return readl((reg.ext ? tile->mmio_ext.regs : tile->mmio.regs) + reg.addr); +} + +static inline u32 xe_mmio_rmw32(struct xe_gt *gt, struct xe_reg reg, u32 clr, + u32 set) +{ + u32 old, reg_val; + + old = xe_mmio_read32(gt, reg); + reg_val = (old & ~clr) | set; + xe_mmio_write32(gt, reg, reg_val); + + return old; +} + +static inline int xe_mmio_write32_and_verify(struct xe_gt *gt, + struct xe_reg reg, u32 val, + u32 mask, u32 eval) +{ + u32 reg_val; + + xe_mmio_write32(gt, reg, val); + reg_val = xe_mmio_read32(gt, reg); + + return (reg_val & mask) != eval ? 
-EINVAL : 0; +} + +static inline bool xe_mmio_in_range(const struct xe_gt *gt, + const struct xe_mmio_range *range, + struct xe_reg reg) +{ + if (reg.addr < gt->mmio.adj_limit) + reg.addr += gt->mmio.adj_offset; + + return range && reg.addr >= range->start && reg.addr <= range->end; +} + +int xe_mmio_probe_vram(struct xe_device *xe); +u64 xe_mmio_read64_2x32(struct xe_gt *gt, struct xe_reg reg); +int xe_mmio_wait32(struct xe_gt *gt, struct xe_reg reg, u32 mask, u32 val, u32 timeout_us, + u32 *out_val, bool atomic); + +#endif diff --git a/drivers/gpu/drm/xe/xe_mocs.c b/drivers/gpu/drm/xe/xe_mocs.c new file mode 100644 index 000000000000..ef79552e4f2f --- /dev/null +++ b/drivers/gpu/drm/xe/xe_mocs.c @@ -0,0 +1,580 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_mocs.h" + +#include "regs/xe_gt_regs.h" +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_exec_queue.h" +#include "xe_gt.h" +#include "xe_gt_mcr.h" +#include "xe_mmio.h" +#include "xe_platform_types.h" +#include "xe_step_types.h" + +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG) +#define mocs_dbg drm_dbg +#else +__printf(2, 3) +static inline void mocs_dbg(const struct drm_device *dev, + const char *format, ...) +{ /* noop */ } +#endif + +enum { + HAS_GLOBAL_MOCS = BIT(0), + HAS_LNCF_MOCS = BIT(1), +}; + +struct xe_mocs_entry { + u32 control_value; + u16 l3cc_value; + u16 used; +}; + +struct xe_mocs_info { + unsigned int size; + unsigned int n_entries; + const struct xe_mocs_entry *table; + u8 uc_index; + u8 wb_index; + u8 unused_entries_index; +}; + +/* Defines for the tables (XXX_MOCS_0 - XXX_MOCS_63) */ +#define _LE_CACHEABILITY(value) ((value) << 0) +#define _LE_TGT_CACHE(value) ((value) << 2) +#define LE_LRUM(value) ((value) << 4) +#define LE_AOM(value) ((value) << 6) +#define LE_RSC(value) ((value) << 7) +#define LE_SCC(value) ((value) << 8) +#define LE_PFM(value) ((value) << 11) +#define LE_SCF(value) ((value) << 14) +#define LE_COS(value) ((value) << 15) +#define LE_SSE(value) ((value) << 17) + +/* Defines for the tables (LNCFMOCS0 - LNCFMOCS31) - two entries per word */ +#define L3_ESC(value) ((value) << 0) +#define L3_SCC(value) ((value) << 1) +#define _L3_CACHEABILITY(value) ((value) << 4) +#define L3_GLBGO(value) ((value) << 6) +#define L3_LKUP(value) ((value) << 7) + +/* Defines for the tables (GLOB_MOCS_0 - GLOB_MOCS_16) */ +#define IG_PAT REG_BIT(8) +#define L3_CACHE_POLICY_MASK REG_GENMASK(5, 4) +#define L4_CACHE_POLICY_MASK REG_GENMASK(3, 2) + +/* Helper defines */ +#define XELP_NUM_MOCS_ENTRIES 64 /* 63-64 are reserved, but configured. 
*/ +#define PVC_NUM_MOCS_ENTRIES 3 +#define MTL_NUM_MOCS_ENTRIES 16 +#define XE2_NUM_MOCS_ENTRIES 16 + +/* (e)LLC caching options */ +/* + * Note: LE_0_PAGETABLE works only up to Gen11; for newer gens it means + * the same as LE_UC + */ +#define LE_0_PAGETABLE _LE_CACHEABILITY(0) +#define LE_1_UC _LE_CACHEABILITY(1) +#define LE_2_WT _LE_CACHEABILITY(2) +#define LE_3_WB _LE_CACHEABILITY(3) + +/* Target cache */ +#define LE_TC_0_PAGETABLE _LE_TGT_CACHE(0) +#define LE_TC_1_LLC _LE_TGT_CACHE(1) +#define LE_TC_2_LLC_ELLC _LE_TGT_CACHE(2) +#define LE_TC_3_LLC_ELLC_ALT _LE_TGT_CACHE(3) + +/* L3 caching options */ +#define L3_0_DIRECT _L3_CACHEABILITY(0) +#define L3_1_UC _L3_CACHEABILITY(1) +#define L3_2_RESERVED _L3_CACHEABILITY(2) +#define L3_3_WB _L3_CACHEABILITY(3) + +/* L4 caching options */ +#define L4_0_WB REG_FIELD_PREP(L4_CACHE_POLICY_MASK, 0) +#define L4_1_WT REG_FIELD_PREP(L4_CACHE_POLICY_MASK, 1) +#define L4_3_UC REG_FIELD_PREP(L4_CACHE_POLICY_MASK, 3) + +#define XE2_L3_0_WB REG_FIELD_PREP(L3_CACHE_POLICY_MASK, 0) +/* XD: WB Transient Display */ +#define XE2_L3_1_XD REG_FIELD_PREP(L3_CACHE_POLICY_MASK, 1) +#define XE2_L3_3_UC REG_FIELD_PREP(L3_CACHE_POLICY_MASK, 3) + +#define MOCS_ENTRY(__idx, __control_value, __l3cc_value) \ + [__idx] = { \ + .control_value = __control_value, \ + .l3cc_value = __l3cc_value, \ + .used = 1, \ + } + +/* + * MOCS tables + * + * These are the MOCS tables that are programmed across all the rings. + * The control value is programmed to all the rings that support the + * MOCS registers. While the l3cc_values are only programmed to the + * LNCFCMOCS0 - LNCFCMOCS32 registers. + * + * These tables are intended to be kept reasonably consistent across + * HW platforms, and for ICL+, be identical across OSes. To achieve + * that, the list of entries is published as part of bspec. + * + * Entries not part of the following tables are undefined as far as userspace is + * concerned and shouldn't be relied upon. The last few entries are reserved by + * the hardware. They should be initialized according to bspec and never used. + * + * NOTE1: These tables are part of bspec and defined as part of the hardware + * interface. It is expected that, for specific hardware platform, existing + * entries will remain constant and the table will only be updated by adding new + * entries, filling unused positions. + * + * NOTE2: Reserved and unspecified MOCS indices have been set to L3 WB. These + * reserved entries should never be used. They may be changed to low performant + * variants with better coherency in the future if more entries are needed. + */ + +static const struct xe_mocs_entry gen12_mocs_desc[] = { + /* Base - L3 + LLC */ + MOCS_ENTRY(2, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), + L3_3_WB), + /* Base - Uncached */ + MOCS_ENTRY(3, + LE_1_UC | LE_TC_1_LLC, + L3_1_UC), + /* Base - L3 */ + MOCS_ENTRY(4, + LE_1_UC | LE_TC_1_LLC, + L3_3_WB), + /* Base - LLC */ + MOCS_ENTRY(5, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), + L3_1_UC), + /* Age 0 - LLC */ + MOCS_ENTRY(6, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(1), + L3_1_UC), + /* Age 0 - L3 + LLC */ + MOCS_ENTRY(7, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(1), + L3_3_WB), + /* Age: Don't Chg. - LLC */ + MOCS_ENTRY(8, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(2), + L3_1_UC), + /* Age: Don't Chg. 
- L3 + LLC */ + MOCS_ENTRY(9, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(2), + L3_3_WB), + /* No AOM - LLC */ + MOCS_ENTRY(10, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_AOM(1), + L3_1_UC), + /* No AOM - L3 + LLC */ + MOCS_ENTRY(11, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_AOM(1), + L3_3_WB), + /* No AOM; Age 0 - LLC */ + MOCS_ENTRY(12, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(1) | LE_AOM(1), + L3_1_UC), + /* No AOM; Age 0 - L3 + LLC */ + MOCS_ENTRY(13, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(1) | LE_AOM(1), + L3_3_WB), + /* No AOM; Age:DC - LLC */ + MOCS_ENTRY(14, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(2) | LE_AOM(1), + L3_1_UC), + /* No AOM; Age:DC - L3 + LLC */ + MOCS_ENTRY(15, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(2) | LE_AOM(1), + L3_3_WB), + /* Self-Snoop - L3 + LLC */ + MOCS_ENTRY(18, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_SSE(3), + L3_3_WB), + /* Skip Caching - L3 + LLC(12.5%) */ + MOCS_ENTRY(19, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_SCC(7), + L3_3_WB), + /* Skip Caching - L3 + LLC(25%) */ + MOCS_ENTRY(20, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_SCC(3), + L3_3_WB), + /* Skip Caching - L3 + LLC(50%) */ + MOCS_ENTRY(21, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_SCC(1), + L3_3_WB), + /* Skip Caching - L3 + LLC(75%) */ + MOCS_ENTRY(22, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_RSC(1) | LE_SCC(3), + L3_3_WB), + /* Skip Caching - L3 + LLC(87.5%) */ + MOCS_ENTRY(23, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_RSC(1) | LE_SCC(7), + L3_3_WB), + /* Implicitly enable L1 - HDC:L1 + L3 + LLC */ + MOCS_ENTRY(48, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), + L3_3_WB), + /* Implicitly enable L1 - HDC:L1 + L3 */ + MOCS_ENTRY(49, + LE_1_UC | LE_TC_1_LLC, + L3_3_WB), + /* Implicitly enable L1 - HDC:L1 + LLC */ + MOCS_ENTRY(50, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), + L3_1_UC), + /* Implicitly enable L1 - HDC:L1 */ + MOCS_ENTRY(51, + LE_1_UC | LE_TC_1_LLC, + L3_1_UC), + /* HW Special Case (CCS) */ + MOCS_ENTRY(60, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), + L3_1_UC), + /* HW Special Case (Displayable) */ + MOCS_ENTRY(61, + LE_1_UC | LE_TC_1_LLC, + L3_3_WB), + /* HW Reserved - SW program but never use */ + MOCS_ENTRY(62, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), + L3_1_UC), + /* HW Reserved - SW program but never use */ + MOCS_ENTRY(63, + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), + L3_1_UC) +}; + +static const struct xe_mocs_entry dg1_mocs_desc[] = { + /* UC */ + MOCS_ENTRY(1, 0, L3_1_UC), + /* WB - L3 */ + MOCS_ENTRY(5, 0, L3_3_WB), + /* WB - L3 50% */ + MOCS_ENTRY(6, 0, L3_ESC(1) | L3_SCC(1) | L3_3_WB), + /* WB - L3 25% */ + MOCS_ENTRY(7, 0, L3_ESC(1) | L3_SCC(3) | L3_3_WB), + /* WB - L3 12.5% */ + MOCS_ENTRY(8, 0, L3_ESC(1) | L3_SCC(7) | L3_3_WB), + + /* HDC:L1 + L3 */ + MOCS_ENTRY(48, 0, L3_3_WB), + /* HDC:L1 */ + MOCS_ENTRY(49, 0, L3_1_UC), + + /* HW Reserved */ + MOCS_ENTRY(60, 0, L3_1_UC), + MOCS_ENTRY(61, 0, L3_1_UC), + MOCS_ENTRY(62, 0, L3_1_UC), + MOCS_ENTRY(63, 0, L3_1_UC), +}; + +static const struct xe_mocs_entry dg2_mocs_desc[] = { + /* UC - Coherent; GO:L3 */ + MOCS_ENTRY(0, 0, L3_1_UC | L3_LKUP(1)), + /* UC - Coherent; GO:Memory */ + MOCS_ENTRY(1, 0, L3_1_UC | L3_GLBGO(1) | L3_LKUP(1)), + /* UC - Non-Coherent; GO:Memory */ + MOCS_ENTRY(2, 0, L3_1_UC | L3_GLBGO(1)), + + /* WB - LC */ + MOCS_ENTRY(3, 0, L3_3_WB | L3_LKUP(1)), +}; + +static const struct xe_mocs_entry dg2_mocs_desc_g10_ax[] = { + /* Wa_14011441408: Set Go to Memory for MOCS#0 */ + MOCS_ENTRY(0, 0, L3_1_UC | L3_GLBGO(1) | L3_LKUP(1)), + /* UC - Coherent; GO:Memory */ + MOCS_ENTRY(1, 0, L3_1_UC | L3_GLBGO(1) | L3_LKUP(1)), + /* UC - Non-Coherent; GO:Memory */ + 
MOCS_ENTRY(2, 0, L3_1_UC | L3_GLBGO(1)), + + /* WB - LC */ + MOCS_ENTRY(3, 0, L3_3_WB | L3_LKUP(1)), +}; + +static const struct xe_mocs_entry pvc_mocs_desc[] = { + /* Error */ + MOCS_ENTRY(0, 0, L3_3_WB), + + /* UC */ + MOCS_ENTRY(1, 0, L3_1_UC), + + /* WB */ + MOCS_ENTRY(2, 0, L3_3_WB), +}; + +static const struct xe_mocs_entry mtl_mocs_desc[] = { + /* Error - Reserved for Non-Use */ + MOCS_ENTRY(0, + 0, + L3_LKUP(1) | L3_3_WB), + /* Cached - L3 + L4 */ + MOCS_ENTRY(1, + IG_PAT, + L3_LKUP(1) | L3_3_WB), + /* L4 - GO:L3 */ + MOCS_ENTRY(2, + IG_PAT, + L3_LKUP(1) | L3_1_UC), + /* Uncached - GO:L3 */ + MOCS_ENTRY(3, + IG_PAT | L4_3_UC, + L3_LKUP(1) | L3_1_UC), + /* L4 - GO:Mem */ + MOCS_ENTRY(4, + IG_PAT, + L3_LKUP(1) | L3_GLBGO(1) | L3_1_UC), + /* Uncached - GO:Mem */ + MOCS_ENTRY(5, + IG_PAT | L4_3_UC, + L3_LKUP(1) | L3_GLBGO(1) | L3_1_UC), + /* L4 - L3:NoLKUP; GO:L3 */ + MOCS_ENTRY(6, + IG_PAT, + L3_1_UC), + /* Uncached - L3:NoLKUP; GO:L3 */ + MOCS_ENTRY(7, + IG_PAT | L4_3_UC, + L3_1_UC), + /* L4 - L3:NoLKUP; GO:Mem */ + MOCS_ENTRY(8, + IG_PAT, + L3_GLBGO(1) | L3_1_UC), + /* Uncached - L3:NoLKUP; GO:Mem */ + MOCS_ENTRY(9, + IG_PAT | L4_3_UC, + L3_GLBGO(1) | L3_1_UC), + /* Display - L3; L4:WT */ + MOCS_ENTRY(14, + IG_PAT | L4_1_WT, + L3_LKUP(1) | L3_3_WB), + /* CCS - Non-Displayable */ + MOCS_ENTRY(15, + IG_PAT, + L3_GLBGO(1) | L3_1_UC), +}; + +static const struct xe_mocs_entry xe2_mocs_table[] = { + /* Defer to PAT */ + MOCS_ENTRY(0, XE2_L3_0_WB | L4_3_UC, 0), + /* Cached L3, Uncached L4 */ + MOCS_ENTRY(1, IG_PAT | XE2_L3_0_WB | L4_3_UC, 0), + /* Uncached L3, Cached L4 */ + MOCS_ENTRY(2, IG_PAT | XE2_L3_3_UC | L4_0_WB, 0), + /* Uncached L3 + L4 */ + MOCS_ENTRY(3, IG_PAT | XE2_L3_3_UC | L4_3_UC, 0), + /* Cached L3 + L4 */ + MOCS_ENTRY(4, IG_PAT | XE2_L3_0_WB | L4_0_WB, 0), +}; + +static unsigned int get_mocs_settings(struct xe_device *xe, + struct xe_mocs_info *info) +{ + unsigned int flags = 0; + + memset(info, 0, sizeof(struct xe_mocs_info)); + + switch (xe->info.platform) { + case XE_LUNARLAKE: + info->size = ARRAY_SIZE(xe2_mocs_table); + info->table = xe2_mocs_table; + info->n_entries = XE2_NUM_MOCS_ENTRIES; + info->uc_index = 3; + info->wb_index = 4; + info->unused_entries_index = 4; + break; + case XE_PVC: + info->size = ARRAY_SIZE(pvc_mocs_desc); + info->table = pvc_mocs_desc; + info->n_entries = PVC_NUM_MOCS_ENTRIES; + info->uc_index = 1; + info->wb_index = 2; + info->unused_entries_index = 2; + break; + case XE_METEORLAKE: + info->size = ARRAY_SIZE(mtl_mocs_desc); + info->table = mtl_mocs_desc; + info->n_entries = MTL_NUM_MOCS_ENTRIES; + info->uc_index = 9; + info->unused_entries_index = 1; + break; + case XE_DG2: + if (xe->info.subplatform == XE_SUBPLATFORM_DG2_G10 && + xe->info.step.graphics >= STEP_A0 && + xe->info.step.graphics <= STEP_B0) { + info->size = ARRAY_SIZE(dg2_mocs_desc_g10_ax); + info->table = dg2_mocs_desc_g10_ax; + } else { + info->size = ARRAY_SIZE(dg2_mocs_desc); + info->table = dg2_mocs_desc; + } + info->uc_index = 1; + info->n_entries = XELP_NUM_MOCS_ENTRIES; + info->unused_entries_index = 3; + break; + case XE_DG1: + info->size = ARRAY_SIZE(dg1_mocs_desc); + info->table = dg1_mocs_desc; + info->uc_index = 1; + info->n_entries = XELP_NUM_MOCS_ENTRIES; + info->unused_entries_index = 5; + break; + case XE_TIGERLAKE: + case XE_ROCKETLAKE: + case XE_ALDERLAKE_S: + case XE_ALDERLAKE_P: + case XE_ALDERLAKE_N: + info->size = ARRAY_SIZE(gen12_mocs_desc); + info->table = gen12_mocs_desc; + info->n_entries = XELP_NUM_MOCS_ENTRIES; + info->uc_index = 3; + 
info->unused_entries_index = 2; + break; + default: + drm_err(&xe->drm, "Platform that should have a MOCS table does not.\n"); + return 0; + } + + /* + * Index 0 is a reserved/unused table entry on most platforms, but + * even on those where it does represent a legitimate MOCS entry, it + * never represents the "most cached, least coherent" behavior we want + * to populate undefined table rows with. So if unused_entries_index + * is still 0 at this point, we'll assume that it was omitted by + * mistake in the switch statement above. + */ + xe_assert(xe, info->unused_entries_index != 0); + + if (XE_WARN_ON(info->size > info->n_entries)) { + info->table = NULL; + return 0; + } + + if (!IS_DGFX(xe) || GRAPHICS_VER(xe) >= 20) + flags |= HAS_GLOBAL_MOCS; + if (GRAPHICS_VER(xe) < 20) + flags |= HAS_LNCF_MOCS; + + return flags; +} + +/* + * Get control_value from MOCS entry. If the table entry is not defined, the + * settings from unused_entries_index will be returned. + */ +static u32 get_entry_control(const struct xe_mocs_info *info, + unsigned int index) +{ + if (index < info->size && info->table[index].used) + return info->table[index].control_value; + return info->table[info->unused_entries_index].control_value; +} + +static void __init_mocs_table(struct xe_gt *gt, + const struct xe_mocs_info *info) +{ + struct xe_device *xe = gt_to_xe(gt); + + unsigned int i; + u32 mocs; + + mocs_dbg(>_to_xe(gt)->drm, "entries:%d\n", info->n_entries); + drm_WARN_ONCE(&xe->drm, !info->unused_entries_index, + "Unused entries index should have been defined\n"); + for (i = 0; + i < info->n_entries ? (mocs = get_entry_control(info, i)), 1 : 0; + i++) { + mocs_dbg(>_to_xe(gt)->drm, "GLOB_MOCS[%d] 0x%x 0x%x\n", i, + XELP_GLOBAL_MOCS(i).addr, mocs); + + if (GRAPHICS_VERx100(gt_to_xe(gt)) > 1250) + xe_gt_mcr_multicast_write(gt, XEHP_GLOBAL_MOCS(i), mocs); + else + xe_mmio_write32(gt, XELP_GLOBAL_MOCS(i), mocs); + } +} + +/* + * Get l3cc_value from MOCS entry taking into account when it's not used + * then if unused_entries_index is not zero then its value will be returned + * otherwise I915_MOCS_PTE's value is returned in this case. + */ +static u16 get_entry_l3cc(const struct xe_mocs_info *info, + unsigned int index) +{ + if (index < info->size && info->table[index].used) + return info->table[index].l3cc_value; + return info->table[info->unused_entries_index].l3cc_value; +} + +static u32 l3cc_combine(u16 low, u16 high) +{ + return low | (u32)high << 16; +} + +static void init_l3cc_table(struct xe_gt *gt, + const struct xe_mocs_info *info) +{ + unsigned int i; + u32 l3cc; + + mocs_dbg(>_to_xe(gt)->drm, "entries:%d\n", info->n_entries); + for (i = 0; + i < (info->n_entries + 1) / 2 ? + (l3cc = l3cc_combine(get_entry_l3cc(info, 2 * i), + get_entry_l3cc(info, 2 * i + 1))), 1 : 0; + i++) { + mocs_dbg(>_to_xe(gt)->drm, "LNCFCMOCS[%d] 0x%x 0x%x\n", i, XELP_LNCFCMOCS(i).addr, + l3cc); + + if (GRAPHICS_VERx100(gt_to_xe(gt)) >= 1250) + xe_gt_mcr_multicast_write(gt, XEHP_LNCFCMOCS(i), l3cc); + else + xe_mmio_write32(gt, XELP_LNCFCMOCS(i), l3cc); + } +} + +void xe_mocs_init_early(struct xe_gt *gt) +{ + struct xe_mocs_info table; + + get_mocs_settings(gt_to_xe(gt), &table); + gt->mocs.uc_index = table.uc_index; + gt->mocs.wb_index = table.wb_index; +} + +void xe_mocs_init(struct xe_gt *gt) +{ + struct xe_mocs_info table; + unsigned int flags; + + /* + * MOCS settings are split between "GLOB_MOCS" and/or "LNCFCMOCS" + * registers depending on platform. 
+ * + * These registers should be programmed before GuC initialization + * since their values will affect some of the memory transactions + * performed by the GuC. + */ + flags = get_mocs_settings(gt_to_xe(gt), &table); + mocs_dbg(>_to_xe(gt)->drm, "flag:0x%x\n", flags); + + if (flags & HAS_GLOBAL_MOCS) + __init_mocs_table(gt, &table); + if (flags & HAS_LNCF_MOCS) + init_l3cc_table(gt, &table); +} + +#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) +#include "tests/xe_mocs.c" +#endif diff --git a/drivers/gpu/drm/xe/xe_mocs.h b/drivers/gpu/drm/xe/xe_mocs.h new file mode 100644 index 000000000000..053754c5a94e --- /dev/null +++ b/drivers/gpu/drm/xe/xe_mocs.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_MOCS_H_ +#define _XE_MOCS_H_ + +#include <linux/types.h> + +struct xe_exec_queue; +struct xe_gt; + +void xe_mocs_init_early(struct xe_gt *gt); +void xe_mocs_init(struct xe_gt *gt); + +#endif diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c new file mode 100644 index 000000000000..110b69864656 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_module.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include "xe_module.h" + +#include <linux/init.h> +#include <linux/module.h> + +#include "xe_drv.h" +#include "xe_hw_fence.h" +#include "xe_pci.h" +#include "xe_sched_job.h" + +struct xe_modparam xe_modparam = { + .enable_display = true, + .guc_log_level = 5, + .force_probe = CONFIG_DRM_XE_FORCE_PROBE, + /* the rest are 0 by default */ +}; + +module_param_named_unsafe(force_execlist, xe_modparam.force_execlist, bool, 0444); +MODULE_PARM_DESC(force_execlist, "Force Execlist submission"); + +module_param_named(enable_display, xe_modparam.enable_display, bool, 0444); +MODULE_PARM_DESC(enable_display, "Enable display"); + +module_param_named(vram_bar_size, xe_modparam.force_vram_bar_size, uint, 0600); +MODULE_PARM_DESC(vram_bar_size, "Set the vram bar size(in MiB)"); + +module_param_named(guc_log_level, xe_modparam.guc_log_level, int, 0600); +MODULE_PARM_DESC(guc_log_level, "GuC firmware logging level (0=disable, 1..5=enable with verbosity min..max)"); + +module_param_named_unsafe(guc_firmware_path, xe_modparam.guc_firmware_path, charp, 0400); +MODULE_PARM_DESC(guc_firmware_path, + "GuC firmware path to use instead of the default one"); + +module_param_named_unsafe(huc_firmware_path, xe_modparam.huc_firmware_path, charp, 0400); +MODULE_PARM_DESC(huc_firmware_path, + "HuC firmware path to use instead of the default one - empty string disables"); + +module_param_named_unsafe(gsc_firmware_path, xe_modparam.gsc_firmware_path, charp, 0400); +MODULE_PARM_DESC(gsc_firmware_path, + "GSC firmware path to use instead of the default one - empty string disables"); + +module_param_named_unsafe(force_probe, xe_modparam.force_probe, charp, 0400); +MODULE_PARM_DESC(force_probe, + "Force probe options for specified devices. 
See CONFIG_DRM_XE_FORCE_PROBE for details."); + +struct init_funcs { + int (*init)(void); + void (*exit)(void); +}; + +static const struct init_funcs init_funcs[] = { + { + .init = xe_hw_fence_module_init, + .exit = xe_hw_fence_module_exit, + }, + { + .init = xe_sched_job_module_init, + .exit = xe_sched_job_module_exit, + }, + { + .init = xe_register_pci_driver, + .exit = xe_unregister_pci_driver, + }, +}; + +static int __init xe_init(void) +{ + int err, i; + + for (i = 0; i < ARRAY_SIZE(init_funcs); i++) { + err = init_funcs[i].init(); + if (err) { + while (i--) + init_funcs[i].exit(); + return err; + } + } + + return 0; +} + +static void __exit xe_exit(void) +{ + int i; + + for (i = ARRAY_SIZE(init_funcs) - 1; i >= 0; i--) + init_funcs[i].exit(); +} + +module_init(xe_init); +module_exit(xe_exit); + +MODULE_AUTHOR("Intel Corporation"); + +MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_LICENSE("GPL and additional rights"); diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h new file mode 100644 index 000000000000..88ef0e8b2bfd --- /dev/null +++ b/drivers/gpu/drm/xe/xe_module.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_MODULE_H_ +#define _XE_MODULE_H_ + +#include <linux/types.h> + +/* Module modprobe variables */ +struct xe_modparam { + bool force_execlist; + bool enable_display; + u32 force_vram_bar_size; + int guc_log_level; + char *guc_firmware_path; + char *huc_firmware_path; + char *gsc_firmware_path; + char *force_probe; +}; + +extern struct xe_modparam xe_modparam; + +#endif + diff --git a/drivers/gpu/drm/xe/xe_pat.c b/drivers/gpu/drm/xe/xe_pat.c new file mode 100644 index 000000000000..1ff6bc79e7d4 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_pat.c @@ -0,0 +1,459 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include "xe_pat.h" + +#include <drm/xe_drm.h> + +#include "regs/xe_reg_defs.h" +#include "xe_assert.h" +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_gt_mcr.h" +#include "xe_mmio.h" + +#define _PAT_ATS 0x47fc +#define _PAT_INDEX(index) _PICK_EVEN_2RANGES(index, 8, \ + 0x4800, 0x4804, \ + 0x4848, 0x484c) +#define _PAT_PTA 0x4820 + +#define XE2_NO_PROMOTE REG_BIT(10) +#define XE2_COMP_EN REG_BIT(9) +#define XE2_L3_CLOS REG_GENMASK(7, 6) +#define XE2_L3_POLICY REG_GENMASK(5, 4) +#define XE2_L4_POLICY REG_GENMASK(3, 2) +#define XE2_COH_MODE REG_GENMASK(1, 0) + +#define XELPG_L4_POLICY_MASK REG_GENMASK(3, 2) +#define XELPG_PAT_3_UC REG_FIELD_PREP(XELPG_L4_POLICY_MASK, 3) +#define XELPG_PAT_1_WT REG_FIELD_PREP(XELPG_L4_POLICY_MASK, 1) +#define XELPG_PAT_0_WB REG_FIELD_PREP(XELPG_L4_POLICY_MASK, 0) +#define XELPG_INDEX_COH_MODE_MASK REG_GENMASK(1, 0) +#define XELPG_3_COH_2W REG_FIELD_PREP(XELPG_INDEX_COH_MODE_MASK, 3) +#define XELPG_2_COH_1W REG_FIELD_PREP(XELPG_INDEX_COH_MODE_MASK, 2) +#define XELPG_0_COH_NON REG_FIELD_PREP(XELPG_INDEX_COH_MODE_MASK, 0) + +#define XEHPC_CLOS_LEVEL_MASK REG_GENMASK(3, 2) +#define XEHPC_PAT_CLOS(x) REG_FIELD_PREP(XEHPC_CLOS_LEVEL_MASK, x) + +#define XELP_MEM_TYPE_MASK REG_GENMASK(1, 0) +#define XELP_PAT_WB REG_FIELD_PREP(XELP_MEM_TYPE_MASK, 3) +#define XELP_PAT_WT REG_FIELD_PREP(XELP_MEM_TYPE_MASK, 2) +#define XELP_PAT_WC REG_FIELD_PREP(XELP_MEM_TYPE_MASK, 1) +#define XELP_PAT_UC REG_FIELD_PREP(XELP_MEM_TYPE_MASK, 0) + +static const char *XELP_MEM_TYPE_STR_MAP[] = { "UC", "WC", "WT", "WB" }; + +struct xe_pat_ops { + void (*program_graphics)(struct xe_gt *gt, const struct xe_pat_table_entry table[], + int 
n_entries); + void (*program_media)(struct xe_gt *gt, const struct xe_pat_table_entry table[], + int n_entries); + void (*dump)(struct xe_gt *gt, struct drm_printer *p); +}; + +static const struct xe_pat_table_entry xelp_pat_table[] = { + [0] = { XELP_PAT_WB, XE_COH_AT_LEAST_1WAY }, + [1] = { XELP_PAT_WC, XE_COH_NONE }, + [2] = { XELP_PAT_WT, XE_COH_NONE }, + [3] = { XELP_PAT_UC, XE_COH_NONE }, +}; + +static const struct xe_pat_table_entry xehpc_pat_table[] = { + [0] = { XELP_PAT_UC, XE_COH_NONE }, + [1] = { XELP_PAT_WC, XE_COH_NONE }, + [2] = { XELP_PAT_WT, XE_COH_NONE }, + [3] = { XELP_PAT_WB, XE_COH_AT_LEAST_1WAY }, + [4] = { XEHPC_PAT_CLOS(1) | XELP_PAT_WT, XE_COH_NONE }, + [5] = { XEHPC_PAT_CLOS(1) | XELP_PAT_WB, XE_COH_AT_LEAST_1WAY }, + [6] = { XEHPC_PAT_CLOS(2) | XELP_PAT_WT, XE_COH_NONE }, + [7] = { XEHPC_PAT_CLOS(2) | XELP_PAT_WB, XE_COH_AT_LEAST_1WAY }, +}; + +static const struct xe_pat_table_entry xelpg_pat_table[] = { + [0] = { XELPG_PAT_0_WB, XE_COH_NONE }, + [1] = { XELPG_PAT_1_WT, XE_COH_NONE }, + [2] = { XELPG_PAT_3_UC, XE_COH_NONE }, + [3] = { XELPG_PAT_0_WB | XELPG_2_COH_1W, XE_COH_AT_LEAST_1WAY }, + [4] = { XELPG_PAT_0_WB | XELPG_3_COH_2W, XE_COH_AT_LEAST_1WAY }, +}; + +/* + * The Xe2 table is getting large/complicated so it's easier to review if + * provided in a form that exactly matches the bspec's formatting. The meaning + * of the fields here are: + * - no_promote: 0=promotable, 1=no promote + * - comp_en: 0=disable, 1=enable + * - l3clos: L3 class of service (0-3) + * - l3_policy: 0=WB, 1=XD ("WB - Transient Display"), 3=UC + * - l4_policy: 0=WB, 1=WT, 3=UC + * - coh_mode: 0=no snoop, 2=1-way coherent, 3=2-way coherent + * + * Reserved entries should be programmed with the maximum caching, minimum + * coherency (which matches an all-0's encoding), so we can just omit them + * in the table. + */ +#define XE2_PAT(no_promote, comp_en, l3clos, l3_policy, l4_policy, __coh_mode) \ + { \ + .value = (no_promote ? XE2_NO_PROMOTE : 0) | \ + (comp_en ? XE2_COMP_EN : 0) | \ + REG_FIELD_PREP(XE2_L3_CLOS, l3clos) | \ + REG_FIELD_PREP(XE2_L3_POLICY, l3_policy) | \ + REG_FIELD_PREP(XE2_L4_POLICY, l4_policy) | \ + REG_FIELD_PREP(XE2_COH_MODE, __coh_mode), \ + .coh_mode = __coh_mode ? 
XE_COH_AT_LEAST_1WAY : XE_COH_NONE \ + } + +static const struct xe_pat_table_entry xe2_pat_table[] = { + [ 0] = XE2_PAT( 0, 0, 0, 0, 3, 0 ), + [ 1] = XE2_PAT( 0, 0, 0, 0, 3, 2 ), + [ 2] = XE2_PAT( 0, 0, 0, 0, 3, 3 ), + [ 3] = XE2_PAT( 0, 0, 0, 3, 3, 0 ), + [ 4] = XE2_PAT( 0, 0, 0, 3, 0, 2 ), + [ 5] = XE2_PAT( 0, 0, 0, 3, 3, 2 ), + [ 6] = XE2_PAT( 1, 0, 0, 1, 3, 0 ), + [ 7] = XE2_PAT( 0, 0, 0, 3, 0, 3 ), + [ 8] = XE2_PAT( 0, 0, 0, 3, 0, 0 ), + [ 9] = XE2_PAT( 0, 1, 0, 0, 3, 0 ), + [10] = XE2_PAT( 0, 1, 0, 3, 0, 0 ), + [11] = XE2_PAT( 1, 1, 0, 1, 3, 0 ), + [12] = XE2_PAT( 0, 1, 0, 3, 3, 0 ), + [13] = XE2_PAT( 0, 0, 0, 0, 0, 0 ), + [14] = XE2_PAT( 0, 1, 0, 0, 0, 0 ), + [15] = XE2_PAT( 1, 1, 0, 1, 1, 0 ), + /* 16..19 are reserved; leave set to all 0's */ + [20] = XE2_PAT( 0, 0, 1, 0, 3, 0 ), + [21] = XE2_PAT( 0, 1, 1, 0, 3, 0 ), + [22] = XE2_PAT( 0, 0, 1, 0, 3, 2 ), + [23] = XE2_PAT( 0, 0, 1, 0, 3, 3 ), + [24] = XE2_PAT( 0, 0, 2, 0, 3, 0 ), + [25] = XE2_PAT( 0, 1, 2, 0, 3, 0 ), + [26] = XE2_PAT( 0, 0, 2, 0, 3, 2 ), + [27] = XE2_PAT( 0, 0, 2, 0, 3, 3 ), + [28] = XE2_PAT( 0, 0, 3, 0, 3, 0 ), + [29] = XE2_PAT( 0, 1, 3, 0, 3, 0 ), + [30] = XE2_PAT( 0, 0, 3, 0, 3, 2 ), + [31] = XE2_PAT( 0, 0, 3, 0, 3, 3 ), +}; + +/* Special PAT values programmed outside the main table */ +static const struct xe_pat_table_entry xe2_pat_ats = XE2_PAT( 0, 0, 0, 0, 3, 3 ); + +u16 xe_pat_index_get_coh_mode(struct xe_device *xe, u16 pat_index) +{ + WARN_ON(pat_index >= xe->pat.n_entries); + return xe->pat.table[pat_index].coh_mode; +} + +static void program_pat(struct xe_gt *gt, const struct xe_pat_table_entry table[], + int n_entries) +{ + for (int i = 0; i < n_entries; i++) { + struct xe_reg reg = XE_REG(_PAT_INDEX(i)); + + xe_mmio_write32(gt, reg, table[i].value); + } +} + +static void program_pat_mcr(struct xe_gt *gt, const struct xe_pat_table_entry table[], + int n_entries) +{ + for (int i = 0; i < n_entries; i++) { + struct xe_reg_mcr reg_mcr = XE_REG_MCR(_PAT_INDEX(i)); + + xe_gt_mcr_multicast_write(gt, reg_mcr, table[i].value); + } +} + +static void xelp_dump(struct xe_gt *gt, struct drm_printer *p) +{ + struct xe_device *xe = gt_to_xe(gt); + int i, err; + + xe_device_mem_access_get(xe); + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + if (err) + goto err_fw; + + drm_printf(p, "PAT table:\n"); + + for (i = 0; i < xe->pat.n_entries; i++) { + u32 pat = xe_mmio_read32(gt, XE_REG(_PAT_INDEX(i))); + u8 mem_type = REG_FIELD_GET(XELP_MEM_TYPE_MASK, pat); + + drm_printf(p, "PAT[%2d] = %s (%#8x)\n", i, + XELP_MEM_TYPE_STR_MAP[mem_type], pat); + } + + err = xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); +err_fw: + xe_assert(xe, !err); + xe_device_mem_access_put(xe); +} + +static const struct xe_pat_ops xelp_pat_ops = { + .program_graphics = program_pat, + .dump = xelp_dump, +}; + +static void xehp_dump(struct xe_gt *gt, struct drm_printer *p) +{ + struct xe_device *xe = gt_to_xe(gt); + int i, err; + + xe_device_mem_access_get(xe); + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + if (err) + goto err_fw; + + drm_printf(p, "PAT table:\n"); + + for (i = 0; i < xe->pat.n_entries; i++) { + u32 pat = xe_gt_mcr_unicast_read_any(gt, XE_REG_MCR(_PAT_INDEX(i))); + u8 mem_type; + + mem_type = REG_FIELD_GET(XELP_MEM_TYPE_MASK, pat); + + drm_printf(p, "PAT[%2d] = %s (%#8x)\n", i, + XELP_MEM_TYPE_STR_MAP[mem_type], pat); + } + + err = xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); +err_fw: + xe_assert(xe, !err); + xe_device_mem_access_put(xe); +} + +static const struct xe_pat_ops xehp_pat_ops = { + .program_graphics = program_pat_mcr, + 
.dump = xehp_dump, +}; + +static void xehpc_dump(struct xe_gt *gt, struct drm_printer *p) +{ + struct xe_device *xe = gt_to_xe(gt); + int i, err; + + xe_device_mem_access_get(xe); + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + if (err) + goto err_fw; + + drm_printf(p, "PAT table:\n"); + + for (i = 0; i < xe->pat.n_entries; i++) { + u32 pat = xe_gt_mcr_unicast_read_any(gt, XE_REG_MCR(_PAT_INDEX(i))); + + drm_printf(p, "PAT[%2d] = [ %u, %u ] (%#8x)\n", i, + REG_FIELD_GET(XELP_MEM_TYPE_MASK, pat), + REG_FIELD_GET(XEHPC_CLOS_LEVEL_MASK, pat), pat); + } + + err = xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); +err_fw: + xe_assert(xe, !err); + xe_device_mem_access_put(xe); +} + +static const struct xe_pat_ops xehpc_pat_ops = { + .program_graphics = program_pat_mcr, + .dump = xehpc_dump, +}; + +static void xelpg_dump(struct xe_gt *gt, struct drm_printer *p) +{ + struct xe_device *xe = gt_to_xe(gt); + int i, err; + + xe_device_mem_access_get(xe); + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + if (err) + goto err_fw; + + drm_printf(p, "PAT table:\n"); + + for (i = 0; i < xe->pat.n_entries; i++) { + u32 pat; + + if (xe_gt_is_media_type(gt)) + pat = xe_mmio_read32(gt, XE_REG(_PAT_INDEX(i))); + else + pat = xe_gt_mcr_unicast_read_any(gt, XE_REG_MCR(_PAT_INDEX(i))); + + drm_printf(p, "PAT[%2d] = [ %u, %u ] (%#8x)\n", i, + REG_FIELD_GET(XELPG_L4_POLICY_MASK, pat), + REG_FIELD_GET(XELPG_INDEX_COH_MODE_MASK, pat), pat); + } + + err = xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); +err_fw: + xe_assert(xe, !err); + xe_device_mem_access_put(xe); +} + +/* + * SAMedia register offsets are adjusted by the write methods and they target + * registers that are not MCR, while for normal GT they are MCR + */ +static const struct xe_pat_ops xelpg_pat_ops = { + .program_graphics = program_pat, + .program_media = program_pat_mcr, + .dump = xelpg_dump, +}; + +static void xe2lpg_program_pat(struct xe_gt *gt, const struct xe_pat_table_entry table[], + int n_entries) +{ + program_pat_mcr(gt, table, n_entries); + xe_gt_mcr_multicast_write(gt, XE_REG_MCR(_PAT_ATS), xe2_pat_ats.value); +} + +static void xe2lpm_program_pat(struct xe_gt *gt, const struct xe_pat_table_entry table[], + int n_entries) +{ + program_pat(gt, table, n_entries); + xe_mmio_write32(gt, XE_REG(_PAT_ATS), xe2_pat_ats.value); +} + +static void xe2_dump(struct xe_gt *gt, struct drm_printer *p) +{ + struct xe_device *xe = gt_to_xe(gt); + int i, err; + u32 pat; + + xe_device_mem_access_get(xe); + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); + if (err) + goto err_fw; + + drm_printf(p, "PAT table:\n"); + + for (i = 0; i < xe->pat.n_entries; i++) { + if (xe_gt_is_media_type(gt)) + pat = xe_mmio_read32(gt, XE_REG(_PAT_INDEX(i))); + else + pat = xe_gt_mcr_unicast_read_any(gt, XE_REG_MCR(_PAT_INDEX(i))); + + drm_printf(p, "PAT[%2d] = [ %u, %u, %u, %u, %u, %u ] (%#8x)\n", i, + !!(pat & XE2_NO_PROMOTE), + !!(pat & XE2_COMP_EN), + REG_FIELD_GET(XE2_L3_CLOS, pat), + REG_FIELD_GET(XE2_L3_POLICY, pat), + REG_FIELD_GET(XE2_L4_POLICY, pat), + REG_FIELD_GET(XE2_COH_MODE, pat), + pat); + } + + /* + * Also print PTA_MODE, which describes how the hardware accesses + * PPGTT entries. 
+ */ + if (xe_gt_is_media_type(gt)) + pat = xe_mmio_read32(gt, XE_REG(_PAT_PTA)); + else + pat = xe_gt_mcr_unicast_read_any(gt, XE_REG_MCR(_PAT_PTA)); + + drm_printf(p, "Page Table Access:\n"); + drm_printf(p, "PTA_MODE= [ %u, %u, %u, %u, %u, %u ] (%#8x)\n", + !!(pat & XE2_NO_PROMOTE), + !!(pat & XE2_COMP_EN), + REG_FIELD_GET(XE2_L3_CLOS, pat), + REG_FIELD_GET(XE2_L3_POLICY, pat), + REG_FIELD_GET(XE2_L4_POLICY, pat), + REG_FIELD_GET(XE2_COH_MODE, pat), + pat); + + err = xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); +err_fw: + xe_assert(xe, !err); + xe_device_mem_access_put(xe); +} + +static const struct xe_pat_ops xe2_pat_ops = { + .program_graphics = xe2lpg_program_pat, + .program_media = xe2lpm_program_pat, + .dump = xe2_dump, +}; + +void xe_pat_init_early(struct xe_device *xe) +{ + if (GRAPHICS_VER(xe) == 20) { + xe->pat.ops = &xe2_pat_ops; + xe->pat.table = xe2_pat_table; + xe->pat.n_entries = ARRAY_SIZE(xe2_pat_table); + xe->pat.idx[XE_CACHE_NONE] = 3; + xe->pat.idx[XE_CACHE_WT] = 15; + xe->pat.idx[XE_CACHE_WB] = 2; + xe->pat.idx[XE_CACHE_NONE_COMPRESSION] = 12; /*Applicable on xe2 and beyond */ + } else if (xe->info.platform == XE_METEORLAKE) { + xe->pat.ops = &xelpg_pat_ops; + xe->pat.table = xelpg_pat_table; + xe->pat.n_entries = ARRAY_SIZE(xelpg_pat_table); + xe->pat.idx[XE_CACHE_NONE] = 2; + xe->pat.idx[XE_CACHE_WT] = 1; + xe->pat.idx[XE_CACHE_WB] = 3; + } else if (xe->info.platform == XE_PVC) { + xe->pat.ops = &xehpc_pat_ops; + xe->pat.table = xehpc_pat_table; + xe->pat.n_entries = ARRAY_SIZE(xehpc_pat_table); + xe->pat.idx[XE_CACHE_NONE] = 0; + xe->pat.idx[XE_CACHE_WT] = 2; + xe->pat.idx[XE_CACHE_WB] = 3; + } else if (xe->info.platform == XE_DG2) { + /* + * Table is the same as previous platforms, but programming + * method has changed. + */ + xe->pat.ops = &xehp_pat_ops; + xe->pat.table = xelp_pat_table; + xe->pat.n_entries = ARRAY_SIZE(xelp_pat_table); + xe->pat.idx[XE_CACHE_NONE] = 3; + xe->pat.idx[XE_CACHE_WT] = 2; + xe->pat.idx[XE_CACHE_WB] = 0; + } else if (GRAPHICS_VERx100(xe) <= 1210) { + WARN_ON_ONCE(!IS_DGFX(xe) && !xe->info.has_llc); + xe->pat.ops = &xelp_pat_ops; + xe->pat.table = xelp_pat_table; + xe->pat.n_entries = ARRAY_SIZE(xelp_pat_table); + xe->pat.idx[XE_CACHE_NONE] = 3; + xe->pat.idx[XE_CACHE_WT] = 2; + xe->pat.idx[XE_CACHE_WB] = 0; + } else { + /* + * Going forward we expect to need new PAT settings for most + * new platforms; failure to provide a new table can easily + * lead to subtle, hard-to-debug problems. If none of the + * conditions above match the platform we're running on we'll + * raise an error rather than trying to silently inherit the + * most recent platform's behavior. 
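+ *
+ * Note that xe->pat.ops is left NULL in this case, so xe_pat_init()
+ * below turns into a no-op rather than programming a table we know
+ * nothing about.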
+ */ + drm_err(&xe->drm, "Missing PAT table for platform with graphics version %d.%02d!\n", + GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100); + } +} + +void xe_pat_init(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + + if (!xe->pat.ops) + return; + + if (xe_gt_is_media_type(gt)) + xe->pat.ops->program_media(gt, xe->pat.table, xe->pat.n_entries); + else + xe->pat.ops->program_graphics(gt, xe->pat.table, xe->pat.n_entries); +} + +void xe_pat_dump(struct xe_gt *gt, struct drm_printer *p) +{ + struct xe_device *xe = gt_to_xe(gt); + + if (!xe->pat.ops->dump) + return; + + xe->pat.ops->dump(gt, p); +} diff --git a/drivers/gpu/drm/xe/xe_pat.h b/drivers/gpu/drm/xe/xe_pat.h new file mode 100644 index 000000000000..fa0dfbe525cd --- /dev/null +++ b/drivers/gpu/drm/xe/xe_pat.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_PAT_H_ +#define _XE_PAT_H_ + +#include <linux/types.h> + +struct drm_printer; +struct xe_device; +struct xe_gt; + +/** + * struct xe_pat_table_entry - The pat_index encoding and other meta information. + */ +struct xe_pat_table_entry { + /** + * @value: The platform specific value encoding the various memory + * attributes (this maps to some fixed pat_index). So things like + * caching, coherency, compression etc can be encoded here. + */ + u32 value; + + /** + * @coh_mode: The GPU coherency mode that @value maps to. + */ +#define XE_COH_NONE 1 +#define XE_COH_AT_LEAST_1WAY 2 + u16 coh_mode; +}; + +/** + * xe_pat_init_early - SW initialization, setting up data based on device + * @xe: xe device + */ +void xe_pat_init_early(struct xe_device *xe); + +/** + * xe_pat_init - Program HW PAT table + * @gt: GT structure + */ +void xe_pat_init(struct xe_gt *gt); + +/** + * xe_pat_dump - Dump PAT table + * @gt: GT structure + * @p: Printer to dump info to + */ +void xe_pat_dump(struct xe_gt *gt, struct drm_printer *p); + +/** + * xe_pat_index_get_coh_mode - Extract the coherency mode for the given + * pat_index. 
+ * @xe: xe device + * @pat_index: The pat_index to query + */ +u16 xe_pat_index_get_coh_mode(struct xe_device *xe, u16 pat_index); + +#endif diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c new file mode 100644 index 000000000000..dcc5ded1558e --- /dev/null +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -0,0 +1,951 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include "xe_pci.h" + +#include <kunit/static_stub.h> +#include <linux/device/driver.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/pm_runtime.h> + +#include <drm/drm_color_mgmt.h> +#include <drm/drm_drv.h> +#include <drm/xe_pciids.h> + +#include "regs/xe_gt_regs.h" +#include "xe_device.h" +#include "xe_display.h" +#include "xe_drv.h" +#include "xe_gt.h" +#include "xe_macros.h" +#include "xe_mmio.h" +#include "xe_module.h" +#include "xe_pci_types.h" +#include "xe_pm.h" +#include "xe_sriov.h" +#include "xe_step.h" +#include "xe_tile.h" + +enum toggle_d3cold { + D3COLD_DISABLE, + D3COLD_ENABLE, +}; + +struct xe_subplatform_desc { + enum xe_subplatform subplatform; + const char *name; + const u16 *pciidlist; +}; + +struct xe_gt_desc { + enum xe_gt_type type; + u32 mmio_adj_limit; + u32 mmio_adj_offset; +}; + +struct xe_device_desc { + /* Should only ever be set for platforms without GMD_ID */ + const struct xe_graphics_desc *graphics; + /* Should only ever be set for platforms without GMD_ID */ + const struct xe_media_desc *media; + + const char *platform_name; + const struct xe_subplatform_desc *subplatforms; + + enum xe_platform platform; + + u8 require_force_probe:1; + u8 is_dgfx:1; + + u8 has_display:1; + u8 has_heci_gscfi:1; + u8 has_llc:1; + u8 has_mmio_ext:1; + u8 has_sriov:1; + u8 skip_guc_pc:1; + u8 skip_mtcfg:1; + u8 skip_pcode:1; +}; + +__diag_push(); +__diag_ignore_all("-Woverride-init", "Allow field overrides in table"); + +#define PLATFORM(x) \ + .platform = (x), \ + .platform_name = #x + +#define NOP(x) x + +static const struct xe_graphics_desc graphics_xelp = { + .name = "Xe_LP", + .ver = 12, + .rel = 0, + + .hw_engine_mask = BIT(XE_HW_ENGINE_RCS0) | BIT(XE_HW_ENGINE_BCS0), + + .dma_mask_size = 39, + .va_bits = 48, + .vm_max_level = 3, +}; + +static const struct xe_graphics_desc graphics_xelpp = { + .name = "Xe_LP+", + .ver = 12, + .rel = 10, + + .hw_engine_mask = BIT(XE_HW_ENGINE_RCS0) | BIT(XE_HW_ENGINE_BCS0), + + .dma_mask_size = 39, + .va_bits = 48, + .vm_max_level = 3, +}; + +#define XE_HP_FEATURES \ + .has_range_tlb_invalidation = true, \ + .has_flat_ccs = true, \ + .dma_mask_size = 46, \ + .va_bits = 48, \ + .vm_max_level = 3 + +static const struct xe_graphics_desc graphics_xehpg = { + .name = "Xe_HPG", + .ver = 12, + .rel = 55, + + .hw_engine_mask = + BIT(XE_HW_ENGINE_RCS0) | BIT(XE_HW_ENGINE_BCS0) | + BIT(XE_HW_ENGINE_CCS0) | BIT(XE_HW_ENGINE_CCS1) | + BIT(XE_HW_ENGINE_CCS2) | BIT(XE_HW_ENGINE_CCS3), + + XE_HP_FEATURES, + .vram_flags = XE_VRAM_FLAGS_NEED64K, +}; + +static const struct xe_graphics_desc graphics_xehpc = { + .name = "Xe_HPC", + .ver = 12, + .rel = 60, + + .hw_engine_mask = + BIT(XE_HW_ENGINE_BCS0) | BIT(XE_HW_ENGINE_BCS1) | + BIT(XE_HW_ENGINE_BCS2) | BIT(XE_HW_ENGINE_BCS3) | + BIT(XE_HW_ENGINE_BCS4) | BIT(XE_HW_ENGINE_BCS5) | + BIT(XE_HW_ENGINE_BCS6) | BIT(XE_HW_ENGINE_BCS7) | + BIT(XE_HW_ENGINE_BCS8) | + BIT(XE_HW_ENGINE_CCS0) | BIT(XE_HW_ENGINE_CCS1) | + BIT(XE_HW_ENGINE_CCS2) | BIT(XE_HW_ENGINE_CCS3), + + XE_HP_FEATURES, + .dma_mask_size = 52, + .max_remote_tiles = 1, + .va_bits = 57, + .vm_max_level = 4, + .vram_flags 
= XE_VRAM_FLAGS_NEED64K, + + .has_asid = 1, + .has_flat_ccs = 0, + .has_usm = 1, +}; + +static const struct xe_graphics_desc graphics_xelpg = { + .name = "Xe_LPG", + .hw_engine_mask = + BIT(XE_HW_ENGINE_RCS0) | BIT(XE_HW_ENGINE_BCS0) | + BIT(XE_HW_ENGINE_CCS0), + + XE_HP_FEATURES, + .has_flat_ccs = 0, +}; + +#define XE2_GFX_FEATURES \ + .dma_mask_size = 46, \ + .has_asid = 1, \ + .has_flat_ccs = 1, \ + .has_range_tlb_invalidation = 1, \ + .has_usm = 0 /* FIXME: implementation missing */, \ + .va_bits = 48, \ + .vm_max_level = 4, \ + .hw_engine_mask = \ + BIT(XE_HW_ENGINE_RCS0) | \ + BIT(XE_HW_ENGINE_BCS8) | BIT(XE_HW_ENGINE_BCS0) | \ + GENMASK(XE_HW_ENGINE_CCS3, XE_HW_ENGINE_CCS0) + +static const struct xe_graphics_desc graphics_xe2 = { + .name = "Xe2_LPG", + + XE2_GFX_FEATURES, +}; + +static const struct xe_media_desc media_xem = { + .name = "Xe_M", + .ver = 12, + .rel = 0, + + .hw_engine_mask = + BIT(XE_HW_ENGINE_VCS0) | BIT(XE_HW_ENGINE_VCS2) | + BIT(XE_HW_ENGINE_VECS0), +}; + +static const struct xe_media_desc media_xehpm = { + .name = "Xe_HPM", + .ver = 12, + .rel = 55, + + .hw_engine_mask = + BIT(XE_HW_ENGINE_VCS0) | BIT(XE_HW_ENGINE_VCS2) | + BIT(XE_HW_ENGINE_VECS0) | BIT(XE_HW_ENGINE_VECS1), +}; + +static const struct xe_media_desc media_xelpmp = { + .name = "Xe_LPM+", + .hw_engine_mask = + BIT(XE_HW_ENGINE_VCS0) | BIT(XE_HW_ENGINE_VCS2) | + BIT(XE_HW_ENGINE_VECS0) | BIT(XE_HW_ENGINE_GSCCS0) +}; + +static const struct xe_media_desc media_xe2 = { + .name = "Xe2_LPM", + .hw_engine_mask = + BIT(XE_HW_ENGINE_VCS0) | BIT(XE_HW_ENGINE_VECS0), /* TODO: GSC0 */ +}; + +static const struct xe_device_desc tgl_desc = { + .graphics = &graphics_xelp, + .media = &media_xem, + PLATFORM(XE_TIGERLAKE), + .has_display = true, + .has_llc = true, + .require_force_probe = true, +}; + +static const struct xe_device_desc rkl_desc = { + .graphics = &graphics_xelp, + .media = &media_xem, + PLATFORM(XE_ROCKETLAKE), + .has_display = true, + .has_llc = true, + .require_force_probe = true, +}; + +static const u16 adls_rpls_ids[] = { XE_RPLS_IDS(NOP), 0 }; + +static const struct xe_device_desc adl_s_desc = { + .graphics = &graphics_xelp, + .media = &media_xem, + PLATFORM(XE_ALDERLAKE_S), + .has_display = true, + .has_llc = true, + .require_force_probe = true, + .subplatforms = (const struct xe_subplatform_desc[]) { + { XE_SUBPLATFORM_ALDERLAKE_S_RPLS, "RPLS", adls_rpls_ids }, + {}, + }, +}; + +static const u16 adlp_rplu_ids[] = { XE_RPLU_IDS(NOP), 0 }; + +static const struct xe_device_desc adl_p_desc = { + .graphics = &graphics_xelp, + .media = &media_xem, + PLATFORM(XE_ALDERLAKE_P), + .has_display = true, + .has_llc = true, + .require_force_probe = true, + .subplatforms = (const struct xe_subplatform_desc[]) { + { XE_SUBPLATFORM_ALDERLAKE_P_RPLU, "RPLU", adlp_rplu_ids }, + {}, + }, +}; + +static const struct xe_device_desc adl_n_desc = { + .graphics = &graphics_xelp, + .media = &media_xem, + PLATFORM(XE_ALDERLAKE_N), + .has_display = true, + .has_llc = true, + .require_force_probe = true, +}; + +#define DGFX_FEATURES \ + .is_dgfx = 1 + +static const struct xe_device_desc dg1_desc = { + .graphics = &graphics_xelpp, + .media = &media_xem, + DGFX_FEATURES, + PLATFORM(XE_DG1), + .has_display = true, + .has_heci_gscfi = 1, + .require_force_probe = true, +}; + +static const u16 dg2_g10_ids[] = { XE_DG2_G10_IDS(NOP), XE_ATS_M150_IDS(NOP), 0 }; +static const u16 dg2_g11_ids[] = { XE_DG2_G11_IDS(NOP), XE_ATS_M75_IDS(NOP), 0 }; +static const u16 dg2_g12_ids[] = { XE_DG2_G12_IDS(NOP), 0 }; + +#define DG2_FEATURES \ + 
DGFX_FEATURES, \ + PLATFORM(XE_DG2), \ + .has_heci_gscfi = 1, \ + .subplatforms = (const struct xe_subplatform_desc[]) { \ + { XE_SUBPLATFORM_DG2_G10, "G10", dg2_g10_ids }, \ + { XE_SUBPLATFORM_DG2_G11, "G11", dg2_g11_ids }, \ + { XE_SUBPLATFORM_DG2_G12, "G12", dg2_g12_ids }, \ + { } \ + } + +static const struct xe_device_desc ats_m_desc = { + .graphics = &graphics_xehpg, + .media = &media_xehpm, + .require_force_probe = true, + + DG2_FEATURES, + .has_display = false, +}; + +static const struct xe_device_desc dg2_desc = { + .graphics = &graphics_xehpg, + .media = &media_xehpm, + .require_force_probe = true, + + DG2_FEATURES, + .has_display = true, +}; + +static const __maybe_unused struct xe_device_desc pvc_desc = { + .graphics = &graphics_xehpc, + DGFX_FEATURES, + PLATFORM(XE_PVC), + .has_display = false, + .has_heci_gscfi = 1, + .require_force_probe = true, +}; + +static const struct xe_device_desc mtl_desc = { + /* .graphics and .media determined via GMD_ID */ + .require_force_probe = true, + PLATFORM(XE_METEORLAKE), + .has_display = true, +}; + +static const struct xe_device_desc lnl_desc = { + PLATFORM(XE_LUNARLAKE), + .require_force_probe = true, +}; + +#undef PLATFORM +__diag_pop(); + +/* Map of GMD_ID values to graphics IP */ +static struct gmdid_map graphics_ip_map[] = { + { 1270, &graphics_xelpg }, + { 1271, &graphics_xelpg }, + { 2004, &graphics_xe2 }, +}; + +/* Map of GMD_ID values to media IP */ +static struct gmdid_map media_ip_map[] = { + { 1300, &media_xelpmp }, + { 2000, &media_xe2 }, +}; + +#define INTEL_VGA_DEVICE(id, info) { \ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, id), \ + PCI_BASE_CLASS_DISPLAY << 16, 0xff << 16, \ + (unsigned long) info } + +/* + * Make sure any device matches here are from most specific to most + * general. For example, since the Quanta match is based on the subsystem + * and subvendor IDs, we need it to come before the more general IVB + * PCI ID matches, otherwise we'll use the wrong info struct above. 
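+ * (The Quanta/IVB example above predates xe and has no corresponding
+ * entry below; the ordering rule itself still applies to any
+ * subsystem-ID based match that gets added to this table.)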
+ */ +static const struct pci_device_id pciidlist[] = { + XE_TGL_IDS(INTEL_VGA_DEVICE, &tgl_desc), + XE_RKL_IDS(INTEL_VGA_DEVICE, &rkl_desc), + XE_ADLS_IDS(INTEL_VGA_DEVICE, &adl_s_desc), + XE_ADLP_IDS(INTEL_VGA_DEVICE, &adl_p_desc), + XE_ADLN_IDS(INTEL_VGA_DEVICE, &adl_n_desc), + XE_RPLP_IDS(INTEL_VGA_DEVICE, &adl_p_desc), + XE_RPLS_IDS(INTEL_VGA_DEVICE, &adl_s_desc), + XE_DG1_IDS(INTEL_VGA_DEVICE, &dg1_desc), + XE_ATS_M_IDS(INTEL_VGA_DEVICE, &ats_m_desc), + XE_DG2_IDS(INTEL_VGA_DEVICE, &dg2_desc), + XE_MTL_IDS(INTEL_VGA_DEVICE, &mtl_desc), + XE_LNL_IDS(INTEL_VGA_DEVICE, &lnl_desc), + { } +}; +MODULE_DEVICE_TABLE(pci, pciidlist); + +#undef INTEL_VGA_DEVICE + +/* is device_id present in comma separated list of ids */ +static bool device_id_in_list(u16 device_id, const char *devices, bool negative) +{ + char *s, *p, *tok; + bool ret; + + if (!devices || !*devices) + return false; + + /* match everything */ + if (negative && strcmp(devices, "!*") == 0) + return true; + if (!negative && strcmp(devices, "*") == 0) + return true; + + s = kstrdup(devices, GFP_KERNEL); + if (!s) + return false; + + for (p = s, ret = false; (tok = strsep(&p, ",")) != NULL; ) { + u16 val; + + if (negative && tok[0] == '!') + tok++; + else if ((negative && tok[0] != '!') || + (!negative && tok[0] == '!')) + continue; + + if (kstrtou16(tok, 16, &val) == 0 && val == device_id) { + ret = true; + break; + } + } + + kfree(s); + + return ret; +} + +static bool id_forced(u16 device_id) +{ + return device_id_in_list(device_id, xe_modparam.force_probe, false); +} + +static bool id_blocked(u16 device_id) +{ + return device_id_in_list(device_id, xe_modparam.force_probe, true); +} + +static const struct xe_subplatform_desc * +find_subplatform(const struct xe_device *xe, const struct xe_device_desc *desc) +{ + const struct xe_subplatform_desc *sp; + const u16 *id; + + for (sp = desc->subplatforms; sp && sp->subplatform; sp++) + for (id = sp->pciidlist; *id; id++) + if (*id == xe->info.devid) + return sp; + + return NULL; +} + +enum xe_gmdid_type { + GMDID_GRAPHICS, + GMDID_MEDIA +}; + +static void read_gmdid(struct xe_device *xe, enum xe_gmdid_type type, u32 *ver, u32 *revid) +{ + struct xe_gt *gt = xe_root_mmio_gt(xe); + struct xe_reg gmdid_reg = GMD_ID; + u32 val; + + KUNIT_STATIC_STUB_REDIRECT(read_gmdid, xe, type, ver, revid); + + if (type == GMDID_MEDIA) + gmdid_reg.addr += MEDIA_GT_GSI_OFFSET; + + val = xe_mmio_read32(gt, gmdid_reg); + *ver = REG_FIELD_GET(GMD_ID_ARCH_MASK, val) * 100 + REG_FIELD_GET(GMD_ID_RELEASE_MASK, val); + *revid = REG_FIELD_GET(GMD_ID_REVID, val); +} + +/* + * Pre-GMD_ID platform: device descriptor already points to the appropriate + * graphics descriptor. Simply forward the description and calculate the version + * appropriately. "graphics" should be present in all such platforms, while + * media is optional. + */ +static void handle_pre_gmdid(struct xe_device *xe, + const struct xe_graphics_desc *graphics, + const struct xe_media_desc *media) +{ + xe->info.graphics_verx100 = graphics->ver * 100 + graphics->rel; + + if (media) + xe->info.media_verx100 = media->ver * 100 + media->rel; + +} + +/* + * GMD_ID platform: read IP version from hardware and select graphics descriptor + * based on the result. 
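+ * For example, a GMD_ID architecture field of 12 with a release field
+ * of 71 is folded into ver == 1271 by read_gmdid(), which
+ * graphics_ip_map above maps to the Xe_LPG descriptor.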
+ */ +static void handle_gmdid(struct xe_device *xe, + const struct xe_graphics_desc **graphics, + const struct xe_media_desc **media, + u32 *graphics_revid, + u32 *media_revid) +{ + u32 ver; + + read_gmdid(xe, GMDID_GRAPHICS, &ver, graphics_revid); + + for (int i = 0; i < ARRAY_SIZE(graphics_ip_map); i++) { + if (ver == graphics_ip_map[i].ver) { + xe->info.graphics_verx100 = ver; + *graphics = graphics_ip_map[i].ip; + + break; + } + } + + if (!xe->info.graphics_verx100) { + drm_err(&xe->drm, "Hardware reports unknown graphics version %u.%02u\n", + ver / 100, ver % 100); + } + + read_gmdid(xe, GMDID_MEDIA, &ver, media_revid); + + /* Media may legitimately be fused off / not present */ + if (ver == 0) + return; + + for (int i = 0; i < ARRAY_SIZE(media_ip_map); i++) { + if (ver == media_ip_map[i].ver) { + xe->info.media_verx100 = ver; + *media = media_ip_map[i].ip; + + break; + } + } + + if (!xe->info.media_verx100) { + drm_err(&xe->drm, "Hardware reports unknown media version %u.%02u\n", + ver / 100, ver % 100); + } +} + +/* + * Initialize device info content that only depends on static driver_data + * passed to the driver at probe time from PCI ID table. + */ +static int xe_info_init_early(struct xe_device *xe, + const struct xe_device_desc *desc, + const struct xe_subplatform_desc *subplatform_desc) +{ + int err; + + xe->info.platform = desc->platform; + xe->info.subplatform = subplatform_desc ? + subplatform_desc->subplatform : XE_SUBPLATFORM_NONE; + + xe->info.is_dgfx = desc->is_dgfx; + xe->info.has_heci_gscfi = desc->has_heci_gscfi; + xe->info.has_llc = desc->has_llc; + xe->info.has_mmio_ext = desc->has_mmio_ext; + xe->info.has_sriov = desc->has_sriov; + xe->info.skip_guc_pc = desc->skip_guc_pc; + xe->info.skip_mtcfg = desc->skip_mtcfg; + xe->info.skip_pcode = desc->skip_pcode; + + xe->info.enable_display = IS_ENABLED(CONFIG_DRM_XE_DISPLAY) && + xe_modparam.enable_display && + desc->has_display; + + err = xe_tile_init_early(xe_device_get_root_tile(xe), xe, 0); + if (err) + return err; + + return 0; +} + +/* + * Initialize device info content that does require knowledge about + * graphics / media IP version. + * Make sure that GT / tile structures allocated by the driver match the data + * present in device info. + */ +static int xe_info_init(struct xe_device *xe, + const struct xe_graphics_desc *graphics_desc, + const struct xe_media_desc *media_desc) +{ + u32 graphics_gmdid_revid = 0, media_gmdid_revid = 0; + struct xe_tile *tile; + struct xe_gt *gt; + u8 id; + + /* + * If this platform supports GMD_ID, we'll detect the proper IP + * descriptor to use from hardware registers. desc->graphics will only + * ever be set at this point for platforms before GMD_ID. In that case + * the IP descriptions and versions are simply derived from that. + */ + if (graphics_desc) { + handle_pre_gmdid(xe, graphics_desc, media_desc); + xe->info.step = xe_step_pre_gmdid_get(xe); + } else { + xe_assert(xe, !media_desc); + handle_gmdid(xe, &graphics_desc, &media_desc, + &graphics_gmdid_revid, &media_gmdid_revid); + xe->info.step = xe_step_gmdid_get(xe, + graphics_gmdid_revid, + media_gmdid_revid); + } + + /* + * If we couldn't detect the graphics IP, that's considered a fatal + * error and we should abort driver load. Failing to detect media + * IP is non-fatal; we'll just proceed without enabling media support. + */ + if (!graphics_desc) + return -ENODEV; + + xe->info.graphics_name = graphics_desc->name; + xe->info.media_name = media_desc ? 
media_desc->name : "none"; + xe->info.tile_mmio_ext_size = graphics_desc->tile_mmio_ext_size; + + xe->info.dma_mask_size = graphics_desc->dma_mask_size; + xe->info.vram_flags = graphics_desc->vram_flags; + xe->info.va_bits = graphics_desc->va_bits; + xe->info.vm_max_level = graphics_desc->vm_max_level; + xe->info.has_asid = graphics_desc->has_asid; + xe->info.has_flat_ccs = graphics_desc->has_flat_ccs; + xe->info.has_range_tlb_invalidation = graphics_desc->has_range_tlb_invalidation; + xe->info.has_usm = graphics_desc->has_usm; + + /* + * All platforms have at least one primary GT. Any platform with media + * version 13 or higher has an additional dedicated media GT. And + * depending on the graphics IP there may be additional "remote tiles." + * All of these together determine the overall GT count. + * + * FIXME: 'tile_count' here is misnamed since the rest of the driver + * treats it as the number of GTs rather than just the number of tiles. + */ + xe->info.tile_count = 1 + graphics_desc->max_remote_tiles; + + for_each_remote_tile(tile, xe, id) { + int err; + + err = xe_tile_init_early(tile, xe, id); + if (err) + return err; + } + + for_each_tile(tile, xe, id) { + gt = tile->primary_gt; + gt->info.id = xe->info.gt_count++; + gt->info.type = XE_GT_TYPE_MAIN; + gt->info.__engine_mask = graphics_desc->hw_engine_mask; + if (MEDIA_VER(xe) < 13 && media_desc) + gt->info.__engine_mask |= media_desc->hw_engine_mask; + + if (MEDIA_VER(xe) < 13 || !media_desc) + continue; + + /* + * Allocate and setup media GT for platforms with standalone + * media. + */ + tile->media_gt = xe_gt_alloc(tile); + if (IS_ERR(tile->media_gt)) + return PTR_ERR(tile->media_gt); + + gt = tile->media_gt; + gt->info.type = XE_GT_TYPE_MEDIA; + gt->info.__engine_mask = media_desc->hw_engine_mask; + gt->mmio.adj_offset = MEDIA_GT_GSI_OFFSET; + gt->mmio.adj_limit = MEDIA_GT_GSI_LENGTH; + + /* + * FIXME: At the moment multi-tile and standalone media are + * mutually exclusive on current platforms. We'll need to + * come up with a better way to number GTs if we ever wind + * up with platforms that support both together. + */ + drm_WARN_ON(&xe->drm, id != 0); + gt->info.id = xe->info.gt_count++; + } + + return 0; +} + +static void xe_pci_remove(struct pci_dev *pdev) +{ + struct xe_device *xe; + + xe = pci_get_drvdata(pdev); + if (!xe) /* driver load aborted, nothing to cleanup */ + return; + + xe_device_remove(xe); + xe_pm_runtime_fini(xe); + pci_set_drvdata(pdev, NULL); +} + +static int xe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + const struct xe_device_desc *desc = (const void *)ent->driver_data; + const struct xe_subplatform_desc *subplatform_desc; + struct xe_device *xe; + int err; + + if (desc->require_force_probe && !id_forced(pdev->device)) { + dev_info(&pdev->dev, + "Your graphics device %04x is not officially supported\n" + "by xe driver in this kernel version. 
To force Xe probe,\n" + "use xe.force_probe='%04x' and i915.force_probe='!%04x'\n" + "module parameters or CONFIG_DRM_XE_FORCE_PROBE='%04x' and\n" + "CONFIG_DRM_I915_FORCE_PROBE='!%04x' configuration options.\n", + pdev->device, pdev->device, pdev->device, + pdev->device, pdev->device); + return -ENODEV; + } + + if (id_blocked(pdev->device)) { + dev_info(&pdev->dev, "Probe blocked for device [%04x:%04x].\n", + pdev->vendor, pdev->device); + return -ENODEV; + } + + if (xe_display_driver_probe_defer(pdev)) + return -EPROBE_DEFER; + + err = pcim_enable_device(pdev); + if (err) + return err; + + xe = xe_device_create(pdev, ent); + if (IS_ERR(xe)) + return PTR_ERR(xe); + + pci_set_drvdata(pdev, xe); + + xe_pm_assert_unbounded_bridge(xe); + subplatform_desc = find_subplatform(xe, desc); + + pci_set_master(pdev); + + err = xe_info_init_early(xe, desc, subplatform_desc); + if (err) + return err; + + xe_sriov_probe_early(xe, desc->has_sriov); + + err = xe_device_probe_early(xe); + if (err) + return err; + + err = xe_info_init(xe, desc->graphics, desc->media); + if (err) + return err; + + xe_display_probe(xe); + + drm_dbg(&xe->drm, "%s %s %04x:%04x dgfx:%d gfx:%s (%d.%02d) media:%s (%d.%02d) display:%s dma_m_s:%d tc:%d gscfi:%d", + desc->platform_name, + subplatform_desc ? subplatform_desc->name : "", + xe->info.devid, xe->info.revid, + xe->info.is_dgfx, + xe->info.graphics_name, + xe->info.graphics_verx100 / 100, + xe->info.graphics_verx100 % 100, + xe->info.media_name, + xe->info.media_verx100 / 100, + xe->info.media_verx100 % 100, + str_yes_no(xe->info.enable_display), + xe->info.dma_mask_size, xe->info.tile_count, + xe->info.has_heci_gscfi); + + drm_dbg(&xe->drm, "Stepping = (G:%s, M:%s, D:%s, B:%s)\n", + xe_step_name(xe->info.step.graphics), + xe_step_name(xe->info.step.media), + xe_step_name(xe->info.step.display), + xe_step_name(xe->info.step.basedie)); + + drm_dbg(&xe->drm, "SR-IOV support: %s (mode: %s)\n", + str_yes_no(xe_device_has_sriov(xe)), + xe_sriov_mode_to_string(xe_device_sriov_mode(xe))); + + err = xe_device_probe(xe); + if (err) + return err; + + xe_pm_init(xe); + + drm_dbg(&xe->drm, "d3cold: capable=%s\n", + str_yes_no(xe->d3cold.capable)); + + return 0; +} + +static void xe_pci_shutdown(struct pci_dev *pdev) +{ + xe_device_shutdown(pdev_to_xe_device(pdev)); +} + +#ifdef CONFIG_PM_SLEEP +static void d3cold_toggle(struct pci_dev *pdev, enum toggle_d3cold toggle) +{ + struct xe_device *xe = pdev_to_xe_device(pdev); + struct pci_dev *root_pdev; + + if (!xe->d3cold.capable) + return; + + root_pdev = pcie_find_root_port(pdev); + if (!root_pdev) + return; + + switch (toggle) { + case D3COLD_DISABLE: + pci_d3cold_disable(root_pdev); + break; + case D3COLD_ENABLE: + pci_d3cold_enable(root_pdev); + break; + } +} + +static int xe_pci_suspend(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + int err; + + err = xe_pm_suspend(pdev_to_xe_device(pdev)); + if (err) + return err; + + /* + * Enabling D3Cold is needed for S2Idle/S0ix. + * It is save to allow here since xe_pm_suspend has evicted + * the local memory and the direct complete optimization is disabled. 
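+ * (The direct complete optimization is disabled for discrete parts in
+ * xe_pm_runtime_init() via DPM_FLAG_NO_DIRECT_COMPLETE.)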
+ */ + d3cold_toggle(pdev, D3COLD_ENABLE); + + pci_save_state(pdev); + pci_disable_device(pdev); + + return 0; +} + +static int xe_pci_resume(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + int err; + + /* Give back the D3Cold decision to the runtime P M*/ + d3cold_toggle(pdev, D3COLD_DISABLE); + + err = pci_set_power_state(pdev, PCI_D0); + if (err) + return err; + + err = pci_enable_device(pdev); + if (err) + return err; + + pci_set_master(pdev); + + err = xe_pm_resume(pdev_to_xe_device(pdev)); + if (err) + return err; + + return 0; +} + +static int xe_pci_runtime_suspend(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct xe_device *xe = pdev_to_xe_device(pdev); + int err; + + err = xe_pm_runtime_suspend(xe); + if (err) + return err; + + pci_save_state(pdev); + + if (xe->d3cold.allowed) { + d3cold_toggle(pdev, D3COLD_ENABLE); + pci_disable_device(pdev); + pci_ignore_hotplug(pdev); + pci_set_power_state(pdev, PCI_D3cold); + } else { + d3cold_toggle(pdev, D3COLD_DISABLE); + pci_set_power_state(pdev, PCI_D3hot); + } + + return 0; +} + +static int xe_pci_runtime_resume(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct xe_device *xe = pdev_to_xe_device(pdev); + int err; + + err = pci_set_power_state(pdev, PCI_D0); + if (err) + return err; + + pci_restore_state(pdev); + + if (xe->d3cold.allowed) { + err = pci_enable_device(pdev); + if (err) + return err; + + pci_set_master(pdev); + } + + return xe_pm_runtime_resume(xe); +} + +static int xe_pci_runtime_idle(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct xe_device *xe = pdev_to_xe_device(pdev); + + xe_pm_d3cold_allowed_toggle(xe); + + return 0; +} + +static const struct dev_pm_ops xe_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(xe_pci_suspend, xe_pci_resume) + SET_RUNTIME_PM_OPS(xe_pci_runtime_suspend, xe_pci_runtime_resume, xe_pci_runtime_idle) +}; +#endif + +static struct pci_driver xe_pci_driver = { + .name = DRIVER_NAME, + .id_table = pciidlist, + .probe = xe_pci_probe, + .remove = xe_pci_remove, + .shutdown = xe_pci_shutdown, +#ifdef CONFIG_PM_SLEEP + .driver.pm = &xe_pm_ops, +#endif +}; + +int xe_register_pci_driver(void) +{ + return pci_register_driver(&xe_pci_driver); +} + +void xe_unregister_pci_driver(void) +{ + pci_unregister_driver(&xe_pci_driver); +} + +#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) +#include "tests/xe_pci.c" +#endif diff --git a/drivers/gpu/drm/xe/xe_pci.h b/drivers/gpu/drm/xe/xe_pci.h new file mode 100644 index 000000000000..611c1209b14c --- /dev/null +++ b/drivers/gpu/drm/xe/xe_pci.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef _XE_PCI_H_ +#define _XE_PCI_H_ + +int xe_register_pci_driver(void); +void xe_unregister_pci_driver(void); + +#endif diff --git a/drivers/gpu/drm/xe/xe_pci_types.h b/drivers/gpu/drm/xe/xe_pci_types.h new file mode 100644 index 000000000000..b1ad12fa22d6 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_pci_types.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_PCI_TYPES_H_ +#define _XE_PCI_TYPES_H_ + +#include <linux/types.h> + +struct xe_graphics_desc { + const char *name; + u8 ver; + u8 rel; + + u8 dma_mask_size; /* available DMA address bits */ + u8 va_bits; + u8 vm_max_level; + u8 vram_flags; + + u64 hw_engine_mask; /* hardware engines provided by graphics IP */ + + u32 tile_mmio_ext_size; /* size of MMIO extension space, per-tile */ + + u8 max_remote_tiles:2; + + u8 has_asid:1; + u8 
has_flat_ccs:1; + u8 has_range_tlb_invalidation:1; + u8 has_usm:1; +}; + +struct xe_media_desc { + const char *name; + u8 ver; + u8 rel; + + u64 hw_engine_mask; /* hardware engines provided by media IP */ +}; + +struct gmdid_map { + unsigned int ver; + const void *ip; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_pcode.c b/drivers/gpu/drm/xe/xe_pcode.c new file mode 100644 index 000000000000..b324dc2a5deb --- /dev/null +++ b/drivers/gpu/drm/xe/xe_pcode.c @@ -0,0 +1,296 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_pcode.h" + +#include <linux/delay.h> +#include <linux/errno.h> + +#include <drm/drm_managed.h> + +#include "xe_gt.h" +#include "xe_mmio.h" +#include "xe_pcode_api.h" + +/** + * DOC: PCODE + * + * Xe PCODE is the component responsible for interfacing with the PCODE + * firmware. + * It shall provide a very simple ABI to other Xe components, but be the + * single and consolidated place that will communicate with PCODE. All read + * and write operations to PCODE will be internal and private to this component. + * + * What's next: + * - PCODE hw metrics + * - PCODE for display operations + */ + +static int pcode_mailbox_status(struct xe_gt *gt) +{ + u32 err; + static const struct pcode_err_decode err_decode[] = { + [PCODE_ILLEGAL_CMD] = {-ENXIO, "Illegal Command"}, + [PCODE_TIMEOUT] = {-ETIMEDOUT, "Timed out"}, + [PCODE_ILLEGAL_DATA] = {-EINVAL, "Illegal Data"}, + [PCODE_ILLEGAL_SUBCOMMAND] = {-ENXIO, "Illegal Subcommand"}, + [PCODE_LOCKED] = {-EBUSY, "PCODE Locked"}, + [PCODE_GT_RATIO_OUT_OF_RANGE] = {-EOVERFLOW, + "GT ratio out of range"}, + [PCODE_REJECTED] = {-EACCES, "PCODE Rejected"}, + [PCODE_ERROR_MASK] = {-EPROTO, "Unknown"}, + }; + + lockdep_assert_held(>->pcode.lock); + + err = xe_mmio_read32(gt, PCODE_MAILBOX) & PCODE_ERROR_MASK; + if (err) { + drm_err(>_to_xe(gt)->drm, "PCODE Mailbox failed: %d %s", err, + err_decode[err].str ?: "Unknown"); + return err_decode[err].errno ?: -EPROTO; + } + + return 0; +} + +static int pcode_mailbox_rw(struct xe_gt *gt, u32 mbox, u32 *data0, u32 *data1, + unsigned int timeout_ms, bool return_data, + bool atomic) +{ + int err; + + if (gt_to_xe(gt)->info.skip_pcode) + return 0; + + lockdep_assert_held(>->pcode.lock); + + if ((xe_mmio_read32(gt, PCODE_MAILBOX) & PCODE_READY) != 0) + return -EAGAIN; + + xe_mmio_write32(gt, PCODE_DATA0, *data0); + xe_mmio_write32(gt, PCODE_DATA1, data1 ? 
*data1 : 0); + xe_mmio_write32(gt, PCODE_MAILBOX, PCODE_READY | mbox); + + err = xe_mmio_wait32(gt, PCODE_MAILBOX, PCODE_READY, 0, + timeout_ms * 1000, NULL, atomic); + if (err) + return err; + + if (return_data) { + *data0 = xe_mmio_read32(gt, PCODE_DATA0); + if (data1) + *data1 = xe_mmio_read32(gt, PCODE_DATA1); + } + + return pcode_mailbox_status(gt); +} + +int xe_pcode_write_timeout(struct xe_gt *gt, u32 mbox, u32 data, int timeout) +{ + int err; + + mutex_lock(>->pcode.lock); + err = pcode_mailbox_rw(gt, mbox, &data, NULL, timeout, false, false); + mutex_unlock(>->pcode.lock); + + return err; +} + +int xe_pcode_read(struct xe_gt *gt, u32 mbox, u32 *val, u32 *val1) +{ + int err; + + mutex_lock(>->pcode.lock); + err = pcode_mailbox_rw(gt, mbox, val, val1, 1, true, false); + mutex_unlock(>->pcode.lock); + + return err; +} + +static int xe_pcode_try_request(struct xe_gt *gt, u32 mbox, + u32 request, u32 reply_mask, u32 reply, + u32 *status, bool atomic, int timeout_us) +{ + int slept, wait = 10; + + for (slept = 0; slept < timeout_us; slept += wait) { + *status = pcode_mailbox_rw(gt, mbox, &request, NULL, 1, true, + atomic); + if ((*status == 0) && ((request & reply_mask) == reply)) + return 0; + + if (atomic) + udelay(wait); + else + usleep_range(wait, wait << 1); + wait <<= 1; + } + + return -ETIMEDOUT; +} + +/** + * xe_pcode_request - send PCODE request until acknowledgment + * @gt: gt + * @mbox: PCODE mailbox ID the request is targeted for + * @request: request ID + * @reply_mask: mask used to check for request acknowledgment + * @reply: value used to check for request acknowledgment + * @timeout_base_ms: timeout for polling with preemption enabled + * + * Keep resending the @request to @mbox until PCODE acknowledges it, PCODE + * reports an error or an overall timeout of @timeout_base_ms+50 ms expires. + * The request is acknowledged once the PCODE reply dword equals @reply after + * applying @reply_mask. Polling is first attempted with preemption enabled + * for @timeout_base_ms and if this times out for another 50 ms with + * preemption disabled. + * + * Returns 0 on success, %-ETIMEDOUT in case of a timeout, <0 in case of some + * other error as reported by PCODE. + */ +int xe_pcode_request(struct xe_gt *gt, u32 mbox, u32 request, + u32 reply_mask, u32 reply, int timeout_base_ms) +{ + u32 status; + int ret; + + mutex_lock(>->pcode.lock); + + ret = xe_pcode_try_request(gt, mbox, request, reply_mask, reply, &status, + false, timeout_base_ms * 1000); + if (!ret) + goto out; + + /* + * The above can time out if the number of requests was low (2 in the + * worst case) _and_ PCODE was busy for some reason even after a + * (queued) request and @timeout_base_ms delay. As a workaround retry + * the poll with preemption disabled to maximize the number of + * requests. Increase the timeout from @timeout_base_ms to 50ms to + * account for interrupts that could reduce the number of these + * requests, and for any quirks of the PCODE firmware that delays + * the request completion. + */ + drm_err(>_to_xe(gt)->drm, + "PCODE timeout, retrying with preemption disabled\n"); + drm_WARN_ON_ONCE(>_to_xe(gt)->drm, timeout_base_ms > 1); + preempt_disable(); + ret = xe_pcode_try_request(gt, mbox, request, reply_mask, reply, &status, + true, timeout_base_ms * 1000); + preempt_enable(); + +out: + mutex_unlock(>->pcode.lock); + return status ? 
status : ret; +} +/** + * xe_pcode_init_min_freq_table - Initialize PCODE's QOS frequency table + * @gt: gt instance + * @min_gt_freq: Minimal (RPn) GT frequency in units of 50MHz. + * @max_gt_freq: Maximal (RP0) GT frequency in units of 50MHz. + * + * This function initialize PCODE's QOS frequency table for a proper minimal + * frequency/power steering decision, depending on the current requested GT + * frequency. For older platforms this was a more complete table including + * the IA freq. However for the latest platforms this table become a simple + * 1-1 Ring vs GT frequency. Even though, without setting it, PCODE might + * not take the right decisions for some memory frequencies and affect latency. + * + * It returns 0 on success, and -ERROR number on failure, -EINVAL if max + * frequency is higher then the minimal, and other errors directly translated + * from the PCODE Error returs: + * - -ENXIO: "Illegal Command" + * - -ETIMEDOUT: "Timed out" + * - -EINVAL: "Illegal Data" + * - -ENXIO, "Illegal Subcommand" + * - -EBUSY: "PCODE Locked" + * - -EOVERFLOW, "GT ratio out of range" + * - -EACCES, "PCODE Rejected" + * - -EPROTO, "Unknown" + */ +int xe_pcode_init_min_freq_table(struct xe_gt *gt, u32 min_gt_freq, + u32 max_gt_freq) +{ + int ret; + u32 freq; + + if (!gt_to_xe(gt)->info.has_llc) + return 0; + + if (max_gt_freq <= min_gt_freq) + return -EINVAL; + + mutex_lock(>->pcode.lock); + for (freq = min_gt_freq; freq <= max_gt_freq; freq++) { + u32 data = freq << PCODE_FREQ_RING_RATIO_SHIFT | freq; + + ret = pcode_mailbox_rw(gt, PCODE_WRITE_MIN_FREQ_TABLE, + &data, NULL, 1, false, false); + if (ret) + goto unlock; + } + +unlock: + mutex_unlock(>->pcode.lock); + return ret; +} + +/** + * xe_pcode_init - Ensure PCODE is initialized + * @gt: gt instance + * + * This function ensures that PCODE is properly initialized. To be called during + * probe and resume paths. + * + * It returns 0 on success, and -error number on failure. + */ +int xe_pcode_init(struct xe_gt *gt) +{ + u32 status, request = DGFX_GET_INIT_STATUS; + int timeout_us = 180000000; /* 3 min */ + int ret; + + if (gt_to_xe(gt)->info.skip_pcode) + return 0; + + if (!IS_DGFX(gt_to_xe(gt))) + return 0; + + mutex_lock(>->pcode.lock); + ret = xe_pcode_try_request(gt, DGFX_PCODE_STATUS, request, + DGFX_INIT_STATUS_COMPLETE, + DGFX_INIT_STATUS_COMPLETE, + &status, false, timeout_us); + mutex_unlock(>->pcode.lock); + + if (ret) + drm_err(>_to_xe(gt)->drm, + "PCODE initialization timedout after: 3 min\n"); + + return ret; +} + +/** + * xe_pcode_probe - Prepare xe_pcode and also ensure PCODE is initialized. + * @gt: gt instance + * + * This function initializes the xe_pcode component, and when needed, it ensures + * that PCODE has properly performed its initialization and it is really ready + * to go. To be called once only during probe. + * + * It returns 0 on success, and -error number on failure. 
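+ *
+ * On integrated parts, and when info.skip_pcode is set, only the pcode
+ * mutex is initialized and the DGFX init-status poll is skipped.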
+ */ +int xe_pcode_probe(struct xe_gt *gt) +{ + drmm_mutex_init(>_to_xe(gt)->drm, >->pcode.lock); + + if (gt_to_xe(gt)->info.skip_pcode) + return 0; + + if (!IS_DGFX(gt_to_xe(gt))) + return 0; + + return xe_pcode_init(gt); +} diff --git a/drivers/gpu/drm/xe/xe_pcode.h b/drivers/gpu/drm/xe/xe_pcode.h new file mode 100644 index 000000000000..08cb1d047cba --- /dev/null +++ b/drivers/gpu/drm/xe/xe_pcode.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_PCODE_H_ +#define _XE_PCODE_H_ + +#include <linux/types.h> +struct xe_gt; + +int xe_pcode_probe(struct xe_gt *gt); +int xe_pcode_init(struct xe_gt *gt); +int xe_pcode_init_min_freq_table(struct xe_gt *gt, u32 min_gt_freq, + u32 max_gt_freq); +int xe_pcode_read(struct xe_gt *gt, u32 mbox, u32 *val, u32 *val1); +int xe_pcode_write_timeout(struct xe_gt *gt, u32 mbox, u32 val, + int timeout_ms); +#define xe_pcode_write(gt, mbox, val) \ + xe_pcode_write_timeout(gt, mbox, val, 1) + +int xe_pcode_request(struct xe_gt *gt, u32 mbox, u32 request, + u32 reply_mask, u32 reply, int timeout_ms); + +#define PCODE_MBOX(mbcmd, param1, param2)\ + (FIELD_PREP(PCODE_MB_COMMAND, mbcmd)\ + | FIELD_PREP(PCODE_MB_PARAM1, param1)\ + | FIELD_PREP(PCODE_MB_PARAM2, param2)) + +#endif diff --git a/drivers/gpu/drm/xe/xe_pcode_api.h b/drivers/gpu/drm/xe/xe_pcode_api.h new file mode 100644 index 000000000000..5935cfe30204 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_pcode_api.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +/* Internal to xe_pcode */ + +#include "regs/xe_reg_defs.h" + +#define PCODE_MAILBOX XE_REG(0x138124) +#define PCODE_READY REG_BIT(31) +#define PCODE_MB_PARAM2 REG_GENMASK(23, 16) +#define PCODE_MB_PARAM1 REG_GENMASK(15, 8) +#define PCODE_MB_COMMAND REG_GENMASK(7, 0) +#define PCODE_ERROR_MASK 0xFF +#define PCODE_SUCCESS 0x0 +#define PCODE_ILLEGAL_CMD 0x1 +#define PCODE_TIMEOUT 0x2 +#define PCODE_ILLEGAL_DATA 0x3 +#define PCODE_ILLEGAL_SUBCOMMAND 0x4 +#define PCODE_LOCKED 0x6 +#define PCODE_GT_RATIO_OUT_OF_RANGE 0x10 +#define PCODE_REJECTED 0x11 + +#define PCODE_DATA0 XE_REG(0x138128) +#define PCODE_DATA1 XE_REG(0x13812C) + +/* Min Freq QOS Table */ +#define PCODE_WRITE_MIN_FREQ_TABLE 0x8 +#define PCODE_READ_MIN_FREQ_TABLE 0x9 +#define PCODE_FREQ_RING_RATIO_SHIFT 16 + +/* PCODE Init */ +#define DGFX_PCODE_STATUS 0x7E +#define DGFX_GET_INIT_STATUS 0x0 +#define DGFX_INIT_STATUS_COMPLETE 0x1 + +#define PCODE_POWER_SETUP 0x7C +#define POWER_SETUP_SUBCOMMAND_READ_I1 0x4 +#define POWER_SETUP_SUBCOMMAND_WRITE_I1 0x5 +#define POWER_SETUP_I1_WATTS REG_BIT(31) +#define POWER_SETUP_I1_SHIFT 6 /* 10.6 fixed point format */ +#define POWER_SETUP_I1_DATA_MASK REG_GENMASK(15, 0) + +struct pcode_err_decode { + int errno; + const char *str; +}; + diff --git a/drivers/gpu/drm/xe/xe_platform_types.h b/drivers/gpu/drm/xe/xe_platform_types.h new file mode 100644 index 000000000000..553f53dbd093 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_platform_types.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_PLATFORM_INFO_TYPES_H_ +#define _XE_PLATFORM_INFO_TYPES_H_ + +/* + * Keep this in graphics version based order and chronological order within a + * version + */ +enum xe_platform { + XE_PLATFORM_UNINITIALIZED = 0, + XE_TIGERLAKE, + XE_ROCKETLAKE, + XE_ALDERLAKE_S, + XE_ALDERLAKE_P, + XE_ALDERLAKE_N, + XE_DG1, + XE_DG2, + XE_PVC, + XE_METEORLAKE, + XE_LUNARLAKE, +}; + +enum xe_subplatform { + 
XE_SUBPLATFORM_UNINITIALIZED = 0, + XE_SUBPLATFORM_NONE, + XE_SUBPLATFORM_ALDERLAKE_P_RPLU, + XE_SUBPLATFORM_ALDERLAKE_S_RPLS, + XE_SUBPLATFORM_DG2_G10, + XE_SUBPLATFORM_DG2_G11, + XE_SUBPLATFORM_DG2_G12, +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c new file mode 100644 index 000000000000..b429c2876a76 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_pm.c @@ -0,0 +1,405 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_pm.h" + +#include <linux/pm_runtime.h> + +#include <drm/drm_managed.h> +#include <drm/ttm/ttm_placement.h> + +#include "xe_bo.h" +#include "xe_bo_evict.h" +#include "xe_device.h" +#include "xe_device_sysfs.h" +#include "xe_display.h" +#include "xe_ggtt.h" +#include "xe_gt.h" +#include "xe_guc.h" +#include "xe_irq.h" +#include "xe_pcode.h" +#include "xe_wa.h" + +/** + * DOC: Xe Power Management + * + * Xe PM shall be guided by the simplicity. + * Use the simplest hook options whenever possible. + * Let's not reinvent the runtime_pm references and hooks. + * Shall have a clear separation of display and gt underneath this component. + * + * What's next: + * + * For now s2idle and s3 are only working in integrated devices. The next step + * is to iterate through all VRAM's BO backing them up into the system memory + * before allowing the system suspend. + * + * Also runtime_pm needs to be here from the beginning. + * + * RC6/RPS are also critical PM features. Let's start with GuCRC and GuC SLPC + * and no wait boost. Frequency optimizations should come on a next stage. + */ + +/** + * xe_pm_suspend - Helper for System suspend, i.e. S0->S3 / S0->S2idle + * @xe: xe device instance + * + * Return: 0 on success + */ +int xe_pm_suspend(struct xe_device *xe) +{ + struct xe_gt *gt; + u8 id; + int err; + + for_each_gt(gt, xe, id) + xe_gt_suspend_prepare(gt); + + /* FIXME: Super racey... */ + err = xe_bo_evict_all(xe); + if (err) + return err; + + xe_display_pm_suspend(xe); + + for_each_gt(gt, xe, id) { + err = xe_gt_suspend(gt); + if (err) { + xe_display_pm_resume(xe); + return err; + } + } + + xe_irq_suspend(xe); + + xe_display_pm_suspend_late(xe); + + return 0; +} + +/** + * xe_pm_resume - Helper for System resume S3->S0 / S2idle->S0 + * @xe: xe device instance + * + * Return: 0 on success + */ +int xe_pm_resume(struct xe_device *xe) +{ + struct xe_tile *tile; + struct xe_gt *gt; + u8 id; + int err; + + for_each_tile(tile, xe, id) + xe_wa_apply_tile_workarounds(tile); + + for_each_gt(gt, xe, id) { + err = xe_pcode_init(gt); + if (err) + return err; + } + + xe_display_pm_resume_early(xe); + + /* + * This only restores pinned memory which is the memory required for the + * GT(s) to resume. + */ + err = xe_bo_restore_kernel(xe); + if (err) + return err; + + xe_irq_resume(xe); + + xe_display_pm_resume(xe); + + for_each_gt(gt, xe, id) + xe_gt_resume(gt); + + err = xe_bo_restore_user(xe); + if (err) + return err; + + return 0; +} + +static bool xe_pm_pci_d3cold_capable(struct pci_dev *pdev) +{ + struct pci_dev *root_pdev; + + root_pdev = pcie_find_root_port(pdev); + if (!root_pdev) + return false; + + /* D3Cold requires PME capability and _PR3 power resource */ + if (!pci_pme_capable(root_pdev, PCI_D3cold) || !pci_pr3_present(root_pdev)) + return false; + + return true; +} + +static void xe_pm_runtime_init(struct xe_device *xe) +{ + struct device *dev = xe->drm.dev; + + /* + * Disable the system suspend direct complete optimization. 
+ * We need to ensure that the regular device suspend/resume functions + * are called since our runtime_pm cannot guarantee local memory + * eviction for d3cold. + * TODO: Check HDA audio dependencies claimed by i915, and then enforce + * this option to integrated graphics as well. + */ + if (IS_DGFX(xe)) + dev_pm_set_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE); + + pm_runtime_use_autosuspend(dev); + pm_runtime_set_autosuspend_delay(dev, 1000); + pm_runtime_set_active(dev); + pm_runtime_allow(dev); + pm_runtime_mark_last_busy(dev); + pm_runtime_put(dev); +} + +void xe_pm_init(struct xe_device *xe) +{ + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + + /* For now suspend/resume is only allowed with GuC */ + if (!xe_device_uc_enabled(xe)) + return; + + drmm_mutex_init(&xe->drm, &xe->d3cold.lock); + + xe->d3cold.capable = xe_pm_pci_d3cold_capable(pdev); + + if (xe->d3cold.capable) { + xe_device_sysfs_init(xe); + xe_pm_set_vram_threshold(xe, DEFAULT_VRAM_THRESHOLD); + } + + xe_pm_runtime_init(xe); +} + +void xe_pm_runtime_fini(struct xe_device *xe) +{ + struct device *dev = xe->drm.dev; + + pm_runtime_get_sync(dev); + pm_runtime_forbid(dev); +} + +static void xe_pm_write_callback_task(struct xe_device *xe, + struct task_struct *task) +{ + WRITE_ONCE(xe->pm_callback_task, task); + + /* + * Just in case it's somehow possible for our writes to be reordered to + * the extent that something else re-uses the task written in + * pm_callback_task. For example after returning from the callback, but + * before the reordered write that resets pm_callback_task back to NULL. + */ + smp_mb(); /* pairs with xe_pm_read_callback_task */ +} + +struct task_struct *xe_pm_read_callback_task(struct xe_device *xe) +{ + smp_mb(); /* pairs with xe_pm_write_callback_task */ + + return READ_ONCE(xe->pm_callback_task); +} + +int xe_pm_runtime_suspend(struct xe_device *xe) +{ + struct xe_gt *gt; + u8 id; + int err = 0; + + if (xe->d3cold.allowed && xe_device_mem_access_ongoing(xe)) + return -EBUSY; + + /* Disable access_ongoing asserts and prevent recursive pm calls */ + xe_pm_write_callback_task(xe, current); + + /* + * The actual xe_device_mem_access_put() is always async underneath, so + * exactly where that is called should makes no difference to us. However + * we still need to be very careful with the locks that this callback + * acquires and the locks that are acquired and held by any callers of + * xe_device_mem_access_get(). We already have the matching annotation + * on that side, but we also need it here. For example lockdep should be + * able to tell us if the following scenario is in theory possible: + * + * CPU0 | CPU1 (kworker) + * lock(A) | + * | xe_pm_runtime_suspend() + * | lock(A) + * xe_device_mem_access_get() | + * + * This will clearly deadlock since rpm core needs to wait for + * xe_pm_runtime_suspend() to complete, but here we are holding lock(A) + * on CPU0 which prevents CPU1 making forward progress. With the + * annotation here and in xe_device_mem_access_get() lockdep will see + * the potential lock inversion and give us a nice splat. 
+ */ + lock_map_acquire(&xe_device_mem_access_lockdep_map); + + if (xe->d3cold.allowed) { + err = xe_bo_evict_all(xe); + if (err) + goto out; + } + + for_each_gt(gt, xe, id) { + err = xe_gt_suspend(gt); + if (err) + goto out; + } + + xe_irq_suspend(xe); +out: + lock_map_release(&xe_device_mem_access_lockdep_map); + xe_pm_write_callback_task(xe, NULL); + return err; +} + +int xe_pm_runtime_resume(struct xe_device *xe) +{ + struct xe_gt *gt; + u8 id; + int err = 0; + + /* Disable access_ongoing asserts and prevent recursive pm calls */ + xe_pm_write_callback_task(xe, current); + + lock_map_acquire(&xe_device_mem_access_lockdep_map); + + /* + * It can be possible that xe has allowed d3cold but other pcie devices + * in gfx card soc would have blocked d3cold, therefore card has not + * really lost power. Detecting primary Gt power is sufficient. + */ + gt = xe_device_get_gt(xe, 0); + xe->d3cold.power_lost = xe_guc_in_reset(>->uc.guc); + + if (xe->d3cold.allowed && xe->d3cold.power_lost) { + for_each_gt(gt, xe, id) { + err = xe_pcode_init(gt); + if (err) + goto out; + } + + /* + * This only restores pinned memory which is the memory + * required for the GT(s) to resume. + */ + err = xe_bo_restore_kernel(xe); + if (err) + goto out; + } + + xe_irq_resume(xe); + + for_each_gt(gt, xe, id) + xe_gt_resume(gt); + + if (xe->d3cold.allowed && xe->d3cold.power_lost) { + err = xe_bo_restore_user(xe); + if (err) + goto out; + } +out: + lock_map_release(&xe_device_mem_access_lockdep_map); + xe_pm_write_callback_task(xe, NULL); + return err; +} + +int xe_pm_runtime_get(struct xe_device *xe) +{ + return pm_runtime_get_sync(xe->drm.dev); +} + +int xe_pm_runtime_put(struct xe_device *xe) +{ + pm_runtime_mark_last_busy(xe->drm.dev); + return pm_runtime_put(xe->drm.dev); +} + +int xe_pm_runtime_get_if_active(struct xe_device *xe) +{ + return pm_runtime_get_if_active(xe->drm.dev, true); +} + +void xe_pm_assert_unbounded_bridge(struct xe_device *xe) +{ + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + struct pci_dev *bridge = pci_upstream_bridge(pdev); + + if (!bridge) + return; + + if (!bridge->driver) { + drm_warn(&xe->drm, "unbounded parent pci bridge, device won't support any PM support.\n"); + device_set_pm_not_required(&pdev->dev); + } +} + +int xe_pm_set_vram_threshold(struct xe_device *xe, u32 threshold) +{ + struct ttm_resource_manager *man; + u32 vram_total_mb = 0; + int i; + + for (i = XE_PL_VRAM0; i <= XE_PL_VRAM1; ++i) { + man = ttm_manager_type(&xe->ttm, i); + if (man) + vram_total_mb += DIV_ROUND_UP_ULL(man->size, 1024 * 1024); + } + + drm_dbg(&xe->drm, "Total vram %u mb\n", vram_total_mb); + + if (threshold > vram_total_mb) + return -EINVAL; + + mutex_lock(&xe->d3cold.lock); + xe->d3cold.vram_threshold = threshold; + mutex_unlock(&xe->d3cold.lock); + + return 0; +} + +void xe_pm_d3cold_allowed_toggle(struct xe_device *xe) +{ + struct ttm_resource_manager *man; + u32 total_vram_used_mb = 0; + u64 vram_used; + int i; + + if (!xe->d3cold.capable) { + xe->d3cold.allowed = false; + return; + } + + for (i = XE_PL_VRAM0; i <= XE_PL_VRAM1; ++i) { + man = ttm_manager_type(&xe->ttm, i); + if (man) { + vram_used = ttm_resource_manager_usage(man); + total_vram_used_mb += DIV_ROUND_UP_ULL(vram_used, 1024 * 1024); + } + } + + mutex_lock(&xe->d3cold.lock); + + if (total_vram_used_mb < xe->d3cold.vram_threshold) + xe->d3cold.allowed = true; + else + xe->d3cold.allowed = false; + + mutex_unlock(&xe->d3cold.lock); + + drm_dbg(&xe->drm, + "d3cold: allowed=%s\n", str_yes_no(xe->d3cold.allowed)); +} diff --git 
a/drivers/gpu/drm/xe/xe_pm.h b/drivers/gpu/drm/xe/xe_pm.h new file mode 100644 index 000000000000..6b9031f7af24 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_pm.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_PM_H_ +#define _XE_PM_H_ + +#include <linux/pm_runtime.h> + +/* + * TODO: Threshold = 0 will block D3Cold. + * Before we can move this to a higher value (like 300), we need to: + * 1. rewrite the VRAM save / restore to avoid buffer object locks + */ +#define DEFAULT_VRAM_THRESHOLD 0 /* in MB */ + +struct xe_device; + +int xe_pm_suspend(struct xe_device *xe); +int xe_pm_resume(struct xe_device *xe); + +void xe_pm_init(struct xe_device *xe); +void xe_pm_runtime_fini(struct xe_device *xe); +int xe_pm_runtime_suspend(struct xe_device *xe); +int xe_pm_runtime_resume(struct xe_device *xe); +int xe_pm_runtime_get(struct xe_device *xe); +int xe_pm_runtime_put(struct xe_device *xe); +int xe_pm_runtime_get_if_active(struct xe_device *xe); +void xe_pm_assert_unbounded_bridge(struct xe_device *xe); +int xe_pm_set_vram_threshold(struct xe_device *xe, u32 threshold); +void xe_pm_d3cold_allowed_toggle(struct xe_device *xe); +struct task_struct *xe_pm_read_callback_task(struct xe_device *xe); + +#endif diff --git a/drivers/gpu/drm/xe/xe_preempt_fence.c b/drivers/gpu/drm/xe/xe_preempt_fence.c new file mode 100644 index 000000000000..7bce2a332603 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_preempt_fence.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_preempt_fence.h" + +#include <linux/slab.h> + +#include "xe_exec_queue.h" +#include "xe_vm.h" + +static void preempt_fence_work_func(struct work_struct *w) +{ + bool cookie = dma_fence_begin_signalling(); + struct xe_preempt_fence *pfence = + container_of(w, typeof(*pfence), preempt_work); + struct xe_exec_queue *q = pfence->q; + + if (pfence->error) + dma_fence_set_error(&pfence->base, pfence->error); + else + q->ops->suspend_wait(q); + + dma_fence_signal(&pfence->base); + dma_fence_end_signalling(cookie); + + xe_vm_queue_rebind_worker(q->vm); + + xe_exec_queue_put(q); +} + +static const char * +preempt_fence_get_driver_name(struct dma_fence *fence) +{ + return "xe"; +} + +static const char * +preempt_fence_get_timeline_name(struct dma_fence *fence) +{ + return "preempt"; +} + +static bool preempt_fence_enable_signaling(struct dma_fence *fence) +{ + struct xe_preempt_fence *pfence = + container_of(fence, typeof(*pfence), base); + struct xe_exec_queue *q = pfence->q; + + pfence->error = q->ops->suspend(q); + queue_work(system_unbound_wq, &pfence->preempt_work); + return true; +} + +static const struct dma_fence_ops preempt_fence_ops = { + .get_driver_name = preempt_fence_get_driver_name, + .get_timeline_name = preempt_fence_get_timeline_name, + .enable_signaling = preempt_fence_enable_signaling, +}; + +/** + * xe_preempt_fence_alloc() - Allocate a preempt fence with minimal + * initialization + * + * Allocate a preempt fence, and initialize its list head. + * If the preempt_fence allocated has been armed with + * xe_preempt_fence_arm(), it must be freed using dma_fence_put(). If not, + * it must be freed using xe_preempt_fence_free(). + * + * Return: A struct xe_preempt_fence pointer used for calling into + * xe_preempt_fence_arm() or xe_preempt_fence_free(). + * An error pointer on error. 
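+ *
+ * Note that the embedded dma_fence itself is not initialized here; that
+ * only happens once the fence is armed with xe_preempt_fence_arm().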
+ */ +struct xe_preempt_fence *xe_preempt_fence_alloc(void) +{ + struct xe_preempt_fence *pfence; + + pfence = kmalloc(sizeof(*pfence), GFP_KERNEL); + if (!pfence) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&pfence->link); + INIT_WORK(&pfence->preempt_work, preempt_fence_work_func); + + return pfence; +} + +/** + * xe_preempt_fence_free() - Free a preempt fence allocated using + * xe_preempt_fence_alloc(). + * @pfence: pointer obtained from xe_preempt_fence_alloc(); + * + * Free a preempt fence that has not yet been armed. + */ +void xe_preempt_fence_free(struct xe_preempt_fence *pfence) +{ + list_del(&pfence->link); + kfree(pfence); +} + +/** + * xe_preempt_fence_arm() - Arm a preempt fence allocated using + * xe_preempt_fence_alloc(). + * @pfence: The struct xe_preempt_fence pointer returned from + * xe_preempt_fence_alloc(). + * @q: The struct xe_exec_queue used for arming. + * @context: The dma-fence context used for arming. + * @seqno: The dma-fence seqno used for arming. + * + * Inserts the preempt fence into @context's timeline, takes @link off any + * list, and registers the struct xe_exec_queue as the xe_engine to be preempted. + * + * Return: A pointer to a struct dma_fence embedded into the preempt fence. + * This function doesn't error. + */ +struct dma_fence * +xe_preempt_fence_arm(struct xe_preempt_fence *pfence, struct xe_exec_queue *q, + u64 context, u32 seqno) +{ + list_del_init(&pfence->link); + pfence->q = xe_exec_queue_get(q); + dma_fence_init(&pfence->base, &preempt_fence_ops, + &q->compute.lock, context, seqno); + + return &pfence->base; +} + +/** + * xe_preempt_fence_create() - Helper to create and arm a preempt fence. + * @q: The struct xe_exec_queue used for arming. + * @context: The dma-fence context used for arming. + * @seqno: The dma-fence seqno used for arming. + * + * Allocates and inserts the preempt fence into @context's timeline, + * and registers @e as the struct xe_exec_queue to be preempted. + * + * Return: A pointer to the resulting struct dma_fence on success. An error + * pointer on error. 
In particular if allocation fails it returns + * ERR_PTR(-ENOMEM); + */ +struct dma_fence * +xe_preempt_fence_create(struct xe_exec_queue *q, + u64 context, u32 seqno) +{ + struct xe_preempt_fence *pfence; + + pfence = xe_preempt_fence_alloc(); + if (IS_ERR(pfence)) + return ERR_CAST(pfence); + + return xe_preempt_fence_arm(pfence, q, context, seqno); +} + +bool xe_fence_is_xe_preempt(const struct dma_fence *fence) +{ + return fence->ops == &preempt_fence_ops; +} diff --git a/drivers/gpu/drm/xe/xe_preempt_fence.h b/drivers/gpu/drm/xe/xe_preempt_fence.h new file mode 100644 index 000000000000..9406c6fea525 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_preempt_fence.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_PREEMPT_FENCE_H_ +#define _XE_PREEMPT_FENCE_H_ + +#include "xe_preempt_fence_types.h" + +struct list_head; + +struct dma_fence * +xe_preempt_fence_create(struct xe_exec_queue *q, + u64 context, u32 seqno); + +struct xe_preempt_fence *xe_preempt_fence_alloc(void); + +void xe_preempt_fence_free(struct xe_preempt_fence *pfence); + +struct dma_fence * +xe_preempt_fence_arm(struct xe_preempt_fence *pfence, struct xe_exec_queue *q, + u64 context, u32 seqno); + +static inline struct xe_preempt_fence * +to_preempt_fence(struct dma_fence *fence) +{ + return container_of(fence, struct xe_preempt_fence, base); +} + +/** + * xe_preempt_fence_link() - Return a link used to keep unarmed preempt + * fences on a list. + * @pfence: Pointer to the preempt fence. + * + * The link is embedded in the struct xe_preempt_fence. Use + * link_to_preempt_fence() to convert back to the preempt fence. + * + * Return: A pointer to an embedded struct list_head. + */ +static inline struct list_head * +xe_preempt_fence_link(struct xe_preempt_fence *pfence) +{ + return &pfence->link; +} + +/** + * to_preempt_fence_from_link() - Convert back to a preempt fence pointer + * from a link obtained with xe_preempt_fence_link(). + * @link: The struct list_head obtained from xe_preempt_fence_link(). + * + * Return: A pointer to the embedding struct xe_preempt_fence. + */ +static inline struct xe_preempt_fence * +to_preempt_fence_from_link(struct list_head *link) +{ + return container_of(link, struct xe_preempt_fence, link); +} + +bool xe_fence_is_xe_preempt(const struct dma_fence *fence); +#endif diff --git a/drivers/gpu/drm/xe/xe_preempt_fence_types.h b/drivers/gpu/drm/xe/xe_preempt_fence_types.h new file mode 100644 index 000000000000..b54b5c29b533 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_preempt_fence_types.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_PREEMPT_FENCE_TYPES_H_ +#define _XE_PREEMPT_FENCE_TYPES_H_ + +#include <linux/dma-fence.h> +#include <linux/workqueue.h> + +struct xe_exec_queue; + +/** + * struct xe_preempt_fence - XE preempt fence + * + * hardware and triggers a callback once the xe_engine is complete. 
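+ *
+ * Signalling flow (see preempt_fence_enable_signaling() and
+ * preempt_fence_work_func() above): enable_signaling issues
+ * q->ops->suspend(q) and queues @preempt_work; the worker then either
+ * propagates @error or waits in q->ops->suspend_wait(q), signals @base,
+ * and finally queues the vm rebind worker.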
+ */ +struct xe_preempt_fence { + /** @base: dma fence base */ + struct dma_fence base; + /** @link: link into list of pending preempt fences */ + struct list_head link; + /** @q: exec queue for this preempt fence */ + struct xe_exec_queue *q; + /** @preempt_work: work struct which issues preemption */ + struct work_struct preempt_work; + /** @error: preempt fence is in error state */ + int error; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c new file mode 100644 index 000000000000..de1030a47588 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -0,0 +1,1653 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_pt.h" + +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_drm_client.h" +#include "xe_gt.h" +#include "xe_gt_tlb_invalidation.h" +#include "xe_migrate.h" +#include "xe_pt_types.h" +#include "xe_pt_walk.h" +#include "xe_res_cursor.h" +#include "xe_trace.h" +#include "xe_ttm_stolen_mgr.h" +#include "xe_vm.h" + +struct xe_pt_dir { + struct xe_pt pt; + /** @dir: Directory structure for the xe_pt_walk functionality */ + struct xe_ptw_dir dir; +}; + +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM) +#define xe_pt_set_addr(__xe_pt, __addr) ((__xe_pt)->addr = (__addr)) +#define xe_pt_addr(__xe_pt) ((__xe_pt)->addr) +#else +#define xe_pt_set_addr(__xe_pt, __addr) +#define xe_pt_addr(__xe_pt) 0ull +#endif + +static const u64 xe_normal_pt_shifts[] = {12, 21, 30, 39, 48}; +static const u64 xe_compact_pt_shifts[] = {16, 21, 30, 39, 48}; + +#define XE_PT_HIGHEST_LEVEL (ARRAY_SIZE(xe_normal_pt_shifts) - 1) + +static struct xe_pt_dir *as_xe_pt_dir(struct xe_pt *pt) +{ + return container_of(pt, struct xe_pt_dir, pt); +} + +static struct xe_pt *xe_pt_entry(struct xe_pt_dir *pt_dir, unsigned int index) +{ + return container_of(pt_dir->dir.entries[index], struct xe_pt, base); +} + +static u64 __xe_pt_empty_pte(struct xe_tile *tile, struct xe_vm *vm, + unsigned int level) +{ + struct xe_device *xe = tile_to_xe(tile); + u16 pat_index = xe->pat.idx[XE_CACHE_WB]; + u8 id = tile->id; + + if (!xe_vm_has_scratch(vm)) + return 0; + + if (level > MAX_HUGEPTE_LEVEL) + return vm->pt_ops->pde_encode_bo(vm->scratch_pt[id][level - 1]->bo, + 0, pat_index); + + return vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, IS_DGFX(xe), 0) | + XE_PTE_NULL; +} + +/** + * xe_pt_create() - Create a page-table. + * @vm: The vm to create for. + * @tile: The tile to create for. + * @level: The page-table level. + * + * Allocate and initialize a single struct xe_pt metadata structure. Also + * create the corresponding page-table bo, but don't initialize it. If the + * level is grater than zero, then it's assumed to be a directory page- + * table and the directory structure is also allocated and initialized to + * NULL pointers. + * + * Return: A valid struct xe_pt pointer on success, Pointer error code on + * error. + */ +struct xe_pt *xe_pt_create(struct xe_vm *vm, struct xe_tile *tile, + unsigned int level) +{ + struct xe_pt *pt; + struct xe_bo *bo; + size_t size; + int err; + + size = !level ? 
sizeof(struct xe_pt) : sizeof(struct xe_pt_dir) + + XE_PDES * sizeof(struct xe_ptw *); + pt = kzalloc(size, GFP_KERNEL); + if (!pt) + return ERR_PTR(-ENOMEM); + + bo = xe_bo_create_pin_map(vm->xe, tile, vm, SZ_4K, + ttm_bo_type_kernel, + XE_BO_CREATE_VRAM_IF_DGFX(tile) | + XE_BO_CREATE_IGNORE_MIN_PAGE_SIZE_BIT | + XE_BO_CREATE_PINNED_BIT | + XE_BO_CREATE_NO_RESV_EVICT | + XE_BO_PAGETABLE); + if (IS_ERR(bo)) { + err = PTR_ERR(bo); + goto err_kfree; + } + pt->bo = bo; + pt->level = level; + pt->base.dir = level ? &as_xe_pt_dir(pt)->dir : NULL; + + if (vm->xef) + xe_drm_client_add_bo(vm->xef->client, pt->bo); + xe_tile_assert(tile, level <= XE_VM_MAX_LEVEL); + + return pt; + +err_kfree: + kfree(pt); + return ERR_PTR(err); +} + +/** + * xe_pt_populate_empty() - Populate a page-table bo with scratch- or zero + * entries. + * @tile: The tile the scratch pagetable of which to use. + * @vm: The vm we populate for. + * @pt: The pagetable the bo of which to initialize. + * + * Populate the page-table bo of @pt with entries pointing into the tile's + * scratch page-table tree if any. Otherwise populate with zeros. + */ +void xe_pt_populate_empty(struct xe_tile *tile, struct xe_vm *vm, + struct xe_pt *pt) +{ + struct iosys_map *map = &pt->bo->vmap; + u64 empty; + int i; + + if (!xe_vm_has_scratch(vm)) { + /* + * FIXME: Some memory is allocated already allocated to zero? + * Find out which memory that is and avoid this memset... + */ + xe_map_memset(vm->xe, map, 0, 0, SZ_4K); + } else { + empty = __xe_pt_empty_pte(tile, vm, pt->level); + for (i = 0; i < XE_PDES; i++) + xe_pt_write(vm->xe, map, i, empty); + } +} + +/** + * xe_pt_shift() - Return the ilog2 value of the size of the address range of + * a page-table at a certain level. + * @level: The level. + * + * Return: The ilog2 value of the size of the address range of a page-table + * at level @level. + */ +unsigned int xe_pt_shift(unsigned int level) +{ + return XE_PTE_SHIFT + XE_PDE_SHIFT * level; +} + +/** + * xe_pt_destroy() - Destroy a page-table tree. + * @pt: The root of the page-table tree to destroy. + * @flags: vm flags. Currently unused. + * @deferred: List head of lockless list for deferred putting. NULL for + * immediate putting. + * + * Puts the page-table bo, recursively calls xe_pt_destroy on all children + * and finally frees @pt. TODO: Can we remove the @flags argument? + */ +void xe_pt_destroy(struct xe_pt *pt, u32 flags, struct llist_head *deferred) +{ + int i; + + if (!pt) + return; + + XE_WARN_ON(!list_empty(&pt->bo->ttm.base.gpuva.list)); + xe_bo_unpin(pt->bo); + xe_bo_put_deferred(pt->bo, deferred); + + if (pt->level > 0 && pt->num_live) { + struct xe_pt_dir *pt_dir = as_xe_pt_dir(pt); + + for (i = 0; i < XE_PDES; i++) { + if (xe_pt_entry(pt_dir, i)) + xe_pt_destroy(xe_pt_entry(pt_dir, i), flags, + deferred); + } + } + kfree(pt); +} + +/** + * DOC: Pagetable building + * + * Below we use the term "page-table" for both page-directories, containing + * pointers to lower level page-directories or page-tables, and level 0 + * page-tables that contain only page-table-entries pointing to memory pages. + * + * When inserting an address range in an already existing page-table tree + * there will typically be a set of page-tables that are shared with other + * address ranges, and a set that are private to this address range. + * The set of shared page-tables can be at most two per level, + * and those can't be updated immediately because the entries of those + * page-tables may still be in use by the gpu for other mappings. 
Therefore + * when inserting entries into those, we instead stage those insertions by + * adding insertion data into struct xe_vm_pgtable_update structures. This + * data, (subtrees for the cpu and page-table-entries for the gpu) is then + * added in a separate commit step. CPU-data is committed while still under the + * vm lock, the object lock and for userptr, the notifier lock in read mode. + * The GPU async data is committed either by the GPU or CPU after fulfilling + * relevant dependencies. + * For non-shared page-tables (and, in fact, for shared ones that aren't + * existing at the time of staging), we add the data in-place without the + * special update structures. This private part of the page-table tree will + * remain disconnected from the vm page-table tree until data is committed to + * the shared page tables of the vm tree in the commit phase. + */ + +struct xe_pt_update { + /** @update: The update structure we're building for this parent. */ + struct xe_vm_pgtable_update *update; + /** @parent: The parent. Used to detect a parent change. */ + struct xe_pt *parent; + /** @preexisting: Whether the parent was pre-existing or allocated */ + bool preexisting; +}; + +struct xe_pt_stage_bind_walk { + /** base: The base class. */ + struct xe_pt_walk base; + + /* Input parameters for the walk */ + /** @vm: The vm we're building for. */ + struct xe_vm *vm; + /** @tile: The tile we're building for. */ + struct xe_tile *tile; + /** @default_pte: PTE flag only template. No address is associated */ + u64 default_pte; + /** @dma_offset: DMA offset to add to the PTE. */ + u64 dma_offset; + /** + * @needs_64k: This address range enforces 64K alignment and + * granularity. + */ + bool needs_64K; + /** + * @vma: VMA being mapped + */ + struct xe_vma *vma; + + /* Also input, but is updated during the walk*/ + /** @curs: The DMA address cursor. */ + struct xe_res_cursor *curs; + /** @va_curs_start: The Virtual address coresponding to @curs->start */ + u64 va_curs_start; + + /* Output */ + struct xe_walk_update { + /** @wupd.entries: Caller provided storage. */ + struct xe_vm_pgtable_update *entries; + /** @wupd.num_used_entries: Number of update @entries used. */ + unsigned int num_used_entries; + /** @wupd.updates: Tracks the update entry at a given level */ + struct xe_pt_update updates[XE_VM_MAX_LEVEL + 1]; + } wupd; + + /* Walk state */ + /** + * @l0_end_addr: The end address of the current l0 leaf. Used for + * 64K granularity detection. + */ + u64 l0_end_addr; + /** @addr_64K: The start address of the current 64K chunk. */ + u64 addr_64K; + /** @found_64: Whether @add_64K actually points to a 64K chunk. */ + bool found_64K; +}; + +static int +xe_pt_new_shared(struct xe_walk_update *wupd, struct xe_pt *parent, + pgoff_t offset, bool alloc_entries) +{ + struct xe_pt_update *upd = &wupd->updates[parent->level]; + struct xe_vm_pgtable_update *entry; + + /* + * For *each level*, we could only have one active + * struct xt_pt_update at any one time. 
Once we move on to a + * new parent and page-directory, the old one is complete, and + * updates are either already stored in the build tree or in + * @wupd->entries + */ + if (likely(upd->parent == parent)) + return 0; + + upd->parent = parent; + upd->preexisting = true; + + if (wupd->num_used_entries == XE_VM_MAX_LEVEL * 2 + 1) + return -EINVAL; + + entry = wupd->entries + wupd->num_used_entries++; + upd->update = entry; + entry->ofs = offset; + entry->pt_bo = parent->bo; + entry->pt = parent; + entry->flags = 0; + entry->qwords = 0; + + if (alloc_entries) { + entry->pt_entries = kmalloc_array(XE_PDES, + sizeof(*entry->pt_entries), + GFP_KERNEL); + if (!entry->pt_entries) + return -ENOMEM; + } + + return 0; +} + +/* + * NOTE: This is a very frequently called function so we allow ourselves + * to annotate (using branch prediction hints) the fastpath of updating a + * non-pre-existing pagetable with leaf ptes. + */ +static int +xe_pt_insert_entry(struct xe_pt_stage_bind_walk *xe_walk, struct xe_pt *parent, + pgoff_t offset, struct xe_pt *xe_child, u64 pte) +{ + struct xe_pt_update *upd = &xe_walk->wupd.updates[parent->level]; + struct xe_pt_update *child_upd = xe_child ? + &xe_walk->wupd.updates[xe_child->level] : NULL; + int ret; + + ret = xe_pt_new_shared(&xe_walk->wupd, parent, offset, true); + if (unlikely(ret)) + return ret; + + /* + * Register this new pagetable so that it won't be recognized as + * a shared pagetable by a subsequent insertion. + */ + if (unlikely(child_upd)) { + child_upd->update = NULL; + child_upd->parent = xe_child; + child_upd->preexisting = false; + } + + if (likely(!upd->preexisting)) { + /* Continue building a non-connected subtree. */ + struct iosys_map *map = &parent->bo->vmap; + + if (unlikely(xe_child)) + parent->base.dir->entries[offset] = &xe_child->base; + + xe_pt_write(xe_walk->vm->xe, map, offset, pte); + parent->num_live++; + } else { + /* Shared pt. Stage update. */ + unsigned int idx; + struct xe_vm_pgtable_update *entry = upd->update; + + idx = offset - entry->ofs; + entry->pt_entries[idx].pt = xe_child; + entry->pt_entries[idx].pte = pte; + entry->qwords++; + } + + return 0; +} + +static bool xe_pt_hugepte_possible(u64 addr, u64 next, unsigned int level, + struct xe_pt_stage_bind_walk *xe_walk) +{ + u64 size, dma; + + if (level > MAX_HUGEPTE_LEVEL) + return false; + + /* Does the virtual range requested cover a huge pte? */ + if (!xe_pt_covers(addr, next, level, &xe_walk->base)) + return false; + + /* Does the DMA segment cover the whole pte? */ + if (next - xe_walk->va_curs_start > xe_walk->curs->size) + return false; + + /* null VMA's do not have dma addresses */ + if (xe_vma_is_null(xe_walk->vma)) + return true; + + /* Is the DMA address huge PTE size aligned? */ + size = next - addr; + dma = addr - xe_walk->va_curs_start + xe_res_dma(xe_walk->curs); + + return IS_ALIGNED(dma, size); +} + +/* + * Scan the requested mapping to check whether it can be done entirely + * with 64K PTEs. 
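+ *
+ * For example (illustrative): a range qualifies only if its start is 64K
+ * aligned, it does not cross the current level-0 leaf, and every 64K step
+ * of the backing DMA cursor is itself 64K aligned with at least 64K
+ * remaining; NULL VMAs qualify trivially since they carry no DMA addresses.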
+ */ +static bool +xe_pt_scan_64K(u64 addr, u64 next, struct xe_pt_stage_bind_walk *xe_walk) +{ + struct xe_res_cursor curs = *xe_walk->curs; + + if (!IS_ALIGNED(addr, SZ_64K)) + return false; + + if (next > xe_walk->l0_end_addr) + return false; + + /* null VMA's do not have dma addresses */ + if (xe_vma_is_null(xe_walk->vma)) + return true; + + xe_res_next(&curs, addr - xe_walk->va_curs_start); + for (; addr < next; addr += SZ_64K) { + if (!IS_ALIGNED(xe_res_dma(&curs), SZ_64K) || curs.size < SZ_64K) + return false; + + xe_res_next(&curs, SZ_64K); + } + + return addr == next; +} + +/* + * For non-compact "normal" 4K level-0 pagetables, we want to try to group + * addresses together in 64K-contigous regions to add a 64K TLB hint for the + * device to the PTE. + * This function determines whether the address is part of such a + * segment. For VRAM in normal pagetables, this is strictly necessary on + * some devices. + */ +static bool +xe_pt_is_pte_ps64K(u64 addr, u64 next, struct xe_pt_stage_bind_walk *xe_walk) +{ + /* Address is within an already found 64k region */ + if (xe_walk->found_64K && addr - xe_walk->addr_64K < SZ_64K) + return true; + + xe_walk->found_64K = xe_pt_scan_64K(addr, addr + SZ_64K, xe_walk); + xe_walk->addr_64K = addr; + + return xe_walk->found_64K; +} + +static int +xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset, + unsigned int level, u64 addr, u64 next, + struct xe_ptw **child, + enum page_walk_action *action, + struct xe_pt_walk *walk) +{ + struct xe_pt_stage_bind_walk *xe_walk = + container_of(walk, typeof(*xe_walk), base); + u16 pat_index = xe_walk->vma->pat_index; + struct xe_pt *xe_parent = container_of(parent, typeof(*xe_parent), base); + struct xe_vm *vm = xe_walk->vm; + struct xe_pt *xe_child; + bool covers; + int ret = 0; + u64 pte; + + /* Is this a leaf entry ?*/ + if (level == 0 || xe_pt_hugepte_possible(addr, next, level, xe_walk)) { + struct xe_res_cursor *curs = xe_walk->curs; + bool is_null = xe_vma_is_null(xe_walk->vma); + + XE_WARN_ON(xe_walk->va_curs_start != addr); + + pte = vm->pt_ops->pte_encode_vma(is_null ? 0 : + xe_res_dma(curs) + xe_walk->dma_offset, + xe_walk->vma, pat_index, level); + pte |= xe_walk->default_pte; + + /* + * Set the XE_PTE_PS64 hint if possible, otherwise if + * this device *requires* 64K PTE size for VRAM, fail. + */ + if (level == 0 && !xe_parent->is_compact) { + if (xe_pt_is_pte_ps64K(addr, next, xe_walk)) + pte |= XE_PTE_PS64; + else if (XE_WARN_ON(xe_walk->needs_64K)) + return -EINVAL; + } + + ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, NULL, pte); + if (unlikely(ret)) + return ret; + + if (!is_null) + xe_res_next(curs, next - addr); + xe_walk->va_curs_start = next; + xe_walk->vma->gpuva.flags |= (XE_VMA_PTE_4K << level); + *action = ACTION_CONTINUE; + + return ret; + } + + /* + * Descending to lower level. Determine if we need to allocate a + * new page table or -directory, which we do if there is no + * previous one or there is one we can completely replace. 
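+ * A fresh child is allocated when the range covers the whole entry
+ * (covers) or when no child exists yet (!*child); only children that are
+ * not fully covered get pre-populated with scratch or zero entries via
+ * xe_pt_populate_empty().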
+ */ + if (level == 1) { + walk->shifts = xe_normal_pt_shifts; + xe_walk->l0_end_addr = next; + } + + covers = xe_pt_covers(addr, next, level, &xe_walk->base); + if (covers || !*child) { + u64 flags = 0; + + xe_child = xe_pt_create(xe_walk->vm, xe_walk->tile, level - 1); + if (IS_ERR(xe_child)) + return PTR_ERR(xe_child); + + xe_pt_set_addr(xe_child, + round_down(addr, 1ull << walk->shifts[level])); + + if (!covers) + xe_pt_populate_empty(xe_walk->tile, xe_walk->vm, xe_child); + + *child = &xe_child->base; + + /* + * Prefer the compact pagetable layout for L0 if possible. + * TODO: Suballocate the pt bo to avoid wasting a lot of + * memory. + */ + if (GRAPHICS_VERx100(tile_to_xe(xe_walk->tile)) >= 1250 && level == 1 && + covers && xe_pt_scan_64K(addr, next, xe_walk)) { + walk->shifts = xe_compact_pt_shifts; + flags |= XE_PDE_64K; + xe_child->is_compact = true; + } + + pte = vm->pt_ops->pde_encode_bo(xe_child->bo, 0, pat_index) | flags; + ret = xe_pt_insert_entry(xe_walk, xe_parent, offset, xe_child, + pte); + } + + *action = ACTION_SUBTREE; + return ret; +} + +static const struct xe_pt_walk_ops xe_pt_stage_bind_ops = { + .pt_entry = xe_pt_stage_bind_entry, +}; + +/** + * xe_pt_stage_bind() - Build a disconnected page-table tree for a given address + * range. + * @tile: The tile we're building for. + * @vma: The vma indicating the address range. + * @entries: Storage for the update entries used for connecting the tree to + * the main tree at commit time. + * @num_entries: On output contains the number of @entries used. + * + * This function builds a disconnected page-table tree for a given address + * range. The tree is connected to the main vm tree for the gpu using + * xe_migrate_update_pgtables() and for the cpu using xe_pt_commit_bind(). + * The function builds xe_vm_pgtable_update structures for already existing + * shared page-tables, and non-existing shared and non-shared page-tables + * are built and populated directly. + * + * Return 0 on success, negative error code on error. 
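+ *
+ * Typical flow (sketch only; __xe_pt_bind_vma() below is the authoritative
+ * sequence):
+ *
+ *	err = xe_pt_stage_bind(tile, vma, entries, &num_entries);
+ *	...
+ *	fence = xe_migrate_update_pgtables(tile->migrate, vm, bo, q,
+ *					   entries, num_entries,
+ *					   syncs, num_syncs, &pt_update.base);
+ *	...
+ *	xe_pt_commit_bind(vma, entries, num_entries, rebind, deferred);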
+ */ +static int +xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma, + struct xe_vm_pgtable_update *entries, u32 *num_entries) +{ + struct xe_device *xe = tile_to_xe(tile); + struct xe_bo *bo = xe_vma_bo(vma); + bool is_devmem = !xe_vma_is_userptr(vma) && bo && + (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo)); + struct xe_res_cursor curs; + struct xe_pt_stage_bind_walk xe_walk = { + .base = { + .ops = &xe_pt_stage_bind_ops, + .shifts = xe_normal_pt_shifts, + .max_level = XE_PT_HIGHEST_LEVEL, + }, + .vm = xe_vma_vm(vma), + .tile = tile, + .curs = &curs, + .va_curs_start = xe_vma_start(vma), + .vma = vma, + .wupd.entries = entries, + .needs_64K = (xe_vma_vm(vma)->flags & XE_VM_FLAG_64K) && is_devmem, + }; + struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id]; + int ret; + + if (vma && (vma->gpuva.flags & XE_VMA_ATOMIC_PTE_BIT) && + (is_devmem || !IS_DGFX(xe))) + xe_walk.default_pte |= XE_USM_PPGTT_PTE_AE; + + if (is_devmem) { + xe_walk.default_pte |= XE_PPGTT_PTE_DM; + xe_walk.dma_offset = vram_region_gpu_offset(bo->ttm.resource); + } + + if (!xe_vma_has_no_bo(vma) && xe_bo_is_stolen(bo)) + xe_walk.dma_offset = xe_ttm_stolen_gpu_offset(xe_bo_device(bo)); + + xe_bo_assert_held(bo); + + if (!xe_vma_is_null(vma)) { + if (xe_vma_is_userptr(vma)) + xe_res_first_sg(vma->userptr.sg, 0, xe_vma_size(vma), + &curs); + else if (xe_bo_is_vram(bo) || xe_bo_is_stolen(bo)) + xe_res_first(bo->ttm.resource, xe_vma_bo_offset(vma), + xe_vma_size(vma), &curs); + else + xe_res_first_sg(xe_bo_sg(bo), xe_vma_bo_offset(vma), + xe_vma_size(vma), &curs); + } else { + curs.size = xe_vma_size(vma); + } + + ret = xe_pt_walk_range(&pt->base, pt->level, xe_vma_start(vma), + xe_vma_end(vma), &xe_walk.base); + + *num_entries = xe_walk.wupd.num_used_entries; + return ret; +} + +/** + * xe_pt_nonshared_offsets() - Determine the non-shared entry offsets of a + * shared pagetable. + * @addr: The start address within the non-shared pagetable. + * @end: The end address within the non-shared pagetable. + * @level: The level of the non-shared pagetable. + * @walk: Walk info. The function adjusts the walk action. + * @action: next action to perform (see enum page_walk_action) + * @offset: Ignored on input, First non-shared entry on output. + * @end_offset: Ignored on input, Last non-shared entry + 1 on output. + * + * A non-shared page-table has some entries that belong to the address range + * and others that don't. This function determines the entries that belong + * fully to the address range. Depending on level, some entries may + * partially belong to the address range (that can't happen at level 0). + * The function detects that and adjust those offsets to not include those + * partial entries. Iff it does detect partial entries, we know that there must + * be shared page tables also at lower levels, so it adjusts the walk action + * accordingly. + * + * Return: true if there were non-shared entries, false otherwise. 
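+ *
+ * Example (illustrative, level 1 where each entry spans 2M): an unaligned
+ * start skips the first, partially covered entry ((*offset)++) and sets
+ * ACTION_SUBTREE so the shared lower-level table underneath is still
+ * visited; an unaligned end likewise drops the last entry ((*end_offset)--).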
+ */ +static bool xe_pt_nonshared_offsets(u64 addr, u64 end, unsigned int level, + struct xe_pt_walk *walk, + enum page_walk_action *action, + pgoff_t *offset, pgoff_t *end_offset) +{ + u64 size = 1ull << walk->shifts[level]; + + *offset = xe_pt_offset(addr, level, walk); + *end_offset = xe_pt_num_entries(addr, end, level, walk) + *offset; + + if (!level) + return true; + + /* + * If addr or next are not size aligned, there are shared pts at lower + * level, so in that case traverse down the subtree + */ + *action = ACTION_CONTINUE; + if (!IS_ALIGNED(addr, size)) { + *action = ACTION_SUBTREE; + (*offset)++; + } + + if (!IS_ALIGNED(end, size)) { + *action = ACTION_SUBTREE; + (*end_offset)--; + } + + return *end_offset > *offset; +} + +struct xe_pt_zap_ptes_walk { + /** @base: The walk base-class */ + struct xe_pt_walk base; + + /* Input parameters for the walk */ + /** @tile: The tile we're building for */ + struct xe_tile *tile; + + /* Output */ + /** @needs_invalidate: Whether we need to invalidate TLB*/ + bool needs_invalidate; +}; + +static int xe_pt_zap_ptes_entry(struct xe_ptw *parent, pgoff_t offset, + unsigned int level, u64 addr, u64 next, + struct xe_ptw **child, + enum page_walk_action *action, + struct xe_pt_walk *walk) +{ + struct xe_pt_zap_ptes_walk *xe_walk = + container_of(walk, typeof(*xe_walk), base); + struct xe_pt *xe_child = container_of(*child, typeof(*xe_child), base); + pgoff_t end_offset; + + XE_WARN_ON(!*child); + XE_WARN_ON(!level && xe_child->is_compact); + + /* + * Note that we're called from an entry callback, and we're dealing + * with the child of that entry rather than the parent, so need to + * adjust level down. + */ + if (xe_pt_nonshared_offsets(addr, next, --level, walk, action, &offset, + &end_offset)) { + xe_map_memset(tile_to_xe(xe_walk->tile), &xe_child->bo->vmap, + offset * sizeof(u64), 0, + (end_offset - offset) * sizeof(u64)); + xe_walk->needs_invalidate = true; + } + + return 0; +} + +static const struct xe_pt_walk_ops xe_pt_zap_ptes_ops = { + .pt_entry = xe_pt_zap_ptes_entry, +}; + +/** + * xe_pt_zap_ptes() - Zap (zero) gpu ptes of an address range + * @tile: The tile we're zapping for. + * @vma: GPU VMA detailing address range. + * + * Eviction and Userptr invalidation needs to be able to zap the + * gpu ptes of a given address range in pagefaulting mode. + * In order to be able to do that, that function needs access to the shared + * page-table entrieaso it can either clear the leaf PTEs or + * clear the pointers to lower-level page-tables. The caller is required + * to hold the necessary locks to ensure neither the page-table connectivity + * nor the page-table entries of the range is updated from under us. + * + * Return: Whether ptes were actually updated and a TLB invalidation is + * required. 
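+ *
+ * Callers that see a true return are expected to issue a TLB invalidation
+ * for the range, for example through xe_gt_tlb_invalidation_vma() as used
+ * further down in this file.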
+ */ +bool xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma) +{ + struct xe_pt_zap_ptes_walk xe_walk = { + .base = { + .ops = &xe_pt_zap_ptes_ops, + .shifts = xe_normal_pt_shifts, + .max_level = XE_PT_HIGHEST_LEVEL, + }, + .tile = tile, + }; + struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id]; + + if (!(vma->tile_present & BIT(tile->id))) + return false; + + (void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma), + xe_vma_end(vma), &xe_walk.base); + + return xe_walk.needs_invalidate; +} + +static void +xe_vm_populate_pgtable(struct xe_migrate_pt_update *pt_update, struct xe_tile *tile, + struct iosys_map *map, void *data, + u32 qword_ofs, u32 num_qwords, + const struct xe_vm_pgtable_update *update) +{ + struct xe_pt_entry *ptes = update->pt_entries; + u64 *ptr = data; + u32 i; + + for (i = 0; i < num_qwords; i++) { + if (map) + xe_map_wr(tile_to_xe(tile), map, (qword_ofs + i) * + sizeof(u64), u64, ptes[i].pte); + else + ptr[i] = ptes[i].pte; + } +} + +static void xe_pt_abort_bind(struct xe_vma *vma, + struct xe_vm_pgtable_update *entries, + u32 num_entries) +{ + u32 i, j; + + for (i = 0; i < num_entries; i++) { + if (!entries[i].pt_entries) + continue; + + for (j = 0; j < entries[i].qwords; j++) + xe_pt_destroy(entries[i].pt_entries[j].pt, xe_vma_vm(vma)->flags, NULL); + kfree(entries[i].pt_entries); + } +} + +static void xe_pt_commit_locks_assert(struct xe_vma *vma) +{ + struct xe_vm *vm = xe_vma_vm(vma); + + lockdep_assert_held(&vm->lock); + + if (xe_vma_is_userptr(vma)) + lockdep_assert_held_read(&vm->userptr.notifier_lock); + else if (!xe_vma_is_null(vma)) + dma_resv_assert_held(xe_vma_bo(vma)->ttm.base.resv); + + xe_vm_assert_held(vm); +} + +static void xe_pt_commit_bind(struct xe_vma *vma, + struct xe_vm_pgtable_update *entries, + u32 num_entries, bool rebind, + struct llist_head *deferred) +{ + u32 i, j; + + xe_pt_commit_locks_assert(vma); + + for (i = 0; i < num_entries; i++) { + struct xe_pt *pt = entries[i].pt; + struct xe_pt_dir *pt_dir; + + if (!rebind) + pt->num_live += entries[i].qwords; + + if (!pt->level) { + kfree(entries[i].pt_entries); + continue; + } + + pt_dir = as_xe_pt_dir(pt); + for (j = 0; j < entries[i].qwords; j++) { + u32 j_ = j + entries[i].ofs; + struct xe_pt *newpte = entries[i].pt_entries[j].pt; + + if (xe_pt_entry(pt_dir, j_)) + xe_pt_destroy(xe_pt_entry(pt_dir, j_), + xe_vma_vm(vma)->flags, deferred); + + pt_dir->dir.entries[j_] = &newpte->base; + } + kfree(entries[i].pt_entries); + } +} + +static int +xe_pt_prepare_bind(struct xe_tile *tile, struct xe_vma *vma, + struct xe_vm_pgtable_update *entries, u32 *num_entries, + bool rebind) +{ + int err; + + *num_entries = 0; + err = xe_pt_stage_bind(tile, vma, entries, num_entries); + if (!err) + xe_tile_assert(tile, *num_entries); + else /* abort! 
*/ + xe_pt_abort_bind(vma, entries, *num_entries); + + return err; +} + +static void xe_vm_dbg_print_entries(struct xe_device *xe, + const struct xe_vm_pgtable_update *entries, + unsigned int num_entries) +#if (IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)) +{ + unsigned int i; + + vm_dbg(&xe->drm, "%u entries to update\n", num_entries); + for (i = 0; i < num_entries; i++) { + const struct xe_vm_pgtable_update *entry = &entries[i]; + struct xe_pt *xe_pt = entry->pt; + u64 page_size = 1ull << xe_pt_shift(xe_pt->level); + u64 end; + u64 start; + + xe_assert(xe, !entry->pt->is_compact); + start = entry->ofs * page_size; + end = start + page_size * entry->qwords; + vm_dbg(&xe->drm, + "\t%u: Update level %u at (%u + %u) [%llx...%llx) f:%x\n", + i, xe_pt->level, entry->ofs, entry->qwords, + xe_pt_addr(xe_pt) + start, xe_pt_addr(xe_pt) + end, 0); + } +} +#else +{} +#endif + +#ifdef CONFIG_DRM_XE_USERPTR_INVAL_INJECT + +static int xe_pt_userptr_inject_eagain(struct xe_vma *vma) +{ + u32 divisor = vma->userptr.divisor ? vma->userptr.divisor : 2; + static u32 count; + + if (count++ % divisor == divisor - 1) { + struct xe_vm *vm = xe_vma_vm(vma); + + vma->userptr.divisor = divisor << 1; + spin_lock(&vm->userptr.invalidated_lock); + list_move_tail(&vma->userptr.invalidate_link, + &vm->userptr.invalidated); + spin_unlock(&vm->userptr.invalidated_lock); + return true; + } + + return false; +} + +#else + +static bool xe_pt_userptr_inject_eagain(struct xe_vma *vma) +{ + return false; +} + +#endif + +/** + * struct xe_pt_migrate_pt_update - Callback argument for pre-commit callbacks + * @base: Base we derive from. + * @bind: Whether this is a bind or an unbind operation. A bind operation + * makes the pre-commit callback error with -EAGAIN if it detects a + * pending invalidation. + * @locked: Whether the pre-commit callback locked the userptr notifier lock + * and it needs unlocking. + */ +struct xe_pt_migrate_pt_update { + struct xe_migrate_pt_update base; + bool bind; + bool locked; +}; + +/* + * This function adds the needed dependencies to a page-table update job + * to make sure racing jobs for separate bind engines don't race writing + * to the same page-table range, wreaking havoc. Initially use a single + * fence for the entire VM. An optimization would use smaller granularity. + */ +static int xe_pt_vm_dependencies(struct xe_sched_job *job, + struct xe_range_fence_tree *rftree, + u64 start, u64 last) +{ + struct xe_range_fence *rtfence; + struct dma_fence *fence; + int err; + + rtfence = xe_range_fence_tree_first(rftree, start, last); + while (rtfence) { + fence = rtfence->fence; + + if (!dma_fence_is_signaled(fence)) { + /* + * Is this a CPU update? 
GPU is busy updating, so return + * an error + */ + if (!job) + return -ETIME; + + dma_fence_get(fence); + err = drm_sched_job_add_dependency(&job->drm, fence); + if (err) + return err; + } + + rtfence = xe_range_fence_tree_next(rtfence, start, last); + } + + return 0; +} + +static int xe_pt_pre_commit(struct xe_migrate_pt_update *pt_update) +{ + struct xe_range_fence_tree *rftree = + &xe_vma_vm(pt_update->vma)->rftree[pt_update->tile_id]; + + return xe_pt_vm_dependencies(pt_update->job, rftree, + pt_update->start, pt_update->last); +} + +static int xe_pt_userptr_pre_commit(struct xe_migrate_pt_update *pt_update) +{ + struct xe_pt_migrate_pt_update *userptr_update = + container_of(pt_update, typeof(*userptr_update), base); + struct xe_vma *vma = pt_update->vma; + unsigned long notifier_seq = vma->userptr.notifier_seq; + struct xe_vm *vm = xe_vma_vm(vma); + int err = xe_pt_vm_dependencies(pt_update->job, + &vm->rftree[pt_update->tile_id], + pt_update->start, + pt_update->last); + + if (err) + return err; + + userptr_update->locked = false; + + /* + * Wait until nobody is running the invalidation notifier, and + * since we're exiting the loop holding the notifier lock, + * nobody can proceed invalidating either. + * + * Note that we don't update the vma->userptr.notifier_seq since + * we don't update the userptr pages. + */ + do { + down_read(&vm->userptr.notifier_lock); + if (!mmu_interval_read_retry(&vma->userptr.notifier, + notifier_seq)) + break; + + up_read(&vm->userptr.notifier_lock); + + if (userptr_update->bind) + return -EAGAIN; + + notifier_seq = mmu_interval_read_begin(&vma->userptr.notifier); + } while (true); + + /* Inject errors to test_whether they are handled correctly */ + if (userptr_update->bind && xe_pt_userptr_inject_eagain(vma)) { + up_read(&vm->userptr.notifier_lock); + return -EAGAIN; + } + + userptr_update->locked = true; + + return 0; +} + +static const struct xe_migrate_pt_update_ops bind_ops = { + .populate = xe_vm_populate_pgtable, + .pre_commit = xe_pt_pre_commit, +}; + +static const struct xe_migrate_pt_update_ops userptr_bind_ops = { + .populate = xe_vm_populate_pgtable, + .pre_commit = xe_pt_userptr_pre_commit, +}; + +struct invalidation_fence { + struct xe_gt_tlb_invalidation_fence base; + struct xe_gt *gt; + struct xe_vma *vma; + struct dma_fence *fence; + struct dma_fence_cb cb; + struct work_struct work; +}; + +static const char * +invalidation_fence_get_driver_name(struct dma_fence *dma_fence) +{ + return "xe"; +} + +static const char * +invalidation_fence_get_timeline_name(struct dma_fence *dma_fence) +{ + return "invalidation_fence"; +} + +static const struct dma_fence_ops invalidation_fence_ops = { + .get_driver_name = invalidation_fence_get_driver_name, + .get_timeline_name = invalidation_fence_get_timeline_name, +}; + +static void invalidation_fence_cb(struct dma_fence *fence, + struct dma_fence_cb *cb) +{ + struct invalidation_fence *ifence = + container_of(cb, struct invalidation_fence, cb); + + trace_xe_gt_tlb_invalidation_fence_cb(&ifence->base); + if (!ifence->fence->error) { + queue_work(system_wq, &ifence->work); + } else { + ifence->base.base.error = ifence->fence->error; + dma_fence_signal(&ifence->base.base); + dma_fence_put(&ifence->base.base); + } + dma_fence_put(ifence->fence); +} + +static void invalidation_fence_work_func(struct work_struct *w) +{ + struct invalidation_fence *ifence = + container_of(w, struct invalidation_fence, work); + + trace_xe_gt_tlb_invalidation_fence_work_func(&ifence->base); + 
xe_gt_tlb_invalidation_vma(ifence->gt, &ifence->base, ifence->vma); +} + +static int invalidation_fence_init(struct xe_gt *gt, + struct invalidation_fence *ifence, + struct dma_fence *fence, + struct xe_vma *vma) +{ + int ret; + + trace_xe_gt_tlb_invalidation_fence_create(&ifence->base); + + spin_lock_irq(>->tlb_invalidation.lock); + dma_fence_init(&ifence->base.base, &invalidation_fence_ops, + >->tlb_invalidation.lock, + gt->tlb_invalidation.fence_context, + ++gt->tlb_invalidation.fence_seqno); + spin_unlock_irq(>->tlb_invalidation.lock); + + INIT_LIST_HEAD(&ifence->base.link); + + dma_fence_get(&ifence->base.base); /* Ref for caller */ + ifence->fence = fence; + ifence->gt = gt; + ifence->vma = vma; + + INIT_WORK(&ifence->work, invalidation_fence_work_func); + ret = dma_fence_add_callback(fence, &ifence->cb, invalidation_fence_cb); + if (ret == -ENOENT) { + dma_fence_put(ifence->fence); /* Usually dropped in CB */ + invalidation_fence_work_func(&ifence->work); + } else if (ret) { + dma_fence_put(&ifence->base.base); /* Caller ref */ + dma_fence_put(&ifence->base.base); /* Creation ref */ + } + + xe_gt_assert(gt, !ret || ret == -ENOENT); + + return ret && ret != -ENOENT ? ret : 0; +} + +static void xe_pt_calc_rfence_interval(struct xe_vma *vma, + struct xe_pt_migrate_pt_update *update, + struct xe_vm_pgtable_update *entries, + u32 num_entries) +{ + int i, level = 0; + + for (i = 0; i < num_entries; i++) { + const struct xe_vm_pgtable_update *entry = &entries[i]; + + if (entry->pt->level > level) + level = entry->pt->level; + } + + /* Greedy (non-optimal) calculation but simple */ + update->base.start = ALIGN_DOWN(xe_vma_start(vma), + 0x1ull << xe_pt_shift(level)); + update->base.last = ALIGN(xe_vma_end(vma), + 0x1ull << xe_pt_shift(level)) - 1; +} + +/** + * __xe_pt_bind_vma() - Build and connect a page-table tree for the vma + * address range. + * @tile: The tile to bind for. + * @vma: The vma to bind. + * @q: The exec_queue with which to do pipelined page-table updates. + * @syncs: Entries to sync on before binding the built tree to the live vm tree. + * @num_syncs: Number of @sync entries. + * @rebind: Whether we're rebinding this vma to the same address range without + * an unbind in-between. + * + * This function builds a page-table tree (see xe_pt_stage_bind() for more + * information on page-table building), and the xe_vm_pgtable_update entries + * abstracting the operations needed to attach it to the main vm tree. It + * then takes the relevant locks and updates the metadata side of the main + * vm tree and submits the operations for pipelined attachment of the + * gpu page-table to the vm main tree, (which can be done either by the + * cpu and the GPU). + * + * Return: A valid dma-fence representing the pipelined attachment operation + * on success, an error pointer on error. + */ +struct dma_fence * +__xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue *q, + struct xe_sync_entry *syncs, u32 num_syncs, + bool rebind) +{ + struct xe_vm_pgtable_update entries[XE_VM_MAX_LEVEL * 2 + 1]; + struct xe_pt_migrate_pt_update bind_pt_update = { + .base = { + .ops = xe_vma_is_userptr(vma) ? 
&userptr_bind_ops : &bind_ops, + .vma = vma, + .tile_id = tile->id, + }, + .bind = true, + }; + struct xe_vm *vm = xe_vma_vm(vma); + u32 num_entries; + struct dma_fence *fence; + struct invalidation_fence *ifence = NULL; + struct xe_range_fence *rfence; + int err; + + bind_pt_update.locked = false; + xe_bo_assert_held(xe_vma_bo(vma)); + xe_vm_assert_held(vm); + + vm_dbg(&xe_vma_vm(vma)->xe->drm, + "Preparing bind, with range [%llx...%llx) engine %p.\n", + xe_vma_start(vma), xe_vma_end(vma), q); + + err = xe_pt_prepare_bind(tile, vma, entries, &num_entries, rebind); + if (err) + goto err; + xe_tile_assert(tile, num_entries <= ARRAY_SIZE(entries)); + + xe_vm_dbg_print_entries(tile_to_xe(tile), entries, num_entries); + xe_pt_calc_rfence_interval(vma, &bind_pt_update, entries, + num_entries); + + /* + * If rebind, we have to invalidate TLB on !LR vms to invalidate + * cached PTEs point to freed memory. on LR vms this is done + * automatically when the context is re-enabled by the rebind worker, + * or in fault mode it was invalidated on PTE zapping. + * + * If !rebind, and scratch enabled VMs, there is a chance the scratch + * PTE is already cached in the TLB so it needs to be invalidated. + * on !LR VMs this is done in the ring ops preceding a batch, but on + * non-faulting LR, in particular on user-space batch buffer chaining, + * it needs to be done here. + */ + if ((rebind && !xe_vm_in_lr_mode(vm) && !vm->batch_invalidate_tlb) || + (!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) { + ifence = kzalloc(sizeof(*ifence), GFP_KERNEL); + if (!ifence) + return ERR_PTR(-ENOMEM); + } + + rfence = kzalloc(sizeof(*rfence), GFP_KERNEL); + if (!rfence) { + kfree(ifence); + return ERR_PTR(-ENOMEM); + } + + fence = xe_migrate_update_pgtables(tile->migrate, + vm, xe_vma_bo(vma), q, + entries, num_entries, + syncs, num_syncs, + &bind_pt_update.base); + if (!IS_ERR(fence)) { + bool last_munmap_rebind = vma->gpuva.flags & XE_VMA_LAST_REBIND; + LLIST_HEAD(deferred); + int err; + + err = xe_range_fence_insert(&vm->rftree[tile->id], rfence, + &xe_range_fence_kfree_ops, + bind_pt_update.base.start, + bind_pt_update.base.last, fence); + if (err) + dma_fence_wait(fence, false); + + /* TLB invalidation must be done before signaling rebind */ + if (ifence) { + int err = invalidation_fence_init(tile->primary_gt, ifence, fence, + vma); + if (err) { + dma_fence_put(fence); + kfree(ifence); + return ERR_PTR(err); + } + fence = &ifence->base.base; + } + + /* add shared fence now for pagetable delayed destroy */ + dma_resv_add_fence(xe_vm_resv(vm), fence, !rebind && + last_munmap_rebind ? + DMA_RESV_USAGE_KERNEL : + DMA_RESV_USAGE_BOOKKEEP); + + if (!xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm) + dma_resv_add_fence(xe_vma_bo(vma)->ttm.base.resv, fence, + DMA_RESV_USAGE_BOOKKEEP); + xe_pt_commit_bind(vma, entries, num_entries, rebind, + bind_pt_update.locked ? &deferred : NULL); + + /* This vma is live (again?) now */ + vma->tile_present |= BIT(tile->id); + + if (bind_pt_update.locked) { + vma->userptr.initial_bind = true; + up_read(&vm->userptr.notifier_lock); + xe_bo_put_commit(&deferred); + } + if (!rebind && last_munmap_rebind && + xe_vm_in_preempt_fence_mode(vm)) + xe_vm_queue_rebind_worker(vm); + } else { + kfree(rfence); + kfree(ifence); + if (bind_pt_update.locked) + up_read(&vm->userptr.notifier_lock); + xe_pt_abort_bind(vma, entries, num_entries); + } + + return fence; + +err: + return ERR_PTR(err); +} + +struct xe_pt_stage_unbind_walk { + /** @base: The pagewalk base-class. 
*/ + struct xe_pt_walk base; + + /* Input parameters for the walk */ + /** @tile: The tile we're unbinding from. */ + struct xe_tile *tile; + + /** + * @modified_start: Walk range start, modified to include any + * shared pagetables that we're the only user of and can thus + * treat as private. + */ + u64 modified_start; + /** @modified_end: Walk range start, modified like @modified_start. */ + u64 modified_end; + + /* Output */ + /* @wupd: Structure to track the page-table updates we're building */ + struct xe_walk_update wupd; +}; + +/* + * Check whether this range is the only one populating this pagetable, + * and in that case, update the walk range checks so that higher levels don't + * view us as a shared pagetable. + */ +static bool xe_pt_check_kill(u64 addr, u64 next, unsigned int level, + const struct xe_pt *child, + enum page_walk_action *action, + struct xe_pt_walk *walk) +{ + struct xe_pt_stage_unbind_walk *xe_walk = + container_of(walk, typeof(*xe_walk), base); + unsigned int shift = walk->shifts[level]; + u64 size = 1ull << shift; + + if (IS_ALIGNED(addr, size) && IS_ALIGNED(next, size) && + ((next - addr) >> shift) == child->num_live) { + u64 size = 1ull << walk->shifts[level + 1]; + + *action = ACTION_CONTINUE; + + if (xe_walk->modified_start >= addr) + xe_walk->modified_start = round_down(addr, size); + if (xe_walk->modified_end <= next) + xe_walk->modified_end = round_up(next, size); + + return true; + } + + return false; +} + +static int xe_pt_stage_unbind_entry(struct xe_ptw *parent, pgoff_t offset, + unsigned int level, u64 addr, u64 next, + struct xe_ptw **child, + enum page_walk_action *action, + struct xe_pt_walk *walk) +{ + struct xe_pt *xe_child = container_of(*child, typeof(*xe_child), base); + + XE_WARN_ON(!*child); + XE_WARN_ON(!level && xe_child->is_compact); + + xe_pt_check_kill(addr, next, level - 1, xe_child, action, walk); + + return 0; +} + +static int +xe_pt_stage_unbind_post_descend(struct xe_ptw *parent, pgoff_t offset, + unsigned int level, u64 addr, u64 next, + struct xe_ptw **child, + enum page_walk_action *action, + struct xe_pt_walk *walk) +{ + struct xe_pt_stage_unbind_walk *xe_walk = + container_of(walk, typeof(*xe_walk), base); + struct xe_pt *xe_child = container_of(*child, typeof(*xe_child), base); + pgoff_t end_offset; + u64 size = 1ull << walk->shifts[--level]; + + if (!IS_ALIGNED(addr, size)) + addr = xe_walk->modified_start; + if (!IS_ALIGNED(next, size)) + next = xe_walk->modified_end; + + /* Parent == *child is the root pt. Don't kill it. */ + if (parent != *child && + xe_pt_check_kill(addr, next, level, xe_child, action, walk)) + return 0; + + if (!xe_pt_nonshared_offsets(addr, next, level, walk, action, &offset, + &end_offset)) + return 0; + + (void)xe_pt_new_shared(&xe_walk->wupd, xe_child, offset, false); + xe_walk->wupd.updates[level].update->qwords = end_offset - offset; + + return 0; +} + +static const struct xe_pt_walk_ops xe_pt_stage_unbind_ops = { + .pt_entry = xe_pt_stage_unbind_entry, + .pt_post_descend = xe_pt_stage_unbind_post_descend, +}; + +/** + * xe_pt_stage_unbind() - Build page-table update structures for an unbind + * operation + * @tile: The tile we're unbinding for. + * @vma: The vma we're unbinding. + * @entries: Caller-provided storage for the update structures. + * + * Builds page-table update structures for an unbind operation. 
The function + * will attempt to remove all page-tables that we're the only user + * of, and for that to work, the unbind operation must be committed in the + * same critical section that blocks racing binds to the same page-table tree. + * + * Return: The number of entries used. + */ +static unsigned int xe_pt_stage_unbind(struct xe_tile *tile, struct xe_vma *vma, + struct xe_vm_pgtable_update *entries) +{ + struct xe_pt_stage_unbind_walk xe_walk = { + .base = { + .ops = &xe_pt_stage_unbind_ops, + .shifts = xe_normal_pt_shifts, + .max_level = XE_PT_HIGHEST_LEVEL, + }, + .tile = tile, + .modified_start = xe_vma_start(vma), + .modified_end = xe_vma_end(vma), + .wupd.entries = entries, + }; + struct xe_pt *pt = xe_vma_vm(vma)->pt_root[tile->id]; + + (void)xe_pt_walk_shared(&pt->base, pt->level, xe_vma_start(vma), + xe_vma_end(vma), &xe_walk.base); + + return xe_walk.wupd.num_used_entries; +} + +static void +xe_migrate_clear_pgtable_callback(struct xe_migrate_pt_update *pt_update, + struct xe_tile *tile, struct iosys_map *map, + void *ptr, u32 qword_ofs, u32 num_qwords, + const struct xe_vm_pgtable_update *update) +{ + struct xe_vma *vma = pt_update->vma; + u64 empty = __xe_pt_empty_pte(tile, xe_vma_vm(vma), update->pt->level); + int i; + + if (map && map->is_iomem) + for (i = 0; i < num_qwords; ++i) + xe_map_wr(tile_to_xe(tile), map, (qword_ofs + i) * + sizeof(u64), u64, empty); + else if (map) + memset64(map->vaddr + qword_ofs * sizeof(u64), empty, + num_qwords); + else + memset64(ptr, empty, num_qwords); +} + +static void +xe_pt_commit_unbind(struct xe_vma *vma, + struct xe_vm_pgtable_update *entries, u32 num_entries, + struct llist_head *deferred) +{ + u32 j; + + xe_pt_commit_locks_assert(vma); + + for (j = 0; j < num_entries; ++j) { + struct xe_vm_pgtable_update *entry = &entries[j]; + struct xe_pt *pt = entry->pt; + + pt->num_live -= entry->qwords; + if (pt->level) { + struct xe_pt_dir *pt_dir = as_xe_pt_dir(pt); + u32 i; + + for (i = entry->ofs; i < entry->ofs + entry->qwords; + i++) { + if (xe_pt_entry(pt_dir, i)) + xe_pt_destroy(xe_pt_entry(pt_dir, i), + xe_vma_vm(vma)->flags, deferred); + + pt_dir->dir.entries[i] = NULL; + } + } + } +} + +static const struct xe_migrate_pt_update_ops unbind_ops = { + .populate = xe_migrate_clear_pgtable_callback, + .pre_commit = xe_pt_pre_commit, +}; + +static const struct xe_migrate_pt_update_ops userptr_unbind_ops = { + .populate = xe_migrate_clear_pgtable_callback, + .pre_commit = xe_pt_userptr_pre_commit, +}; + +/** + * __xe_pt_unbind_vma() - Disconnect and free a page-table tree for the vma + * address range. + * @tile: The tile to unbind for. + * @vma: The vma to unbind. + * @q: The exec_queue with which to do pipelined page-table updates. + * @syncs: Entries to sync on before disconnecting the tree to be destroyed. + * @num_syncs: Number of @sync entries. + * + * This function builds a the xe_vm_pgtable_update entries abstracting the + * operations needed to detach the page-table tree to be destroyed from the + * man vm tree. + * It then takes the relevant locks and submits the operations for + * pipelined detachment of the gpu page-table from the vm main tree, + * (which can be done either by the cpu and the GPU), Finally it frees the + * detached page-table tree. + * + * Return: A valid dma-fence representing the pipelined detachment operation + * on success, an error pointer on error. 
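+ *
+ * Caller sketch (illustrative; sync handling elided, and assuming the usual
+ * convention that the caller owns the returned fence reference):
+ *
+ *	fence = __xe_pt_unbind_vma(tile, vma, q, syncs, num_syncs);
+ *	if (IS_ERR(fence))
+ *		return PTR_ERR(fence);
+ *	...
+ *	dma_fence_put(fence);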
+ */ +struct dma_fence * +__xe_pt_unbind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue *q, + struct xe_sync_entry *syncs, u32 num_syncs) +{ + struct xe_vm_pgtable_update entries[XE_VM_MAX_LEVEL * 2 + 1]; + struct xe_pt_migrate_pt_update unbind_pt_update = { + .base = { + .ops = xe_vma_is_userptr(vma) ? &userptr_unbind_ops : + &unbind_ops, + .vma = vma, + .tile_id = tile->id, + }, + }; + struct xe_vm *vm = xe_vma_vm(vma); + u32 num_entries; + struct dma_fence *fence = NULL; + struct invalidation_fence *ifence; + struct xe_range_fence *rfence; + + LLIST_HEAD(deferred); + + xe_bo_assert_held(xe_vma_bo(vma)); + xe_vm_assert_held(vm); + + vm_dbg(&xe_vma_vm(vma)->xe->drm, + "Preparing unbind, with range [%llx...%llx) engine %p.\n", + xe_vma_start(vma), xe_vma_end(vma), q); + + num_entries = xe_pt_stage_unbind(tile, vma, entries); + xe_tile_assert(tile, num_entries <= ARRAY_SIZE(entries)); + + xe_vm_dbg_print_entries(tile_to_xe(tile), entries, num_entries); + xe_pt_calc_rfence_interval(vma, &unbind_pt_update, entries, + num_entries); + + ifence = kzalloc(sizeof(*ifence), GFP_KERNEL); + if (!ifence) + return ERR_PTR(-ENOMEM); + + rfence = kzalloc(sizeof(*rfence), GFP_KERNEL); + if (!rfence) { + kfree(ifence); + return ERR_PTR(-ENOMEM); + } + + /* + * Even if we were already evicted and unbind to destroy, we need to + * clear again here. The eviction may have updated pagetables at a + * lower level, because it needs to be more conservative. + */ + fence = xe_migrate_update_pgtables(tile->migrate, + vm, NULL, q ? q : + vm->q[tile->id], + entries, num_entries, + syncs, num_syncs, + &unbind_pt_update.base); + if (!IS_ERR(fence)) { + int err; + + err = xe_range_fence_insert(&vm->rftree[tile->id], rfence, + &xe_range_fence_kfree_ops, + unbind_pt_update.base.start, + unbind_pt_update.base.last, fence); + if (err) + dma_fence_wait(fence, false); + + /* TLB invalidation must be done before signaling unbind */ + err = invalidation_fence_init(tile->primary_gt, ifence, fence, vma); + if (err) { + dma_fence_put(fence); + kfree(ifence); + return ERR_PTR(err); + } + fence = &ifence->base.base; + + /* add shared fence now for pagetable delayed destroy */ + dma_resv_add_fence(xe_vm_resv(vm), fence, + DMA_RESV_USAGE_BOOKKEEP); + + /* This fence will be installed by caller when doing eviction */ + if (!xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm) + dma_resv_add_fence(xe_vma_bo(vma)->ttm.base.resv, fence, + DMA_RESV_USAGE_BOOKKEEP); + xe_pt_commit_unbind(vma, entries, num_entries, + unbind_pt_update.locked ? 
&deferred : NULL); + vma->tile_present &= ~BIT(tile->id); + } else { + kfree(rfence); + kfree(ifence); + } + + if (!vma->tile_present) + list_del_init(&vma->combined_links.rebind); + + if (unbind_pt_update.locked) { + xe_tile_assert(tile, xe_vma_is_userptr(vma)); + + if (!vma->tile_present) { + spin_lock(&vm->userptr.invalidated_lock); + list_del_init(&vma->userptr.invalidate_link); + spin_unlock(&vm->userptr.invalidated_lock); + } + up_read(&vm->userptr.notifier_lock); + xe_bo_put_commit(&deferred); + } + + return fence; +} diff --git a/drivers/gpu/drm/xe/xe_pt.h b/drivers/gpu/drm/xe/xe_pt.h new file mode 100644 index 000000000000..71a4fbfcff43 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_pt.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ +#ifndef _XE_PT_H_ +#define _XE_PT_H_ + +#include <linux/types.h> + +#include "xe_pt_types.h" + +struct dma_fence; +struct xe_bo; +struct xe_device; +struct xe_exec_queue; +struct xe_sync_entry; +struct xe_tile; +struct xe_vm; +struct xe_vma; + +/* Largest huge pte is currently 1GiB. May become device dependent. */ +#define MAX_HUGEPTE_LEVEL 2 + +#define xe_pt_write(xe, map, idx, data) \ + xe_map_wr(xe, map, (idx) * sizeof(u64), u64, data) + +unsigned int xe_pt_shift(unsigned int level); + +struct xe_pt *xe_pt_create(struct xe_vm *vm, struct xe_tile *tile, + unsigned int level); + +void xe_pt_populate_empty(struct xe_tile *tile, struct xe_vm *vm, + struct xe_pt *pt); + +void xe_pt_destroy(struct xe_pt *pt, u32 flags, struct llist_head *deferred); + +struct dma_fence * +__xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue *q, + struct xe_sync_entry *syncs, u32 num_syncs, + bool rebind); + +struct dma_fence * +__xe_pt_unbind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue *q, + struct xe_sync_entry *syncs, u32 num_syncs); + +bool xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma); + +#endif diff --git a/drivers/gpu/drm/xe/xe_pt_types.h b/drivers/gpu/drm/xe/xe_pt_types.h new file mode 100644 index 000000000000..cee70cb0f014 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_pt_types.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_PT_TYPES_H_ +#define _XE_PT_TYPES_H_ + +#include <linux/types.h> + +#include "xe_pt_walk.h" + +struct xe_bo; +struct xe_device; +struct xe_vma; + +enum xe_cache_level { + XE_CACHE_NONE, + XE_CACHE_WT, + XE_CACHE_WB, + XE_CACHE_NONE_COMPRESSION, /*UC + COH_NONE + COMPRESSION */ + __XE_CACHE_LEVEL_COUNT, +}; + +#define XE_VM_MAX_LEVEL 4 + +struct xe_pt { + struct xe_ptw base; + struct xe_bo *bo; + unsigned int level; + unsigned int num_live; + bool rebind; + bool is_compact; +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM) + /** addr: Virtual address start address of the PT. 
*/ + u64 addr; +#endif +}; + +struct xe_pt_ops { + u64 (*pte_encode_bo)(struct xe_bo *bo, u64 bo_offset, + u16 pat_index, u32 pt_level); + u64 (*pte_encode_vma)(u64 pte, struct xe_vma *vma, + u16 pat_index, u32 pt_level); + u64 (*pte_encode_addr)(struct xe_device *xe, u64 addr, + u16 pat_index, + u32 pt_level, bool devmem, u64 flags); + u64 (*pde_encode_bo)(struct xe_bo *bo, u64 bo_offset, + u16 pat_index); +}; + +struct xe_pt_entry { + struct xe_pt *pt; + u64 pte; +}; + +struct xe_vm_pgtable_update { + /** @bo: page table bo to write to */ + struct xe_bo *pt_bo; + + /** @ofs: offset inside this PTE to begin writing to (in qwords) */ + u32 ofs; + + /** @qwords: number of PTE's to write */ + u32 qwords; + + /** @pt: opaque pointer useful for the caller of xe_migrate_update_pgtables */ + struct xe_pt *pt; + + /** @pt_entries: Newly added pagetable entries */ + struct xe_pt_entry *pt_entries; + + /** @flags: Target flags */ + u32 flags; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_pt_walk.c b/drivers/gpu/drm/xe/xe_pt_walk.c new file mode 100644 index 000000000000..8f6c8d063f39 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_pt_walk.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright © 2022 Intel Corporation + */ +#include "xe_pt_walk.h" + +/** + * DOC: GPU page-table tree walking. + * The utilities in this file are similar to the CPU page-table walk + * utilities in mm/pagewalk.c. The main difference is that we distinguish + * the various levels of a page-table tree with an unsigned integer rather + * than by name. 0 is the lowest level, and page-tables with level 0 can + * not be directories pointing to lower levels, whereas all other levels + * can. The user of the utilities determines the highest level. + * + * Nomenclature: + * Each struct xe_ptw, regardless of level is referred to as a page table, and + * multiple page tables typically form a page table tree with page tables at + * intermediate levels being page directories pointing at page tables at lower + * levels. A shared page table for a given address range is a page-table which + * is neither fully within nor fully outside the address range and that can + * thus be shared by two or more address ranges. + * + * Please keep this code generic so that it can used as a drm-wide page- + * table walker should other drivers find use for it. + */ +static u64 xe_pt_addr_end(u64 addr, u64 end, unsigned int level, + const struct xe_pt_walk *walk) +{ + u64 size = 1ull << walk->shifts[level]; + u64 tmp = round_up(addr + 1, size); + + return min_t(u64, tmp, end); +} + +static bool xe_pt_next(pgoff_t *offset, u64 *addr, u64 next, u64 end, + unsigned int level, const struct xe_pt_walk *walk) +{ + pgoff_t step = 1; + + /* Shared pt walk skips to the last pagetable */ + if (unlikely(walk->shared_pt_mode)) { + unsigned int shift = walk->shifts[level]; + u64 skip_to = round_down(end, 1ull << shift); + + if (skip_to > next) { + step += (skip_to - next) >> shift; + next = skip_to; + } + } + + *addr = next; + *offset += step; + + return next != end; +} + +/** + * xe_pt_walk_range() - Walk a range of a gpu page table tree with callbacks + * for each page-table entry in all levels. + * @parent: The root page table for walk start. + * @level: The root page table level. + * @addr: Virtual address start. + * @end: Virtual address end + 1. + * @walk: Walk info. + * + * Similar to the CPU page-table walker, this is a helper to walk + * a gpu page table and call a provided callback function for each entry. 
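+ *
+ * Wiring sketch (illustrative; my_entry stands in for a caller-provided
+ * xe_pt_entry_fn, and the shifts/level initializers mirror those used by
+ * xe_pt.c):
+ *
+ *	static const struct xe_pt_walk_ops my_ops = { .pt_entry = my_entry };
+ *	struct xe_pt_walk walk = {
+ *		.ops = &my_ops,
+ *		.shifts = xe_normal_pt_shifts,
+ *		.max_level = XE_PT_HIGHEST_LEVEL,
+ *	};
+ *
+ *	err = xe_pt_walk_range(&pt->base, pt->level, start, end, &walk);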
+ * + * Return: 0 on success, negative error code on error. The error is + * propagated from the callback and on error the walk is terminated. + */ +int xe_pt_walk_range(struct xe_ptw *parent, unsigned int level, + u64 addr, u64 end, struct xe_pt_walk *walk) +{ + pgoff_t offset = xe_pt_offset(addr, level, walk); + struct xe_ptw **entries = parent->dir ? parent->dir->entries : NULL; + const struct xe_pt_walk_ops *ops = walk->ops; + enum page_walk_action action; + struct xe_ptw *child; + int err = 0; + u64 next; + + do { + next = xe_pt_addr_end(addr, end, level, walk); + if (walk->shared_pt_mode && xe_pt_covers(addr, next, level, + walk)) + continue; +again: + action = ACTION_SUBTREE; + child = entries ? entries[offset] : NULL; + err = ops->pt_entry(parent, offset, level, addr, next, + &child, &action, walk); + if (err) + break; + + /* Probably not needed yet for gpu pagetable walk. */ + if (unlikely(action == ACTION_AGAIN)) + goto again; + + if (likely(!level || !child || action == ACTION_CONTINUE)) + continue; + + err = xe_pt_walk_range(child, level - 1, addr, next, walk); + + if (!err && ops->pt_post_descend) + err = ops->pt_post_descend(parent, offset, level, addr, + next, &child, &action, walk); + if (err) + break; + + } while (xe_pt_next(&offset, &addr, next, end, level, walk)); + + return err; +} + +/** + * xe_pt_walk_shared() - Walk shared page tables of a page-table tree. + * @parent: Root page table directory. + * @level: Level of the root. + * @addr: Start address. + * @end: Last address + 1. + * @walk: Walk info. + * + * This function is similar to xe_pt_walk_range() but it skips page tables + * that are private to the range. Since the root (or @parent) page table is + * typically also a shared page table this function is different in that it + * calls the pt_entry callback and the post_descend callback also for the + * root. The root can be detected in the callbacks by checking whether + * parent == *child. + * Walking only the shared page tables is common for unbind-type operations + * where the page-table entries for an address range are cleared or detached + * from the main page-table tree. + * + * Return: 0 on success, negative error code on error: If a callback + * returns an error, the walk will be terminated and the error returned by + * this function. + */ +int xe_pt_walk_shared(struct xe_ptw *parent, unsigned int level, + u64 addr, u64 end, struct xe_pt_walk *walk) +{ + const struct xe_pt_walk_ops *ops = walk->ops; + enum page_walk_action action = ACTION_SUBTREE; + struct xe_ptw *child = parent; + int err; + + walk->shared_pt_mode = true; + err = walk->ops->pt_entry(parent, 0, level + 1, addr, end, + &child, &action, walk); + + if (err || action != ACTION_SUBTREE) + return err; + + err = xe_pt_walk_range(parent, level, addr, end, walk); + if (!err && ops->pt_post_descend) { + err = ops->pt_post_descend(parent, 0, level + 1, addr, end, + &child, &action, walk); + } + return err; +} diff --git a/drivers/gpu/drm/xe/xe_pt_walk.h b/drivers/gpu/drm/xe/xe_pt_walk.h new file mode 100644 index 000000000000..ec3d1e9efa6d --- /dev/null +++ b/drivers/gpu/drm/xe/xe_pt_walk.h @@ -0,0 +1,161 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright © 2022 Intel Corporation + */ +#ifndef __XE_PT_WALK__ +#define __XE_PT_WALK__ + +#include <linux/pagewalk.h> +#include <linux/types.h> + +struct xe_ptw_dir; + +/** + * struct xe_ptw - base class for driver pagetable subclassing. + * @dir: Pointer to an array of children if any. 
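+ *
+ * For example, struct xe_pt in xe_pt_types.h embeds this struct as its
+ * first member, so walk callbacks can use container_of() on the xe_ptw
+ * pointers they are handed to recover the driver-specific type.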
+ * + * Drivers could subclass this, and if it's a page-directory, typically + * embed the xe_ptw_dir::entries array in the same allocation. + */ +struct xe_ptw { + struct xe_ptw_dir *dir; +}; + +/** + * struct xe_ptw_dir - page directory structure + * @entries: Array holding page directory children. + * + * It is the responsibility of the user to ensure @entries is + * correctly sized. + */ +struct xe_ptw_dir { + struct xe_ptw *entries[0]; +}; + +/** + * struct xe_pt_walk - Embeddable struct for walk parameters + */ +struct xe_pt_walk { + /** @ops: The walk ops used for the pagewalk */ + const struct xe_pt_walk_ops *ops; + /** + * @shifts: Array of page-table entry shifts used for the + * different levels, starting out with the leaf level 0 + * page-shift as the first entry. It's legal for this pointer to be + * changed during the walk. + */ + const u64 *shifts; + /** @max_level: Highest populated level in @sizes */ + unsigned int max_level; + /** + * @shared_pt_mode: Whether to skip all entries that are private + * to the address range and called only for entries that are + * shared with other address ranges. Such entries are referred to + * as shared pagetables. + */ + bool shared_pt_mode; +}; + +/** + * typedef xe_pt_entry_fn - gpu page-table-walk callback-function + * @parent: The parent page.table. + * @offset: The offset (number of entries) into the page table. + * @level: The level of @parent. + * @addr: The virtual address. + * @next: The virtual address for the next call, or end address. + * @child: Pointer to pointer to child page-table at this @offset. The + * function may modify the value pointed to if, for example, allocating a + * child page table. + * @action: The walk action to take upon return. See <linux/pagewalk.h>. + * @walk: The walk parameters. + */ +typedef int (*xe_pt_entry_fn)(struct xe_ptw *parent, pgoff_t offset, + unsigned int level, u64 addr, u64 next, + struct xe_ptw **child, + enum page_walk_action *action, + struct xe_pt_walk *walk); + +/** + * struct xe_pt_walk_ops - Walk callbacks. + */ +struct xe_pt_walk_ops { + /** + * @pt_entry: Callback to be called for each page table entry prior + * to descending to the next level. The returned value of the action + * function parameter is honored. + */ + xe_pt_entry_fn pt_entry; + /** + * @pt_post_descend: Callback to be called for each page table entry + * after return from descending to the next level. The returned value + * of the action function parameter is ignored. + */ + xe_pt_entry_fn pt_post_descend; +}; + +int xe_pt_walk_range(struct xe_ptw *parent, unsigned int level, + u64 addr, u64 end, struct xe_pt_walk *walk); + +int xe_pt_walk_shared(struct xe_ptw *parent, unsigned int level, + u64 addr, u64 end, struct xe_pt_walk *walk); + +/** + * xe_pt_covers - Whether the address range covers an entire entry in @level + * @addr: Start of the range. + * @end: End of range + 1. + * @level: Page table level. + * @walk: Page table walk info. + * + * This function is a helper to aid in determining whether a leaf page table + * entry can be inserted at this @level. + * + * Return: Whether the range provided covers exactly an entry at this level. + */ +static inline bool xe_pt_covers(u64 addr, u64 end, unsigned int level, + const struct xe_pt_walk *walk) +{ + u64 pt_size = 1ull << walk->shifts[level]; + + return end - addr == pt_size && IS_ALIGNED(addr, pt_size); +} + +/** + * xe_pt_num_entries: Number of page-table entries of a given range at this + * level + * @addr: Start address. + * @end: End address. 
+ * @level: Page table level. + * @walk: Walk info. + * + * Return: The number of page table entries at this level between @start and + * @end. + */ +static inline pgoff_t +xe_pt_num_entries(u64 addr, u64 end, unsigned int level, + const struct xe_pt_walk *walk) +{ + u64 pt_size = 1ull << walk->shifts[level]; + + return (round_up(end, pt_size) - round_down(addr, pt_size)) >> + walk->shifts[level]; +} + +/** + * xe_pt_offset: Offset of the page-table entry for a given address. + * @addr: The address. + * @level: Page table level. + * @walk: Walk info. + * + * Return: The page table entry offset for the given address in a + * page table with size indicated by @level. + */ +static inline pgoff_t +xe_pt_offset(u64 addr, unsigned int level, const struct xe_pt_walk *walk) +{ + if (level < walk->max_level) + addr &= ((1ull << walk->shifts[level + 1]) - 1); + + return addr >> walk->shifts[level]; +} + +#endif diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c new file mode 100644 index 000000000000..9b35673b286c --- /dev/null +++ b/drivers/gpu/drm/xe/xe_query.c @@ -0,0 +1,552 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_query.h" + +#include <linux/nospec.h> +#include <linux/sched/clock.h> + +#include <drm/ttm/ttm_placement.h> +#include <drm/xe_drm.h> + +#include "regs/xe_engine_regs.h" +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_exec_queue.h" +#include "xe_ggtt.h" +#include "xe_gt.h" +#include "xe_guc_hwconfig.h" +#include "xe_macros.h" +#include "xe_mmio.h" +#include "xe_ttm_vram_mgr.h" + +static const u16 xe_to_user_engine_class[] = { + [XE_ENGINE_CLASS_RENDER] = DRM_XE_ENGINE_CLASS_RENDER, + [XE_ENGINE_CLASS_COPY] = DRM_XE_ENGINE_CLASS_COPY, + [XE_ENGINE_CLASS_VIDEO_DECODE] = DRM_XE_ENGINE_CLASS_VIDEO_DECODE, + [XE_ENGINE_CLASS_VIDEO_ENHANCE] = DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE, + [XE_ENGINE_CLASS_COMPUTE] = DRM_XE_ENGINE_CLASS_COMPUTE, +}; + +static const enum xe_engine_class user_to_xe_engine_class[] = { + [DRM_XE_ENGINE_CLASS_RENDER] = XE_ENGINE_CLASS_RENDER, + [DRM_XE_ENGINE_CLASS_COPY] = XE_ENGINE_CLASS_COPY, + [DRM_XE_ENGINE_CLASS_VIDEO_DECODE] = XE_ENGINE_CLASS_VIDEO_DECODE, + [DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE] = XE_ENGINE_CLASS_VIDEO_ENHANCE, + [DRM_XE_ENGINE_CLASS_COMPUTE] = XE_ENGINE_CLASS_COMPUTE, +}; + +static size_t calc_hw_engine_info_size(struct xe_device *xe) +{ + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + struct xe_gt *gt; + u8 gt_id; + int i = 0; + + for_each_gt(gt, xe, gt_id) + for_each_hw_engine(hwe, gt, id) { + if (xe_hw_engine_is_reserved(hwe)) + continue; + i++; + } + + return sizeof(struct drm_xe_query_engines) + + i * sizeof(struct drm_xe_engine); +} + +typedef u64 (*__ktime_func_t)(void); +static __ktime_func_t __clock_id_to_func(clockid_t clk_id) +{ + /* + * Use logic same as the perf subsystem to allow user to select the + * reference clock id to be used for timestamps. 
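+ * For example, a caller that sets drm_xe_query_engine_cycles.clockid to
+ * CLOCK_MONOTONIC gets its cpu_timestamp sampled with ktime_get_ns(),
+ * matching timestamps taken with clock_gettime(CLOCK_MONOTONIC, ...)
+ * in userspace.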
+ */ + switch (clk_id) { + case CLOCK_MONOTONIC: + return &ktime_get_ns; + case CLOCK_MONOTONIC_RAW: + return &ktime_get_raw_ns; + case CLOCK_REALTIME: + return &ktime_get_real_ns; + case CLOCK_BOOTTIME: + return &ktime_get_boottime_ns; + case CLOCK_TAI: + return &ktime_get_clocktai_ns; + default: + return NULL; + } +} + +static void +__read_timestamps(struct xe_gt *gt, + struct xe_reg lower_reg, + struct xe_reg upper_reg, + u64 *engine_ts, + u64 *cpu_ts, + u64 *cpu_delta, + __ktime_func_t cpu_clock) +{ + u32 upper, lower, old_upper, loop = 0; + + upper = xe_mmio_read32(gt, upper_reg); + do { + *cpu_delta = local_clock(); + *cpu_ts = cpu_clock(); + lower = xe_mmio_read32(gt, lower_reg); + *cpu_delta = local_clock() - *cpu_delta; + old_upper = upper; + upper = xe_mmio_read32(gt, upper_reg); + } while (upper != old_upper && loop++ < 2); + + *engine_ts = (u64)upper << 32 | lower; +} + +static int +query_engine_cycles(struct xe_device *xe, + struct drm_xe_device_query *query) +{ + struct drm_xe_query_engine_cycles __user *query_ptr; + struct drm_xe_engine_class_instance *eci; + struct drm_xe_query_engine_cycles resp; + size_t size = sizeof(resp); + __ktime_func_t cpu_clock; + struct xe_hw_engine *hwe; + struct xe_gt *gt; + + if (query->size == 0) { + query->size = size; + return 0; + } else if (XE_IOCTL_DBG(xe, query->size != size)) { + return -EINVAL; + } + + query_ptr = u64_to_user_ptr(query->data); + if (copy_from_user(&resp, query_ptr, size)) + return -EFAULT; + + cpu_clock = __clock_id_to_func(resp.clockid); + if (!cpu_clock) + return -EINVAL; + + eci = &resp.eci; + if (eci->gt_id > XE_MAX_GT_PER_TILE) + return -EINVAL; + + gt = xe_device_get_gt(xe, eci->gt_id); + if (!gt) + return -EINVAL; + + if (eci->engine_class >= ARRAY_SIZE(user_to_xe_engine_class)) + return -EINVAL; + + hwe = xe_gt_hw_engine(gt, user_to_xe_engine_class[eci->engine_class], + eci->engine_instance, true); + if (!hwe) + return -EINVAL; + + xe_device_mem_access_get(xe); + xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL); + + __read_timestamps(gt, + RING_TIMESTAMP(hwe->mmio_base), + RING_TIMESTAMP_UDW(hwe->mmio_base), + &resp.engine_cycles, + &resp.cpu_timestamp, + &resp.cpu_delta, + cpu_clock); + + xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL); + xe_device_mem_access_put(xe); + resp.width = 36; + + /* Only write to the output fields of user query */ + if (put_user(resp.cpu_timestamp, &query_ptr->cpu_timestamp)) + return -EFAULT; + + if (put_user(resp.cpu_delta, &query_ptr->cpu_delta)) + return -EFAULT; + + if (put_user(resp.engine_cycles, &query_ptr->engine_cycles)) + return -EFAULT; + + if (put_user(resp.width, &query_ptr->width)) + return -EFAULT; + + return 0; +} + +static int query_engines(struct xe_device *xe, + struct drm_xe_device_query *query) +{ + size_t size = calc_hw_engine_info_size(xe); + struct drm_xe_query_engines __user *query_ptr = + u64_to_user_ptr(query->data); + struct drm_xe_query_engines *engines; + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + struct xe_gt *gt; + u8 gt_id; + int i = 0; + + if (query->size == 0) { + query->size = size; + return 0; + } else if (XE_IOCTL_DBG(xe, query->size != size)) { + return -EINVAL; + } + + engines = kmalloc(size, GFP_KERNEL); + if (!engines) + return -ENOMEM; + + for_each_gt(gt, xe, gt_id) + for_each_hw_engine(hwe, gt, id) { + if (xe_hw_engine_is_reserved(hwe)) + continue; + + engines->engines[i].instance.engine_class = + xe_to_user_engine_class[hwe->class]; + engines->engines[i].instance.engine_instance = + hwe->logical_instance; + 
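+			/*
+			 * This (engine_class, engine_instance, gt_id)
+			 * triplet is the handle userspace later passes
+			 * back, e.g. in the instances array of
+			 * drm_xe_exec_queue_create, to target this engine.
+			 */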
engines->engines[i].instance.gt_id = gt->info.id; + engines->engines[i].instance.pad = 0; + memset(engines->engines[i].reserved, 0, + sizeof(engines->engines[i].reserved)); + + i++; + } + + engines->pad = 0; + engines->num_engines = i; + + if (copy_to_user(query_ptr, engines, size)) { + kfree(engines); + return -EFAULT; + } + kfree(engines); + + return 0; +} + +static size_t calc_mem_regions_size(struct xe_device *xe) +{ + u32 num_managers = 1; + int i; + + for (i = XE_PL_VRAM0; i <= XE_PL_VRAM1; ++i) + if (ttm_manager_type(&xe->ttm, i)) + num_managers++; + + return offsetof(struct drm_xe_query_mem_regions, mem_regions[num_managers]); +} + +static int query_mem_regions(struct xe_device *xe, + struct drm_xe_device_query *query) +{ + size_t size = calc_mem_regions_size(xe); + struct drm_xe_query_mem_regions *mem_regions; + struct drm_xe_query_mem_regions __user *query_ptr = + u64_to_user_ptr(query->data); + struct ttm_resource_manager *man; + int ret, i; + + if (query->size == 0) { + query->size = size; + return 0; + } else if (XE_IOCTL_DBG(xe, query->size != size)) { + return -EINVAL; + } + + mem_regions = kzalloc(size, GFP_KERNEL); + if (XE_IOCTL_DBG(xe, !mem_regions)) + return -ENOMEM; + + man = ttm_manager_type(&xe->ttm, XE_PL_TT); + mem_regions->mem_regions[0].mem_class = DRM_XE_MEM_REGION_CLASS_SYSMEM; + /* + * The instance needs to be a unique number that represents the index + * in the placement mask used at xe_gem_create_ioctl() for the + * xe_bo_create() placement. + */ + mem_regions->mem_regions[0].instance = 0; + mem_regions->mem_regions[0].min_page_size = PAGE_SIZE; + mem_regions->mem_regions[0].total_size = man->size << PAGE_SHIFT; + if (perfmon_capable()) + mem_regions->mem_regions[0].used = ttm_resource_manager_usage(man); + mem_regions->num_mem_regions = 1; + + for (i = XE_PL_VRAM0; i <= XE_PL_VRAM1; ++i) { + man = ttm_manager_type(&xe->ttm, i); + if (man) { + mem_regions->mem_regions[mem_regions->num_mem_regions].mem_class = + DRM_XE_MEM_REGION_CLASS_VRAM; + mem_regions->mem_regions[mem_regions->num_mem_regions].instance = + mem_regions->num_mem_regions; + mem_regions->mem_regions[mem_regions->num_mem_regions].min_page_size = + xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K ? 
+ SZ_64K : PAGE_SIZE; + mem_regions->mem_regions[mem_regions->num_mem_regions].total_size = + man->size; + + if (perfmon_capable()) { + xe_ttm_vram_get_used(man, + &mem_regions->mem_regions + [mem_regions->num_mem_regions].used, + &mem_regions->mem_regions + [mem_regions->num_mem_regions].cpu_visible_used); + } + + mem_regions->mem_regions[mem_regions->num_mem_regions].cpu_visible_size = + xe_ttm_vram_get_cpu_visible_size(man); + mem_regions->num_mem_regions++; + } + } + + if (!copy_to_user(query_ptr, mem_regions, size)) + ret = 0; + else + ret = -ENOSPC; + + kfree(mem_regions); + return ret; +} + +static int query_config(struct xe_device *xe, struct drm_xe_device_query *query) +{ + const u32 num_params = DRM_XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY + 1; + size_t size = + sizeof(struct drm_xe_query_config) + num_params * sizeof(u64); + struct drm_xe_query_config __user *query_ptr = + u64_to_user_ptr(query->data); + struct drm_xe_query_config *config; + + if (query->size == 0) { + query->size = size; + return 0; + } else if (XE_IOCTL_DBG(xe, query->size != size)) { + return -EINVAL; + } + + config = kzalloc(size, GFP_KERNEL); + if (!config) + return -ENOMEM; + + config->num_params = num_params; + config->info[DRM_XE_QUERY_CONFIG_REV_AND_DEVICE_ID] = + xe->info.devid | (xe->info.revid << 16); + if (xe_device_get_root_tile(xe)->mem.vram.usable_size) + config->info[DRM_XE_QUERY_CONFIG_FLAGS] = + DRM_XE_QUERY_CONFIG_FLAG_HAS_VRAM; + config->info[DRM_XE_QUERY_CONFIG_MIN_ALIGNMENT] = + xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K ? SZ_64K : SZ_4K; + config->info[DRM_XE_QUERY_CONFIG_VA_BITS] = xe->info.va_bits; + config->info[DRM_XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY] = + xe_exec_queue_device_get_max_priority(xe); + + if (copy_to_user(query_ptr, config, size)) { + kfree(config); + return -EFAULT; + } + kfree(config); + + return 0; +} + +static int query_gt_list(struct xe_device *xe, struct drm_xe_device_query *query) +{ + struct xe_gt *gt; + size_t size = sizeof(struct drm_xe_query_gt_list) + + xe->info.gt_count * sizeof(struct drm_xe_gt); + struct drm_xe_query_gt_list __user *query_ptr = + u64_to_user_ptr(query->data); + struct drm_xe_query_gt_list *gt_list; + u8 id; + + if (query->size == 0) { + query->size = size; + return 0; + } else if (XE_IOCTL_DBG(xe, query->size != size)) { + return -EINVAL; + } + + gt_list = kzalloc(size, GFP_KERNEL); + if (!gt_list) + return -ENOMEM; + + gt_list->num_gt = xe->info.gt_count; + + for_each_gt(gt, xe, id) { + if (xe_gt_is_media_type(gt)) + gt_list->gt_list[id].type = DRM_XE_QUERY_GT_TYPE_MEDIA; + else + gt_list->gt_list[id].type = DRM_XE_QUERY_GT_TYPE_MAIN; + gt_list->gt_list[id].tile_id = gt_to_tile(gt)->id; + gt_list->gt_list[id].gt_id = gt->info.id; + gt_list->gt_list[id].reference_clock = gt->info.reference_clock; + /* + * The mem_regions indexes in the mask below need to + * directly identify the struct + * drm_xe_query_mem_regions' instance constructed at + * query_mem_regions() + * + * For our current platforms: + * Bit 0 -> System Memory + * Bit 1 -> VRAM0 on Tile0 + * Bit 2 -> VRAM1 on Tile1 + * However the uAPI is generic and it's userspace's + * responsibility to check the mem_class, without any + * assumption. 
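+ *
+ * With the scheme below an integrated part reports near_mem_regions
+ * 0x1 (system memory only), while a GT on tile N of a discrete part
+ * reports BIT(N) << 1, e.g. 0x2 for tile 0 and 0x4 for tile 1, with
+ * far_mem_regions holding the remaining bits of mem_region_mask.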
+ */ + if (!IS_DGFX(xe)) + gt_list->gt_list[id].near_mem_regions = 0x1; + else + gt_list->gt_list[id].near_mem_regions = + BIT(gt_to_tile(gt)->id) << 1; + gt_list->gt_list[id].far_mem_regions = xe->info.mem_region_mask ^ + gt_list->gt_list[id].near_mem_regions; + } + + if (copy_to_user(query_ptr, gt_list, size)) { + kfree(gt_list); + return -EFAULT; + } + kfree(gt_list); + + return 0; +} + +static int query_hwconfig(struct xe_device *xe, + struct drm_xe_device_query *query) +{ + struct xe_gt *gt = xe_root_mmio_gt(xe); + size_t size = xe_guc_hwconfig_size(>->uc.guc); + void __user *query_ptr = u64_to_user_ptr(query->data); + void *hwconfig; + + if (query->size == 0) { + query->size = size; + return 0; + } else if (XE_IOCTL_DBG(xe, query->size != size)) { + return -EINVAL; + } + + hwconfig = kzalloc(size, GFP_KERNEL); + if (!hwconfig) + return -ENOMEM; + + xe_device_mem_access_get(xe); + xe_guc_hwconfig_copy(>->uc.guc, hwconfig); + xe_device_mem_access_put(xe); + + if (copy_to_user(query_ptr, hwconfig, size)) { + kfree(hwconfig); + return -EFAULT; + } + kfree(hwconfig); + + return 0; +} + +static size_t calc_topo_query_size(struct xe_device *xe) +{ + return xe->info.gt_count * + (3 * sizeof(struct drm_xe_query_topology_mask) + + sizeof_field(struct xe_gt, fuse_topo.g_dss_mask) + + sizeof_field(struct xe_gt, fuse_topo.c_dss_mask) + + sizeof_field(struct xe_gt, fuse_topo.eu_mask_per_dss)); +} + +static void __user *copy_mask(void __user *ptr, + struct drm_xe_query_topology_mask *topo, + void *mask, size_t mask_size) +{ + topo->num_bytes = mask_size; + + if (copy_to_user(ptr, topo, sizeof(*topo))) + return ERR_PTR(-EFAULT); + ptr += sizeof(topo); + + if (copy_to_user(ptr, mask, mask_size)) + return ERR_PTR(-EFAULT); + ptr += mask_size; + + return ptr; +} + +static int query_gt_topology(struct xe_device *xe, + struct drm_xe_device_query *query) +{ + void __user *query_ptr = u64_to_user_ptr(query->data); + size_t size = calc_topo_query_size(xe); + struct drm_xe_query_topology_mask topo; + struct xe_gt *gt; + int id; + + if (query->size == 0) { + query->size = size; + return 0; + } else if (XE_IOCTL_DBG(xe, query->size != size)) { + return -EINVAL; + } + + for_each_gt(gt, xe, id) { + topo.gt_id = id; + + topo.type = DRM_XE_TOPO_DSS_GEOMETRY; + query_ptr = copy_mask(query_ptr, &topo, + gt->fuse_topo.g_dss_mask, + sizeof(gt->fuse_topo.g_dss_mask)); + if (IS_ERR(query_ptr)) + return PTR_ERR(query_ptr); + + topo.type = DRM_XE_TOPO_DSS_COMPUTE; + query_ptr = copy_mask(query_ptr, &topo, + gt->fuse_topo.c_dss_mask, + sizeof(gt->fuse_topo.c_dss_mask)); + if (IS_ERR(query_ptr)) + return PTR_ERR(query_ptr); + + topo.type = DRM_XE_TOPO_EU_PER_DSS; + query_ptr = copy_mask(query_ptr, &topo, + gt->fuse_topo.eu_mask_per_dss, + sizeof(gt->fuse_topo.eu_mask_per_dss)); + if (IS_ERR(query_ptr)) + return PTR_ERR(query_ptr); + } + + return 0; +} + +static int (* const xe_query_funcs[])(struct xe_device *xe, + struct drm_xe_device_query *query) = { + query_engines, + query_mem_regions, + query_config, + query_gt_list, + query_hwconfig, + query_gt_topology, + query_engine_cycles, +}; + +int xe_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file) +{ + struct xe_device *xe = to_xe_device(dev); + struct drm_xe_device_query *query = data; + u32 idx; + + if (XE_IOCTL_DBG(xe, query->extensions) || + XE_IOCTL_DBG(xe, query->reserved[0] || query->reserved[1])) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, query->query >= ARRAY_SIZE(xe_query_funcs))) + return -EINVAL; + + idx = array_index_nospec(query->query, 
ARRAY_SIZE(xe_query_funcs)); + if (XE_IOCTL_DBG(xe, !xe_query_funcs[idx])) + return -EINVAL; + + return xe_query_funcs[idx](xe, query); +} diff --git a/drivers/gpu/drm/xe/xe_query.h b/drivers/gpu/drm/xe/xe_query.h new file mode 100644 index 000000000000..beeb7a8192b4 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_query.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_QUERY_H_ +#define _XE_QUERY_H_ + +struct drm_device; +struct drm_file; + +int xe_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file); + +#endif diff --git a/drivers/gpu/drm/xe/xe_range_fence.c b/drivers/gpu/drm/xe/xe_range_fence.c new file mode 100644 index 000000000000..d35d9ec58e86 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_range_fence.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include <linux/dma-fence.h> +#include <linux/interval_tree_generic.h> +#include <linux/slab.h> + +#include "xe_macros.h" +#include "xe_range_fence.h" + +#define XE_RANGE_TREE_START(_node) ((_node)->start) +#define XE_RANGE_TREE_LAST(_node) ((_node)->last) + +INTERVAL_TREE_DEFINE(struct xe_range_fence, rb, u64, __subtree_last, + XE_RANGE_TREE_START, XE_RANGE_TREE_LAST, static, + xe_range_fence_tree); + +static void +xe_range_fence_signal_notify(struct dma_fence *fence, struct dma_fence_cb *cb) +{ + struct xe_range_fence *rfence = container_of(cb, typeof(*rfence), cb); + struct xe_range_fence_tree *tree = rfence->tree; + + llist_add(&rfence->link, &tree->list); +} + +static bool __xe_range_fence_tree_cleanup(struct xe_range_fence_tree *tree) +{ + struct llist_node *node = llist_del_all(&tree->list); + struct xe_range_fence *rfence, *next; + + llist_for_each_entry_safe(rfence, next, node, link) { + xe_range_fence_tree_remove(rfence, &tree->root); + dma_fence_put(rfence->fence); + kfree(rfence); + } + + return !!node; +} + +/** + * xe_range_fence_insert() - range fence insert + * @tree: range fence tree to insert intoi + * @rfence: range fence + * @ops: range fence ops + * @start: start address of range fence + * @last: last address of range fence + * @fence: dma fence which signals range fence can be removed + freed + * + * Return: 0 on success, non-zero on failure + */ +int xe_range_fence_insert(struct xe_range_fence_tree *tree, + struct xe_range_fence *rfence, + const struct xe_range_fence_ops *ops, + u64 start, u64 last, struct dma_fence *fence) +{ + int err = 0; + + __xe_range_fence_tree_cleanup(tree); + + if (dma_fence_is_signaled(fence)) + goto free; + + rfence->ops = ops; + rfence->start = start; + rfence->last = last; + rfence->tree = tree; + rfence->fence = dma_fence_get(fence); + err = dma_fence_add_callback(fence, &rfence->cb, + xe_range_fence_signal_notify); + if (err == -ENOENT) { + dma_fence_put(fence); + err = 0; + goto free; + } else if (err == 0) { + xe_range_fence_tree_insert(rfence, &tree->root); + return 0; + } + +free: + if (ops->free) + ops->free(rfence); + + return err; +} + +static void xe_range_fence_tree_remove_all(struct xe_range_fence_tree *tree) +{ + struct xe_range_fence *rfence; + bool retry = true; + + rfence = xe_range_fence_tree_iter_first(&tree->root, 0, U64_MAX); + while (rfence) { + /* Should be ok with the minimalistic callback */ + if (dma_fence_remove_callback(rfence->fence, &rfence->cb)) + llist_add(&rfence->link, &tree->list); + rfence = xe_range_fence_tree_iter_next(rfence, 0, U64_MAX); + } + + while (retry) + retry = __xe_range_fence_tree_cleanup(tree); +} + +/** + * 
xe_range_fence_tree_init() - Init range fence tree + * @tree: range fence tree + */ +void xe_range_fence_tree_init(struct xe_range_fence_tree *tree) +{ + memset(tree, 0, sizeof(*tree)); +} + +/** + * xe_range_fence_tree_fini() - Fini range fence tree + * @tree: range fence tree + */ +void xe_range_fence_tree_fini(struct xe_range_fence_tree *tree) +{ + xe_range_fence_tree_remove_all(tree); + XE_WARN_ON(!RB_EMPTY_ROOT(&tree->root.rb_root)); +} + +/** + * xe_range_fence_tree_first() - range fence tree iterator first + * @tree: range fence tree + * @start: start address of range fence + * @last: last address of range fence + * + * Return: first range fence found in range or NULL + */ +struct xe_range_fence * +xe_range_fence_tree_first(struct xe_range_fence_tree *tree, u64 start, + u64 last) +{ + return xe_range_fence_tree_iter_first(&tree->root, start, last); +} + +/** + * xe_range_fence_tree_next() - range fence tree iterator next + * @rfence: current range fence + * @start: start address of range fence + * @last: last address of range fence + * + * Return: next range fence found in range or NULL + */ +struct xe_range_fence * +xe_range_fence_tree_next(struct xe_range_fence *rfence, u64 start, u64 last) +{ + return xe_range_fence_tree_iter_next(rfence, start, last); +} + +const struct xe_range_fence_ops xe_range_fence_kfree_ops = { + .free = (void (*)(struct xe_range_fence *rfence)) kfree, +}; diff --git a/drivers/gpu/drm/xe/xe_range_fence.h b/drivers/gpu/drm/xe/xe_range_fence.h new file mode 100644 index 000000000000..edd58b34f5c0 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_range_fence.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_RANGE_FENCE_H_ +#define _XE_RANGE_FENCE_H_ + +#include <linux/dma-fence.h> +#include <linux/rbtree.h> +#include <linux/types.h> + +struct xe_range_fence_tree; +struct xe_range_fence; + +/** struct xe_range_fence_ops - XE range fence ops */ +struct xe_range_fence_ops { + /** @free: free range fence op */ + void (*free)(struct xe_range_fence *rfence); +}; + +/** struct xe_range_fence - XE range fence (address conflict tracking) */ +struct xe_range_fence { + /** @rb: RB tree node inserted into interval tree */ + struct rb_node rb; + /** @start: start address of range fence is interval tree */ + u64 start; + /** @last: last address (inclusive) of range fence is interval tree */ + u64 last; + /** @__subtree_last: interval tree internal usage */ + u64 __subtree_last; + /** + * @fence: fence signals address in range fence no longer has conflict + */ + struct dma_fence *fence; + /** @tree: interval tree which range fence belongs to */ + struct xe_range_fence_tree *tree; + /** + * @cb: callback when fence signals to remove range fence free from interval tree + */ + struct dma_fence_cb cb; + /** @link: used to defer free of range fence to non-irq context */ + struct llist_node link; + /** @ops: range fence ops */ + const struct xe_range_fence_ops *ops; +}; + +/** struct xe_range_fence_tree - interval tree to store range fences */ +struct xe_range_fence_tree { + /** @root: interval tree root */ + struct rb_root_cached root; + /** @list: list of pending range fences to be freed */ + struct llist_head list; +}; + +extern const struct xe_range_fence_ops xe_range_fence_kfree_ops; + +struct xe_range_fence * +xe_range_fence_tree_first(struct xe_range_fence_tree *tree, u64 start, + u64 last); + +struct xe_range_fence * +xe_range_fence_tree_next(struct xe_range_fence *rfence, u64 start, u64 last); + +void 
xe_range_fence_tree_init(struct xe_range_fence_tree *tree); + +void xe_range_fence_tree_fini(struct xe_range_fence_tree *tree); + +int xe_range_fence_insert(struct xe_range_fence_tree *tree, + struct xe_range_fence *rfence, + const struct xe_range_fence_ops *ops, + u64 start, u64 end, + struct dma_fence *fence); + +#endif diff --git a/drivers/gpu/drm/xe/xe_reg_sr.c b/drivers/gpu/drm/xe/xe_reg_sr.c new file mode 100644 index 000000000000..87adefb56024 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_reg_sr.c @@ -0,0 +1,284 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_reg_sr.h" + +#include <kunit/visibility.h> +#include <linux/align.h> +#include <linux/string_helpers.h> +#include <linux/xarray.h> + +#include <drm/drm_managed.h> +#include <drm/drm_print.h> + +#include "regs/xe_engine_regs.h" +#include "regs/xe_gt_regs.h" +#include "xe_device_types.h" +#include "xe_force_wake.h" +#include "xe_gt.h" +#include "xe_gt_mcr.h" +#include "xe_gt_printk.h" +#include "xe_hw_engine_types.h" +#include "xe_macros.h" +#include "xe_mmio.h" +#include "xe_reg_whitelist.h" +#include "xe_rtp_types.h" + +#define XE_REG_SR_GROW_STEP_DEFAULT 16 + +static void reg_sr_fini(struct drm_device *drm, void *arg) +{ + struct xe_reg_sr *sr = arg; + + xa_destroy(&sr->xa); + kfree(sr->pool.arr); + memset(&sr->pool, 0, sizeof(sr->pool)); +} + +int xe_reg_sr_init(struct xe_reg_sr *sr, const char *name, struct xe_device *xe) +{ + xa_init(&sr->xa); + memset(&sr->pool, 0, sizeof(sr->pool)); + sr->pool.grow_step = XE_REG_SR_GROW_STEP_DEFAULT; + sr->name = name; + + return drmm_add_action_or_reset(&xe->drm, reg_sr_fini, sr); +} +EXPORT_SYMBOL_IF_KUNIT(xe_reg_sr_init); + +static struct xe_reg_sr_entry *alloc_entry(struct xe_reg_sr *sr) +{ + if (sr->pool.used == sr->pool.allocated) { + struct xe_reg_sr_entry *arr; + + arr = krealloc_array(sr->pool.arr, + ALIGN(sr->pool.allocated + 1, sr->pool.grow_step), + sizeof(*arr), GFP_KERNEL); + if (!arr) + return NULL; + + sr->pool.arr = arr; + sr->pool.allocated += sr->pool.grow_step; + } + + return &sr->pool.arr[sr->pool.used++]; +} + +static bool compatible_entries(const struct xe_reg_sr_entry *e1, + const struct xe_reg_sr_entry *e2) +{ + /* + * Don't allow overwriting values: clr_bits/set_bits should be disjoint + * when operating in the same register + */ + if (e1->clr_bits & e2->clr_bits || e1->set_bits & e2->set_bits || + e1->clr_bits & e2->set_bits || e1->set_bits & e2->clr_bits) + return false; + + if (e1->reg.raw != e2->reg.raw) + return false; + + return true; +} + +static void reg_sr_inc_error(struct xe_reg_sr *sr) +{ +#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) + sr->errors++; +#endif +} + +int xe_reg_sr_add(struct xe_reg_sr *sr, + const struct xe_reg_sr_entry *e, + struct xe_gt *gt) +{ + unsigned long idx = e->reg.addr; + struct xe_reg_sr_entry *pentry = xa_load(&sr->xa, idx); + int ret; + + if (pentry) { + if (!compatible_entries(pentry, e)) { + ret = -EINVAL; + goto fail; + } + + pentry->clr_bits |= e->clr_bits; + pentry->set_bits |= e->set_bits; + pentry->read_mask |= e->read_mask; + + return 0; + } + + pentry = alloc_entry(sr); + if (!pentry) { + ret = -ENOMEM; + goto fail; + } + + *pentry = *e; + ret = xa_err(xa_store(&sr->xa, idx, pentry, GFP_KERNEL)); + if (ret) + goto fail; + + return 0; + +fail: + xe_gt_err(gt, + "discarding save-restore reg %04lx (clear: %08x, set: %08x, masked: %s, mcr: %s): ret=%d\n", + idx, e->clr_bits, e->set_bits, + str_yes_no(e->reg.masked), + str_yes_no(e->reg.mcr), + ret); + reg_sr_inc_error(sr); + + 
return ret; +} + +/* + * Convert back from encoded value to type-safe, only to be used when reg.mcr + * is true + */ +static struct xe_reg_mcr to_xe_reg_mcr(const struct xe_reg reg) +{ + return (const struct xe_reg_mcr){.__reg.raw = reg.raw }; +} + +static void apply_one_mmio(struct xe_gt *gt, struct xe_reg_sr_entry *entry) +{ + struct xe_reg reg = entry->reg; + struct xe_reg_mcr reg_mcr = to_xe_reg_mcr(reg); + u32 val; + + /* + * If this is a masked register, need to set the upper 16 bits. + * Set them to clr_bits since that is always a superset of the bits + * being modified. + * + * When it's not masked, we have to read it from hardware, unless we are + * supposed to set all bits. + */ + if (reg.masked) + val = entry->clr_bits << 16; + else if (entry->clr_bits + 1) + val = (reg.mcr ? + xe_gt_mcr_unicast_read_any(gt, reg_mcr) : + xe_mmio_read32(gt, reg)) & (~entry->clr_bits); + else + val = 0; + + /* + * TODO: add selftest to validate all tables, regardless of platform: + * - Masked registers can't have set_bits with upper bits set + * - set_bits must be contained in clr_bits + */ + val |= entry->set_bits; + + xe_gt_dbg(gt, "REG[0x%x] = 0x%08x", reg.addr, val); + + if (entry->reg.mcr) + xe_gt_mcr_multicast_write(gt, reg_mcr, val); + else + xe_mmio_write32(gt, reg, val); +} + +void xe_reg_sr_apply_mmio(struct xe_reg_sr *sr, struct xe_gt *gt) +{ + struct xe_reg_sr_entry *entry; + unsigned long reg; + int err; + + if (xa_empty(&sr->xa)) + return; + + xe_gt_dbg(gt, "Applying %s save-restore MMIOs\n", sr->name); + + err = xe_force_wake_get(>->mmio.fw, XE_FORCEWAKE_ALL); + if (err) + goto err_force_wake; + + xa_for_each(&sr->xa, reg, entry) + apply_one_mmio(gt, entry); + + err = xe_force_wake_put(>->mmio.fw, XE_FORCEWAKE_ALL); + XE_WARN_ON(err); + + return; + +err_force_wake: + xe_gt_err(gt, "Failed to apply, err=%d\n", err); +} + +void xe_reg_sr_apply_whitelist(struct xe_hw_engine *hwe) +{ + struct xe_reg_sr *sr = &hwe->reg_whitelist; + struct xe_gt *gt = hwe->gt; + struct xe_device *xe = gt_to_xe(gt); + struct xe_reg_sr_entry *entry; + struct drm_printer p; + u32 mmio_base = hwe->mmio_base; + unsigned long reg; + unsigned int slot = 0; + int err; + + if (xa_empty(&sr->xa)) + return; + + drm_dbg(&xe->drm, "Whitelisting %s registers\n", sr->name); + + err = xe_force_wake_get(>->mmio.fw, XE_FORCEWAKE_ALL); + if (err) + goto err_force_wake; + + p = drm_debug_printer(KBUILD_MODNAME); + xa_for_each(&sr->xa, reg, entry) { + if (slot == RING_MAX_NONPRIV_SLOTS) { + xe_gt_err(gt, + "hwe %s: maximum register whitelist slots (%d) reached, refusing to add more\n", + hwe->name, RING_MAX_NONPRIV_SLOTS); + break; + } + + xe_reg_whitelist_print_entry(&p, 0, reg, entry); + xe_mmio_write32(gt, RING_FORCE_TO_NONPRIV(mmio_base, slot), + reg | entry->set_bits); + slot++; + } + + /* And clear the rest just in case of garbage */ + for (; slot < RING_MAX_NONPRIV_SLOTS; slot++) { + u32 addr = RING_NOPID(mmio_base).addr; + + xe_mmio_write32(gt, RING_FORCE_TO_NONPRIV(mmio_base, slot), addr); + } + + err = xe_force_wake_put(>->mmio.fw, XE_FORCEWAKE_ALL); + XE_WARN_ON(err); + + return; + +err_force_wake: + drm_err(&xe->drm, "Failed to apply, err=%d\n", err); +} + +/** + * xe_reg_sr_dump - print all save/restore entries + * @sr: Save/restore entries + * @p: DRM printer + */ +void xe_reg_sr_dump(struct xe_reg_sr *sr, struct drm_printer *p) +{ + struct xe_reg_sr_entry *entry; + unsigned long reg; + + if (!sr->name || xa_empty(&sr->xa)) + return; + + drm_printf(p, "%s\n", sr->name); + xa_for_each(&sr->xa, reg, entry) + 
drm_printf(p, "\tREG[0x%lx] clr=0x%08x set=0x%08x masked=%s mcr=%s\n", + reg, entry->clr_bits, entry->set_bits, + str_yes_no(entry->reg.masked), + str_yes_no(entry->reg.mcr)); +} diff --git a/drivers/gpu/drm/xe/xe_reg_sr.h b/drivers/gpu/drm/xe/xe_reg_sr.h new file mode 100644 index 000000000000..e3197c33afe2 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_reg_sr.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_REG_SR_ +#define _XE_REG_SR_ + +#include "xe_reg_sr_types.h" + +/* + * Reg save/restore bookkeeping + */ + +struct xe_device; +struct xe_gt; +struct xe_hw_engine; +struct drm_printer; + +int xe_reg_sr_init(struct xe_reg_sr *sr, const char *name, struct xe_device *xe); +void xe_reg_sr_dump(struct xe_reg_sr *sr, struct drm_printer *p); + +int xe_reg_sr_add(struct xe_reg_sr *sr, const struct xe_reg_sr_entry *e, + struct xe_gt *gt); +void xe_reg_sr_apply_mmio(struct xe_reg_sr *sr, struct xe_gt *gt); +void xe_reg_sr_apply_whitelist(struct xe_hw_engine *hwe); + +#endif diff --git a/drivers/gpu/drm/xe/xe_reg_sr_types.h b/drivers/gpu/drm/xe/xe_reg_sr_types.h new file mode 100644 index 000000000000..ad48a52b824a --- /dev/null +++ b/drivers/gpu/drm/xe/xe_reg_sr_types.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_REG_SR_TYPES_ +#define _XE_REG_SR_TYPES_ + +#include <linux/types.h> +#include <linux/xarray.h> + +#include "regs/xe_reg_defs.h" + +struct xe_reg_sr_entry { + struct xe_reg reg; + u32 clr_bits; + u32 set_bits; + /* Mask for bits to consider when reading value back */ + u32 read_mask; +}; + +struct xe_reg_sr { + struct { + struct xe_reg_sr_entry *arr; + unsigned int used; + unsigned int allocated; + unsigned int grow_step; + } pool; + struct xarray xa; + const char *name; + +#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) + unsigned int errors; +#endif +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_reg_whitelist.c b/drivers/gpu/drm/xe/xe_reg_whitelist.c new file mode 100644 index 000000000000..e66ae1bdaf9c --- /dev/null +++ b/drivers/gpu/drm/xe/xe_reg_whitelist.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include "xe_reg_whitelist.h" + +#include "regs/xe_engine_regs.h" +#include "regs/xe_gt_regs.h" +#include "xe_gt_types.h" +#include "xe_platform_types.h" +#include "xe_rtp.h" + +#undef XE_REG_MCR +#define XE_REG_MCR(...) 
XE_REG(__VA_ARGS__, .mcr = 1) + +static bool match_not_render(const struct xe_gt *gt, + const struct xe_hw_engine *hwe) +{ + return hwe->class != XE_ENGINE_CLASS_RENDER; +} + +static const struct xe_rtp_entry_sr register_whitelist[] = { + { XE_RTP_NAME("WaAllowPMDepthAndInvocationCountAccessFromUMD, 1408556865"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1210), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(WHITELIST(PS_INVOCATION_COUNT, + RING_FORCE_TO_NONPRIV_ACCESS_RD | + RING_FORCE_TO_NONPRIV_RANGE_4)) + }, + { XE_RTP_NAME("1508744258, 14012131227, 1808121037"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1210), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(WHITELIST(COMMON_SLICE_CHICKEN1, 0)) + }, + { XE_RTP_NAME("1806527549"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1210), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(WHITELIST(HIZ_CHICKEN, 0)) + }, + { XE_RTP_NAME("allow_read_ctx_timestamp"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1260), FUNC(match_not_render)), + XE_RTP_ACTIONS(WHITELIST(RING_CTX_TIMESTAMP(0), + RING_FORCE_TO_NONPRIV_ACCESS_RD, + XE_RTP_ACTION_FLAG(ENGINE_BASE))) + }, + { XE_RTP_NAME("16014440446"), + XE_RTP_RULES(PLATFORM(PVC)), + XE_RTP_ACTIONS(WHITELIST(XE_REG(0x4400), + RING_FORCE_TO_NONPRIV_DENY | + RING_FORCE_TO_NONPRIV_RANGE_64), + WHITELIST(XE_REG(0x4500), + RING_FORCE_TO_NONPRIV_DENY | + RING_FORCE_TO_NONPRIV_RANGE_64)) + }, + { XE_RTP_NAME("16017236439"), + XE_RTP_RULES(PLATFORM(PVC), ENGINE_CLASS(COPY)), + XE_RTP_ACTIONS(WHITELIST(BCS_SWCTRL(0), + RING_FORCE_TO_NONPRIV_DENY, + XE_RTP_ACTION_FLAG(ENGINE_BASE))) + }, + {} +}; + +/** + * xe_reg_whitelist_process_engine - process table of registers to whitelist + * @hwe: engine instance to process whitelist for + * + * Process wwhitelist table for this platform, saving in @hwe all the + * registers that need to be whitelisted by the hardware so they can be accessed + * by userspace. + */ +void xe_reg_whitelist_process_engine(struct xe_hw_engine *hwe) +{ + struct xe_rtp_process_ctx ctx = XE_RTP_PROCESS_CTX_INITIALIZER(hwe); + + xe_rtp_process_to_sr(&ctx, register_whitelist, &hwe->reg_whitelist); +} + +/** + * xe_reg_whitelist_print_entry - print one whitelist entry + * @p: DRM printer + * @indent: indent level + * @reg: register allowed/denied + * @entry: save-restore entry + * + * Print details about the entry added to allow/deny access + */ +void xe_reg_whitelist_print_entry(struct drm_printer *p, unsigned int indent, + u32 reg, struct xe_reg_sr_entry *entry) +{ + u32 val = entry->set_bits; + const char *access_str = "(invalid)"; + unsigned int range_bit = 2; + u32 range_start, range_end; + bool deny; + + deny = val & RING_FORCE_TO_NONPRIV_DENY; + + switch (val & RING_FORCE_TO_NONPRIV_RANGE_MASK) { + case RING_FORCE_TO_NONPRIV_RANGE_4: + range_bit = 4; + break; + case RING_FORCE_TO_NONPRIV_RANGE_16: + range_bit = 6; + break; + case RING_FORCE_TO_NONPRIV_RANGE_64: + range_bit = 8; + break; + } + + range_start = reg & REG_GENMASK(25, range_bit); + range_end = range_start | REG_GENMASK(range_bit, 0); + + switch (val & RING_FORCE_TO_NONPRIV_ACCESS_MASK) { + case RING_FORCE_TO_NONPRIV_ACCESS_RW: + access_str = "rw"; + break; + case RING_FORCE_TO_NONPRIV_ACCESS_RD: + access_str = "read"; + break; + case RING_FORCE_TO_NONPRIV_ACCESS_WR: + access_str = "write"; + break; + } + + drm_printf_indent(p, indent, "REG[0x%x-0x%x]: %s %s access\n", + range_start, range_end, + deny ? 
"deny" : "allow", + access_str); +} + +/** + * xe_reg_whitelist_dump - print all whitelist entries + * @sr: Save/restore entries + * @p: DRM printer + */ +void xe_reg_whitelist_dump(struct xe_reg_sr *sr, struct drm_printer *p) +{ + struct xe_reg_sr_entry *entry; + unsigned long reg; + + if (!sr->name || xa_empty(&sr->xa)) + return; + + drm_printf(p, "%s\n", sr->name); + xa_for_each(&sr->xa, reg, entry) + xe_reg_whitelist_print_entry(p, 1, reg, entry); +} diff --git a/drivers/gpu/drm/xe/xe_reg_whitelist.h b/drivers/gpu/drm/xe/xe_reg_whitelist.h new file mode 100644 index 000000000000..69b121d377da --- /dev/null +++ b/drivers/gpu/drm/xe/xe_reg_whitelist.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_REG_WHITELIST_ +#define _XE_REG_WHITELIST_ + +#include <linux/types.h> + +struct drm_printer; +struct xe_hw_engine; +struct xe_reg_sr; +struct xe_reg_sr_entry; + +void xe_reg_whitelist_process_engine(struct xe_hw_engine *hwe); + +void xe_reg_whitelist_print_entry(struct drm_printer *p, unsigned int indent, + u32 reg, struct xe_reg_sr_entry *entry); + +void xe_reg_whitelist_dump(struct xe_reg_sr *sr, struct drm_printer *p); + +#endif diff --git a/drivers/gpu/drm/xe/xe_res_cursor.h b/drivers/gpu/drm/xe/xe_res_cursor.h new file mode 100644 index 000000000000..0a306963aa8e --- /dev/null +++ b/drivers/gpu/drm/xe/xe_res_cursor.h @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR MIT */ +/* + * Copyright 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef _XE_RES_CURSOR_H_ +#define _XE_RES_CURSOR_H_ + +#include <linux/scatterlist.h> + +#include <drm/drm_mm.h> +#include <drm/ttm/ttm_placement.h> +#include <drm/ttm/ttm_range_manager.h> +#include <drm/ttm/ttm_resource.h> +#include <drm/ttm/ttm_tt.h> + +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_macros.h" +#include "xe_ttm_vram_mgr.h" + +/* state back for walking over vram_mgr, stolen_mgr, and gtt_mgr allocations */ +struct xe_res_cursor { + u64 start; + u64 size; + u64 remaining; + void *node; + u32 mem_type; + struct scatterlist *sgl; + struct drm_buddy *mm; +}; + +static struct drm_buddy *xe_res_get_buddy(struct ttm_resource *res) +{ + struct ttm_resource_manager *mgr; + + mgr = ttm_manager_type(res->bo->bdev, res->mem_type); + return &to_xe_ttm_vram_mgr(mgr)->mm; +} + +/** + * xe_res_first - initialize a xe_res_cursor + * + * @res: TTM resource object to walk + * @start: Start of the range + * @size: Size of the range + * @cur: cursor object to initialize + * + * Start walking over the range of allocations between @start and @size. + */ +static inline void xe_res_first(struct ttm_resource *res, + u64 start, u64 size, + struct xe_res_cursor *cur) +{ + cur->sgl = NULL; + if (!res) + goto fallback; + + XE_WARN_ON(start + size > res->size); + + cur->mem_type = res->mem_type; + + switch (cur->mem_type) { + case XE_PL_STOLEN: + case XE_PL_VRAM0: + case XE_PL_VRAM1: { + struct drm_buddy_block *block; + struct list_head *head, *next; + struct drm_buddy *mm = xe_res_get_buddy(res); + + head = &to_xe_ttm_vram_mgr_resource(res)->blocks; + + block = list_first_entry_or_null(head, + struct drm_buddy_block, + link); + if (!block) + goto fallback; + + while (start >= drm_buddy_block_size(mm, block)) { + start -= drm_buddy_block_size(mm, block); + + next = block->link.next; + if (next != head) + block = list_entry(next, struct drm_buddy_block, + link); + } + + cur->mm = mm; + cur->start = drm_buddy_block_offset(block) + start; + cur->size = min(drm_buddy_block_size(mm, block) - start, + size); + cur->remaining = size; + cur->node = block; + break; + } + default: + goto fallback; + } + + return; + +fallback: + cur->start = start; + cur->size = size; + cur->remaining = size; + cur->node = NULL; + cur->mem_type = XE_PL_TT; + XE_WARN_ON(res && start + size > res->size); +} + +static inline void __xe_res_sg_next(struct xe_res_cursor *cur) +{ + struct scatterlist *sgl = cur->sgl; + u64 start = cur->start; + + while (start >= sg_dma_len(sgl)) { + start -= sg_dma_len(sgl); + sgl = sg_next(sgl); + XE_WARN_ON(!sgl); + } + + cur->start = start; + cur->size = sg_dma_len(sgl) - start; + cur->sgl = sgl; +} + +/** + * xe_res_first_sg - initialize a xe_res_cursor with a scatter gather table + * + * @sg: scatter gather table to walk + * @start: Start of the range + * @size: Size of the range + * @cur: cursor object to initialize + * + * Start walking over the range of allocations between @start and @size. + */ +static inline void xe_res_first_sg(const struct sg_table *sg, + u64 start, u64 size, + struct xe_res_cursor *cur) +{ + XE_WARN_ON(!sg); + XE_WARN_ON(!IS_ALIGNED(start, PAGE_SIZE) || + !IS_ALIGNED(size, PAGE_SIZE)); + cur->node = NULL; + cur->start = start; + cur->remaining = size; + cur->size = 0; + cur->sgl = sg->sgl; + cur->mem_type = XE_PL_TT; + __xe_res_sg_next(cur); +} + +/** + * xe_res_next - advance the cursor + * + * @cur: the cursor to advance + * @size: number of bytes to move forward + * + * Move the cursor @size bytes forwrad, walking to the next node if necessary. 
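+ *
+ * A typical iteration over a TTM resource looks roughly like the
+ * following (the res, size and consume_chunk() used here are
+ * illustrative only):
+ *
+ *	struct xe_res_cursor cur;
+ *
+ *	for (xe_res_first(res, 0, size, &cur); cur.remaining;
+ *	     xe_res_next(&cur, cur.size))
+ *		consume_chunk(xe_res_dma(&cur), cur.size);
+ *
+ * i.e. each step hands out one physically contiguous chunk and
+ * xe_res_next() advances to the next buddy block or scatterlist entry.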
+ */ +static inline void xe_res_next(struct xe_res_cursor *cur, u64 size) +{ + struct drm_buddy_block *block; + struct list_head *next; + u64 start; + + XE_WARN_ON(size > cur->remaining); + + cur->remaining -= size; + if (!cur->remaining) + return; + + if (cur->size > size) { + cur->size -= size; + cur->start += size; + return; + } + + if (cur->sgl) { + cur->start += size; + __xe_res_sg_next(cur); + return; + } + + switch (cur->mem_type) { + case XE_PL_STOLEN: + case XE_PL_VRAM0: + case XE_PL_VRAM1: + start = size - cur->size; + block = cur->node; + + next = block->link.next; + block = list_entry(next, struct drm_buddy_block, link); + + + while (start >= drm_buddy_block_size(cur->mm, block)) { + start -= drm_buddy_block_size(cur->mm, block); + + next = block->link.next; + block = list_entry(next, struct drm_buddy_block, link); + } + + cur->start = drm_buddy_block_offset(block) + start; + cur->size = min(drm_buddy_block_size(cur->mm, block) - start, + cur->remaining); + cur->node = block; + break; + default: + return; + } +} + +/** + * xe_res_dma - return dma address of cursor at current position + * + * @cur: the cursor to return the dma address from + */ +static inline u64 xe_res_dma(const struct xe_res_cursor *cur) +{ + return cur->sgl ? sg_dma_address(cur->sgl) + cur->start : cur->start; +} +#endif diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c new file mode 100644 index 000000000000..1e4c06eacd98 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_ring_ops.c @@ -0,0 +1,482 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_ring_ops.h" + +#include "generated/xe_wa_oob.h" +#include "instructions/xe_mi_commands.h" +#include "regs/xe_engine_regs.h" +#include "regs/xe_gpu_commands.h" +#include "regs/xe_gt_regs.h" +#include "regs/xe_lrc_layout.h" +#include "xe_exec_queue_types.h" +#include "xe_gt.h" +#include "xe_lrc.h" +#include "xe_macros.h" +#include "xe_sched_job.h" +#include "xe_vm_types.h" +#include "xe_vm.h" +#include "xe_wa.h" + +/* + * 3D-related flags that can't be set on _engines_ that lack access to the 3D + * pipeline (i.e., CCS engines). 
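+ *
+ * These bits are masked out again in emit_pipe_invalidate() and
+ * emit_render_cache_flush() below when the submitting engine or
+ * platform cannot accept them.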
+ */ +#define PIPE_CONTROL_3D_ENGINE_FLAGS (\ + PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | \ + PIPE_CONTROL_DEPTH_CACHE_FLUSH | \ + PIPE_CONTROL_TILE_CACHE_FLUSH | \ + PIPE_CONTROL_DEPTH_STALL | \ + PIPE_CONTROL_STALL_AT_SCOREBOARD | \ + PIPE_CONTROL_PSD_SYNC | \ + PIPE_CONTROL_AMFS_FLUSH | \ + PIPE_CONTROL_VF_CACHE_INVALIDATE | \ + PIPE_CONTROL_GLOBAL_SNAPSHOT_RESET) + +/* 3D-related flags that can't be set on _platforms_ that lack a 3D pipeline */ +#define PIPE_CONTROL_3D_ARCH_FLAGS ( \ + PIPE_CONTROL_3D_ENGINE_FLAGS | \ + PIPE_CONTROL_INDIRECT_STATE_DISABLE | \ + PIPE_CONTROL_FLUSH_ENABLE | \ + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | \ + PIPE_CONTROL_DC_FLUSH_ENABLE) + +static u32 preparser_disable(bool state) +{ + return MI_ARB_CHECK | BIT(8) | state; +} + +static int emit_aux_table_inv(struct xe_gt *gt, struct xe_reg reg, + u32 *dw, int i) +{ + dw[i++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1) | MI_LRI_MMIO_REMAP_EN; + dw[i++] = reg.addr + gt->mmio.adj_offset; + dw[i++] = AUX_INV; + dw[i++] = MI_NOOP; + + return i; +} + +static int emit_user_interrupt(u32 *dw, int i) +{ + dw[i++] = MI_USER_INTERRUPT; + dw[i++] = MI_ARB_ON_OFF | MI_ARB_ENABLE; + dw[i++] = MI_ARB_CHECK; + + return i; +} + +static int emit_store_imm_ggtt(u32 addr, u32 value, u32 *dw, int i) +{ + dw[i++] = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1); + dw[i++] = addr; + dw[i++] = 0; + dw[i++] = value; + + return i; +} + +static int emit_flush_imm_ggtt(u32 addr, u32 value, bool invalidate_tlb, + u32 *dw, int i) +{ + dw[i++] = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_IMM_DW | + (invalidate_tlb ? MI_INVALIDATE_TLB : 0); + dw[i++] = addr | MI_FLUSH_DW_USE_GTT; + dw[i++] = 0; + dw[i++] = value; + + return i; +} + +static int emit_bb_start(u64 batch_addr, u32 ppgtt_flag, u32 *dw, int i) +{ + dw[i++] = MI_BATCH_BUFFER_START | ppgtt_flag | XE_INSTR_NUM_DW(3); + dw[i++] = lower_32_bits(batch_addr); + dw[i++] = upper_32_bits(batch_addr); + + return i; +} + +static int emit_flush_invalidate(u32 flag, u32 *dw, int i) +{ + dw[i] = MI_FLUSH_DW; + dw[i] |= flag; + dw[i++] |= MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_IMM_DW | + MI_FLUSH_DW_STORE_INDEX; + + dw[i++] = LRC_PPHWSP_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT; + dw[i++] = 0; + dw[i++] = ~0U; + + return i; +} + +static int emit_pipe_invalidate(u32 mask_flags, bool invalidate_tlb, u32 *dw, + int i) +{ + u32 flags = PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_COMMAND_CACHE_INVALIDATE | + PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE | + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_VF_CACHE_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE | + PIPE_CONTROL_STATE_CACHE_INVALIDATE | + PIPE_CONTROL_QW_WRITE | + PIPE_CONTROL_STORE_DATA_INDEX; + + if (invalidate_tlb) + flags |= PIPE_CONTROL_TLB_INVALIDATE; + + flags &= ~mask_flags; + + dw[i++] = GFX_OP_PIPE_CONTROL(6); + dw[i++] = flags; + dw[i++] = LRC_PPHWSP_SCRATCH_ADDR; + dw[i++] = 0; + dw[i++] = 0; + dw[i++] = 0; + + return i; +} + +static int emit_store_imm_ppgtt_posted(u64 addr, u64 value, + u32 *dw, int i) +{ + dw[i++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(1); + dw[i++] = lower_32_bits(addr); + dw[i++] = upper_32_bits(addr); + dw[i++] = lower_32_bits(value); + dw[i++] = upper_32_bits(value); + + return i; +} + +static int emit_render_cache_flush(struct xe_sched_job *job, u32 *dw, int i) +{ + struct xe_gt *gt = job->q->gt; + bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK); + u32 flags; + + flags = (PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_TILE_CACHE_FLUSH | + 
PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_DC_FLUSH_ENABLE | + PIPE_CONTROL_FLUSH_ENABLE); + + if (XE_WA(gt, 1409600907)) + flags |= PIPE_CONTROL_DEPTH_STALL; + + if (lacks_render) + flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS; + else if (job->q->class == XE_ENGINE_CLASS_COMPUTE) + flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS; + + dw[i++] = GFX_OP_PIPE_CONTROL(6) | PIPE_CONTROL0_HDC_PIPELINE_FLUSH; + dw[i++] = flags; + dw[i++] = 0; + dw[i++] = 0; + dw[i++] = 0; + dw[i++] = 0; + + return i; +} + +static int emit_pipe_control_to_ring_end(struct xe_hw_engine *hwe, u32 *dw, int i) +{ + if (hwe->class != XE_ENGINE_CLASS_RENDER) + return i; + + if (XE_WA(hwe->gt, 16020292621)) { + dw[i++] = GFX_OP_PIPE_CONTROL(6); + dw[i++] = PIPE_CONTROL_LRI_POST_SYNC; + dw[i++] = RING_NOPID(hwe->mmio_base).addr; + dw[i++] = 0; + dw[i++] = 0; + dw[i++] = 0; + } + + return i; +} + +static int emit_pipe_imm_ggtt(u32 addr, u32 value, bool stall_only, u32 *dw, + int i) +{ + dw[i++] = GFX_OP_PIPE_CONTROL(6); + dw[i++] = (stall_only ? PIPE_CONTROL_CS_STALL : + PIPE_CONTROL_FLUSH_ENABLE | PIPE_CONTROL_CS_STALL) | + PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_QW_WRITE; + dw[i++] = addr; + dw[i++] = 0; + dw[i++] = value; + dw[i++] = 0; /* We're thrashing one extra dword. */ + + return i; +} + +static u32 get_ppgtt_flag(struct xe_sched_job *job) +{ + return job->q->vm ? BIT(8) : 0; +} + +/* for engines that don't require any special HW handling (no EUs, no aux inval, etc) */ +static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc, + u64 batch_addr, u32 seqno) +{ + u32 dw[MAX_JOB_SIZE_DW], i = 0; + u32 ppgtt_flag = get_ppgtt_flag(job); + struct xe_vm *vm = job->q->vm; + struct xe_gt *gt = job->q->gt; + + if (vm && vm->batch_invalidate_tlb) { + dw[i++] = preparser_disable(true); + i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), + seqno, true, dw, i); + dw[i++] = preparser_disable(false); + } else { + i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), + seqno, dw, i); + } + + i = emit_bb_start(batch_addr, ppgtt_flag, dw, i); + + if (job->user_fence.used) + i = emit_store_imm_ppgtt_posted(job->user_fence.addr, + job->user_fence.value, + dw, i); + + i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, false, dw, i); + + i = emit_user_interrupt(dw, i); + + xe_gt_assert(gt, i <= MAX_JOB_SIZE_DW); + + xe_lrc_write_ring(lrc, dw, i * sizeof(*dw)); +} + +static bool has_aux_ccs(struct xe_device *xe) +{ + /* + * PVC is a special case that has no compression of either type + * (FlatCCS or AuxCCS). Also, AuxCCS is no longer used from Xe2 + * onward, so any future platforms with no FlatCCS will not have + * AuxCCS either. 
+ */ + if (GRAPHICS_VER(xe) >= 20 || xe->info.platform == XE_PVC) + return false; + + return !xe->info.has_flat_ccs; +} + +static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc, + u64 batch_addr, u32 seqno) +{ + u32 dw[MAX_JOB_SIZE_DW], i = 0; + u32 ppgtt_flag = get_ppgtt_flag(job); + struct xe_gt *gt = job->q->gt; + struct xe_device *xe = gt_to_xe(gt); + bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE; + struct xe_vm *vm = job->q->vm; + + dw[i++] = preparser_disable(true); + + /* hsdes: 1809175790 */ + if (has_aux_ccs(xe)) { + if (decode) + i = emit_aux_table_inv(gt, VD0_AUX_INV, dw, i); + else + i = emit_aux_table_inv(gt, VE0_AUX_INV, dw, i); + } + + if (vm && vm->batch_invalidate_tlb) + i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), + seqno, true, dw, i); + + dw[i++] = preparser_disable(false); + + if (!vm || !vm->batch_invalidate_tlb) + i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), + seqno, dw, i); + + i = emit_bb_start(batch_addr, ppgtt_flag, dw, i); + + if (job->user_fence.used) + i = emit_store_imm_ppgtt_posted(job->user_fence.addr, + job->user_fence.value, + dw, i); + + i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, false, dw, i); + + i = emit_user_interrupt(dw, i); + + xe_gt_assert(gt, i <= MAX_JOB_SIZE_DW); + + xe_lrc_write_ring(lrc, dw, i * sizeof(*dw)); +} + +static void __emit_job_gen12_render_compute(struct xe_sched_job *job, + struct xe_lrc *lrc, + u64 batch_addr, u32 seqno) +{ + u32 dw[MAX_JOB_SIZE_DW], i = 0; + u32 ppgtt_flag = get_ppgtt_flag(job); + struct xe_gt *gt = job->q->gt; + struct xe_device *xe = gt_to_xe(gt); + bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK); + struct xe_vm *vm = job->q->vm; + u32 mask_flags = 0; + + dw[i++] = preparser_disable(true); + if (lacks_render) + mask_flags = PIPE_CONTROL_3D_ARCH_FLAGS; + else if (job->q->class == XE_ENGINE_CLASS_COMPUTE) + mask_flags = PIPE_CONTROL_3D_ENGINE_FLAGS; + + /* See __xe_pt_bind_vma() for a discussion on TLB invalidations. */ + i = emit_pipe_invalidate(mask_flags, vm && vm->batch_invalidate_tlb, dw, i); + + /* hsdes: 1809175790 */ + if (has_aux_ccs(xe)) + i = emit_aux_table_inv(gt, CCS_AUX_INV, dw, i); + + dw[i++] = preparser_disable(false); + + i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), + seqno, dw, i); + + i = emit_bb_start(batch_addr, ppgtt_flag, dw, i); + + i = emit_render_cache_flush(job, dw, i); + + if (job->user_fence.used) + i = emit_store_imm_ppgtt_posted(job->user_fence.addr, + job->user_fence.value, + dw, i); + + i = emit_pipe_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, lacks_render, dw, i); + + i = emit_user_interrupt(dw, i); + + i = emit_pipe_control_to_ring_end(job->q->hwe, dw, i); + + xe_gt_assert(gt, i <= MAX_JOB_SIZE_DW); + + xe_lrc_write_ring(lrc, dw, i * sizeof(*dw)); +} + +static void emit_migration_job_gen12(struct xe_sched_job *job, + struct xe_lrc *lrc, u32 seqno) +{ + u32 dw[MAX_JOB_SIZE_DW], i = 0; + + i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), + seqno, dw, i); + + dw[i++] = MI_ARB_ON_OFF | MI_ARB_DISABLE; /* Enabled again below */ + + i = emit_bb_start(job->batch_addr[0], BIT(8), dw, i); + + /* XXX: Do we need this? Leaving for now. 
*/ + dw[i++] = preparser_disable(true); + i = emit_flush_invalidate(0, dw, i); + dw[i++] = preparser_disable(false); + + i = emit_bb_start(job->batch_addr[1], BIT(8), dw, i); + + dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | job->migrate_flush_flags | + MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_IMM_DW; + dw[i++] = xe_lrc_seqno_ggtt_addr(lrc) | MI_FLUSH_DW_USE_GTT; + dw[i++] = 0; + dw[i++] = seqno; /* value */ + + i = emit_user_interrupt(dw, i); + + xe_gt_assert(job->q->gt, i <= MAX_JOB_SIZE_DW); + + xe_lrc_write_ring(lrc, dw, i * sizeof(*dw)); +} + +static void emit_job_gen12_gsc(struct xe_sched_job *job) +{ + struct xe_gt *gt = job->q->gt; + + xe_gt_assert(gt, job->q->width <= 1); /* no parallel submission for GSCCS */ + + __emit_job_gen12_simple(job, job->q->lrc, + job->batch_addr[0], + xe_sched_job_seqno(job)); +} + +static void emit_job_gen12_copy(struct xe_sched_job *job) +{ + int i; + + if (xe_sched_job_is_migration(job->q)) { + emit_migration_job_gen12(job, job->q->lrc, + xe_sched_job_seqno(job)); + return; + } + + for (i = 0; i < job->q->width; ++i) + __emit_job_gen12_simple(job, job->q->lrc + i, + job->batch_addr[i], + xe_sched_job_seqno(job)); +} + +static void emit_job_gen12_video(struct xe_sched_job *job) +{ + int i; + + /* FIXME: Not doing parallel handshake for now */ + for (i = 0; i < job->q->width; ++i) + __emit_job_gen12_video(job, job->q->lrc + i, + job->batch_addr[i], + xe_sched_job_seqno(job)); +} + +static void emit_job_gen12_render_compute(struct xe_sched_job *job) +{ + int i; + + for (i = 0; i < job->q->width; ++i) + __emit_job_gen12_render_compute(job, job->q->lrc + i, + job->batch_addr[i], + xe_sched_job_seqno(job)); +} + +static const struct xe_ring_ops ring_ops_gen12_gsc = { + .emit_job = emit_job_gen12_gsc, +}; + +static const struct xe_ring_ops ring_ops_gen12_copy = { + .emit_job = emit_job_gen12_copy, +}; + +static const struct xe_ring_ops ring_ops_gen12_video = { + .emit_job = emit_job_gen12_video, +}; + +static const struct xe_ring_ops ring_ops_gen12_render_compute = { + .emit_job = emit_job_gen12_render_compute, +}; + +const struct xe_ring_ops * +xe_ring_ops_get(struct xe_gt *gt, enum xe_engine_class class) +{ + switch (class) { + case XE_ENGINE_CLASS_OTHER: + return &ring_ops_gen12_gsc; + case XE_ENGINE_CLASS_COPY: + return &ring_ops_gen12_copy; + case XE_ENGINE_CLASS_VIDEO_DECODE: + case XE_ENGINE_CLASS_VIDEO_ENHANCE: + return &ring_ops_gen12_video; + case XE_ENGINE_CLASS_RENDER: + case XE_ENGINE_CLASS_COMPUTE: + return &ring_ops_gen12_render_compute; + default: + return NULL; + } +} diff --git a/drivers/gpu/drm/xe/xe_ring_ops.h b/drivers/gpu/drm/xe/xe_ring_ops.h new file mode 100644 index 000000000000..e942735d76a6 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_ring_ops.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_RING_OPS_H_ +#define _XE_RING_OPS_H_ + +#include "xe_hw_engine_types.h" +#include "xe_ring_ops_types.h" + +struct xe_gt; + +const struct xe_ring_ops * +xe_ring_ops_get(struct xe_gt *gt, enum xe_engine_class class); + +#endif diff --git a/drivers/gpu/drm/xe/xe_ring_ops_types.h b/drivers/gpu/drm/xe/xe_ring_ops_types.h new file mode 100644 index 000000000000..1ae56e2ee7b4 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_ring_ops_types.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_RING_OPS_TYPES_H_ +#define _XE_RING_OPS_TYPES_H_ + +struct xe_sched_job; + +#define MAX_JOB_SIZE_DW 48 +#define MAX_JOB_SIZE_BYTES (MAX_JOB_SIZE_DW * 
4) + +/** + * struct xe_ring_ops - Ring operations + */ +struct xe_ring_ops { + /** @emit_job: Write job to ring */ + void (*emit_job)(struct xe_sched_job *job); +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_rtp.c b/drivers/gpu/drm/xe/xe_rtp.c new file mode 100644 index 000000000000..fb44cc7521d8 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_rtp.c @@ -0,0 +1,325 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_rtp.h" + +#include <kunit/visibility.h> + +#include <drm/xe_drm.h> + +#include "xe_gt.h" +#include "xe_gt_topology.h" +#include "xe_macros.h" +#include "xe_reg_sr.h" + +/** + * DOC: Register Table Processing + * + * Internal infrastructure to define how registers should be updated based on + * rules and actions. This can be used to define tables with multiple entries + * (one per register) that will be walked over at some point in time to apply + * the values to the registers that have matching rules. + */ + +static bool has_samedia(const struct xe_device *xe) +{ + return xe->info.media_verx100 >= 1300; +} + +static bool rule_matches(const struct xe_device *xe, + struct xe_gt *gt, + struct xe_hw_engine *hwe, + const struct xe_rtp_rule *rules, + unsigned int n_rules) +{ + const struct xe_rtp_rule *r; + unsigned int i; + bool match; + + for (r = rules, i = 0; i < n_rules; r = &rules[++i]) { + switch (r->match_type) { + case XE_RTP_MATCH_PLATFORM: + match = xe->info.platform == r->platform; + break; + case XE_RTP_MATCH_SUBPLATFORM: + match = xe->info.platform == r->platform && + xe->info.subplatform == r->subplatform; + break; + case XE_RTP_MATCH_GRAPHICS_VERSION: + match = xe->info.graphics_verx100 == r->ver_start && + (!has_samedia(xe) || !xe_gt_is_media_type(gt)); + break; + case XE_RTP_MATCH_GRAPHICS_VERSION_RANGE: + match = xe->info.graphics_verx100 >= r->ver_start && + xe->info.graphics_verx100 <= r->ver_end && + (!has_samedia(xe) || !xe_gt_is_media_type(gt)); + break; + case XE_RTP_MATCH_GRAPHICS_STEP: + match = xe->info.step.graphics >= r->step_start && + xe->info.step.graphics < r->step_end && + (!has_samedia(xe) || !xe_gt_is_media_type(gt)); + break; + case XE_RTP_MATCH_MEDIA_VERSION: + match = xe->info.media_verx100 == r->ver_start && + (!has_samedia(xe) || xe_gt_is_media_type(gt)); + break; + case XE_RTP_MATCH_MEDIA_VERSION_RANGE: + match = xe->info.media_verx100 >= r->ver_start && + xe->info.media_verx100 <= r->ver_end && + (!has_samedia(xe) || xe_gt_is_media_type(gt)); + break; + case XE_RTP_MATCH_MEDIA_STEP: + match = xe->info.step.media >= r->step_start && + xe->info.step.media < r->step_end && + (!has_samedia(xe) || xe_gt_is_media_type(gt)); + break; + case XE_RTP_MATCH_INTEGRATED: + match = !xe->info.is_dgfx; + break; + case XE_RTP_MATCH_DISCRETE: + match = xe->info.is_dgfx; + break; + case XE_RTP_MATCH_ENGINE_CLASS: + if (drm_WARN_ON(&xe->drm, !hwe)) + return false; + + match = hwe->class == r->engine_class; + break; + case XE_RTP_MATCH_NOT_ENGINE_CLASS: + if (drm_WARN_ON(&xe->drm, !hwe)) + return false; + + match = hwe->class != r->engine_class; + break; + case XE_RTP_MATCH_FUNC: + match = r->match_func(gt, hwe); + break; + default: + drm_warn(&xe->drm, "Invalid RTP match %u\n", + r->match_type); + match = false; + } + + if (!match) + return false; + } + + return true; +} + +static void rtp_add_sr_entry(const struct xe_rtp_action *action, + struct xe_gt *gt, + u32 mmio_base, + struct xe_reg_sr *sr) +{ + struct xe_reg_sr_entry sr_entry = { + .reg = action->reg, + .clr_bits = action->clr_bits, + .set_bits = 
action->set_bits, + .read_mask = action->read_mask, + }; + + sr_entry.reg.addr += mmio_base; + xe_reg_sr_add(sr, &sr_entry, gt); +} + +static bool rtp_process_one_sr(const struct xe_rtp_entry_sr *entry, + struct xe_device *xe, struct xe_gt *gt, + struct xe_hw_engine *hwe, struct xe_reg_sr *sr) +{ + const struct xe_rtp_action *action; + u32 mmio_base; + unsigned int i; + + if (!rule_matches(xe, gt, hwe, entry->rules, entry->n_rules)) + return false; + + for (i = 0, action = &entry->actions[0]; i < entry->n_actions; action++, i++) { + if ((entry->flags & XE_RTP_ENTRY_FLAG_FOREACH_ENGINE) || + (action->flags & XE_RTP_ACTION_FLAG_ENGINE_BASE)) + mmio_base = hwe->mmio_base; + else + mmio_base = 0; + + rtp_add_sr_entry(action, gt, mmio_base, sr); + } + + return true; +} + +static void rtp_get_context(struct xe_rtp_process_ctx *ctx, + struct xe_hw_engine **hwe, + struct xe_gt **gt, + struct xe_device **xe) +{ + switch (ctx->type) { + case XE_RTP_PROCESS_TYPE_GT: + *hwe = NULL; + *gt = ctx->gt; + *xe = gt_to_xe(*gt); + break; + case XE_RTP_PROCESS_TYPE_ENGINE: + *hwe = ctx->hwe; + *gt = (*hwe)->gt; + *xe = gt_to_xe(*gt); + break; + }; +} + +/** + * xe_rtp_process_ctx_enable_active_tracking - Enable tracking of active entries + * + * Set additional metadata to track what entries are considered "active", i.e. + * their rules match the condition. Bits are never cleared: entries with + * matching rules set the corresponding bit in the bitmap. + * + * @ctx: The context for processing the table + * @active_entries: bitmap to store the active entries + * @n_entries: number of entries to be processed + */ +void xe_rtp_process_ctx_enable_active_tracking(struct xe_rtp_process_ctx *ctx, + unsigned long *active_entries, + size_t n_entries) +{ + ctx->active_entries = active_entries; + ctx->n_entries = n_entries; +} + +static void rtp_mark_active(struct xe_device *xe, + struct xe_rtp_process_ctx *ctx, + unsigned int first, unsigned int last) +{ + if (!ctx->active_entries) + return; + + if (drm_WARN_ON(&xe->drm, last > ctx->n_entries)) + return; + + if (first == last) + bitmap_set(ctx->active_entries, first, 1); + else + bitmap_set(ctx->active_entries, first, last - first + 2); +} + +/** + * xe_rtp_process_to_sr - Process all rtp @entries, adding the matching ones to + * the save-restore argument. + * @ctx: The context for processing the table, with one of device, gt or hwe + * @entries: Table with RTP definitions + * @sr: Save-restore struct where matching rules execute the action. This can be + * viewed as the "coalesced view" of multiple the tables. The bits for each + * register set are expected not to collide with previously added entries + * + * Walk the table pointed by @entries (with an empty sentinel) and add all + * entries with matching rules to @sr. 
If @hwe is not NULL, its mmio_base is + * used to calculate the right register offset + */ +void xe_rtp_process_to_sr(struct xe_rtp_process_ctx *ctx, + const struct xe_rtp_entry_sr *entries, + struct xe_reg_sr *sr) +{ + const struct xe_rtp_entry_sr *entry; + struct xe_hw_engine *hwe = NULL; + struct xe_gt *gt = NULL; + struct xe_device *xe = NULL; + + rtp_get_context(ctx, &hwe, >, &xe); + + for (entry = entries; entry && entry->name; entry++) { + bool match = false; + + if (entry->flags & XE_RTP_ENTRY_FLAG_FOREACH_ENGINE) { + struct xe_hw_engine *each_hwe; + enum xe_hw_engine_id id; + + for_each_hw_engine(each_hwe, gt, id) + match |= rtp_process_one_sr(entry, xe, gt, + each_hwe, sr); + } else { + match = rtp_process_one_sr(entry, xe, gt, hwe, sr); + } + + if (match) + rtp_mark_active(xe, ctx, entry - entries, + entry - entries); + } +} +EXPORT_SYMBOL_IF_KUNIT(xe_rtp_process_to_sr); + +/** + * xe_rtp_process - Process all rtp @entries, without running any action + * @ctx: The context for processing the table, with one of device, gt or hwe + * @entries: Table with RTP definitions + * + * Walk the table pointed by @entries (with an empty sentinel), executing the + * rules. A few differences from xe_rtp_process_to_sr(): + * + * 1. There is no action associated with each entry since this uses + * struct xe_rtp_entry. Its main use is for marking active workarounds via + * xe_rtp_process_ctx_enable_active_tracking(). + * 2. There is support for OR operations by having entries with no name. + */ +void xe_rtp_process(struct xe_rtp_process_ctx *ctx, + const struct xe_rtp_entry *entries) +{ + const struct xe_rtp_entry *entry, *first_entry; + struct xe_hw_engine *hwe; + struct xe_gt *gt; + struct xe_device *xe; + + rtp_get_context(ctx, &hwe, >, &xe); + + first_entry = entries; + if (drm_WARN_ON(&xe->drm, !first_entry->name)) + return; + + for (entry = entries; entry && entry->rules; entry++) { + if (entry->name) + first_entry = entry; + + if (!rule_matches(xe, gt, hwe, entry->rules, entry->n_rules)) + continue; + + /* Fast-forward entry, eliminating the OR'ed entries */ + for (entry++; entry && entry->rules; entry++) + if (entry->name) + break; + entry--; + + rtp_mark_active(xe, ctx, first_entry - entries, + entry - entries); + } +} + +bool xe_rtp_match_even_instance(const struct xe_gt *gt, + const struct xe_hw_engine *hwe) +{ + return hwe->instance % 2 == 0; +} + +bool xe_rtp_match_first_render_or_compute(const struct xe_gt *gt, + const struct xe_hw_engine *hwe) +{ + u64 render_compute_mask = gt->info.engine_mask & + (XE_HW_ENGINE_CCS_MASK | XE_HW_ENGINE_RCS_MASK); + + return render_compute_mask && + hwe->engine_id == __ffs(render_compute_mask); +} + +bool xe_rtp_match_first_gslice_fused_off(const struct xe_gt *gt, + const struct xe_hw_engine *hwe) +{ + unsigned int dss_per_gslice = 4; + unsigned int dss; + + if (drm_WARN(>_to_xe(gt)->drm, xe_dss_mask_empty(gt->fuse_topo.g_dss_mask), + "Checking gslice for platform without geometry pipeline\n")) + return false; + + dss = xe_dss_mask_group_ffs(gt->fuse_topo.g_dss_mask, 0, 0); + + return dss >= dss_per_gslice; +} diff --git a/drivers/gpu/drm/xe/xe_rtp.h b/drivers/gpu/drm/xe/xe_rtp.h new file mode 100644 index 000000000000..c56fedd126e6 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_rtp.h @@ -0,0 +1,430 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_RTP_ +#define _XE_RTP_ + +#include <linux/types.h> +#include <linux/xarray.h> + +#define _XE_RTP_INCLUDE_PRIVATE_HELPERS + +#include "xe_rtp_helpers.h" 
+#include "xe_rtp_types.h" + +#undef _XE_RTP_INCLUDE_PRIVATE_HELPERS + +/* + * Register table poke infrastructure + */ + +struct xe_hw_engine; +struct xe_gt; +struct xe_reg_sr; + +/* + * Macros to encode rules to match against platform, IP version, stepping, etc. + * Shouldn't be used directly - see XE_RTP_RULES() + */ +#define _XE_RTP_RULE_PLATFORM(plat__) \ + { .match_type = XE_RTP_MATCH_PLATFORM, .platform = plat__ } + +#define _XE_RTP_RULE_SUBPLATFORM(plat__, sub__) \ + { .match_type = XE_RTP_MATCH_SUBPLATFORM, \ + .platform = plat__, .subplatform = sub__ } + +#define _XE_RTP_RULE_GRAPHICS_STEP(start__, end__) \ + { .match_type = XE_RTP_MATCH_GRAPHICS_STEP, \ + .step_start = start__, .step_end = end__ } + +#define _XE_RTP_RULE_MEDIA_STEP(start__, end__) \ + { .match_type = XE_RTP_MATCH_MEDIA_STEP, \ + .step_start = start__, .step_end = end__ } + +#define _XE_RTP_RULE_ENGINE_CLASS(cls__) \ + { .match_type = XE_RTP_MATCH_ENGINE_CLASS, \ + .engine_class = (cls__) } + +/** + * XE_RTP_RULE_PLATFORM - Create rule matching platform + * @plat_: platform to match + * + * Refer to XE_RTP_RULES() for expected usage. + */ +#define XE_RTP_RULE_PLATFORM(plat_) \ + _XE_RTP_RULE_PLATFORM(XE_##plat_) + +/** + * XE_RTP_RULE_SUBPLATFORM - Create rule matching platform and sub-platform + * @plat_: platform to match + * @sub_: sub-platform to match + * + * Refer to XE_RTP_RULES() for expected usage. + */ +#define XE_RTP_RULE_SUBPLATFORM(plat_, sub_) \ + _XE_RTP_RULE_SUBPLATFORM(XE_##plat_, XE_SUBPLATFORM_##plat_##_##sub_) + +/** + * XE_RTP_RULE_GRAPHICS_STEP - Create rule matching graphics stepping + * @start_: First stepping matching the rule + * @end_: First stepping that does not match the rule + * + * Note that the range matching this rule is [ @start_, @end_ ), i.e. inclusive + * on the left, exclusive on the right. + * + * Refer to XE_RTP_RULES() for expected usage. + */ +#define XE_RTP_RULE_GRAPHICS_STEP(start_, end_) \ + _XE_RTP_RULE_GRAPHICS_STEP(STEP_##start_, STEP_##end_) + +/** + * XE_RTP_RULE_MEDIA_STEP - Create rule matching media stepping + * @start_: First stepping matching the rule + * @end_: First stepping that does not match the rule + * + * Note that the range matching this rule is [ @start_, @end_ ), i.e. inclusive + * on the left, exclusive on the right. + * + * Refer to XE_RTP_RULES() for expected usage. + */ +#define XE_RTP_RULE_MEDIA_STEP(start_, end_) \ + _XE_RTP_RULE_MEDIA_STEP(STEP_##start_, STEP_##end_) + +/** + * XE_RTP_RULE_ENGINE_CLASS - Create rule matching an engine class + * @cls_: Engine class to match + * + * Refer to XE_RTP_RULES() for expected usage. + */ +#define XE_RTP_RULE_ENGINE_CLASS(cls_) \ + _XE_RTP_RULE_ENGINE_CLASS(XE_ENGINE_CLASS_##cls_) + +/** + * XE_RTP_RULE_FUNC - Create rule using callback function for match + * @func__: Function to call to decide if rule matches + * + * This allows more complex checks to be performed. The ``XE_RTP`` + * infrastructure will simply call the function @func_ passed to decide if this + * rule matches the device. + * + * Refer to XE_RTP_RULES() for expected usage. + */ +#define XE_RTP_RULE_FUNC(func__) \ + { .match_type = XE_RTP_MATCH_FUNC, \ + .match_func = (func__) } + +/** + * XE_RTP_RULE_GRAPHICS_VERSION - Create rule matching graphics version + * @ver__: Graphics IP version to match + * + * Refer to XE_RTP_RULES() for expected usage. 
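+ *
+ * A minimal illustrative entry (hypothetical; the name, version number and
+ * action placeholders below are examples only, not taken from a real table):
+ *
+ * .. code-block:: c
+ *
+ *     { XE_RTP_NAME("example-entry"),
+ *       XE_RTP_RULES(GRAPHICS_VERSION(1200)),
+ *       XE_RTP_ACTIONS(SET(...))
+ *     },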
+ */ +#define XE_RTP_RULE_GRAPHICS_VERSION(ver__) \ + { .match_type = XE_RTP_MATCH_GRAPHICS_VERSION, \ + .ver_start = ver__, } + +/** + * XE_RTP_RULE_GRAPHICS_VERSION_RANGE - Create rule matching a range of graphics version + * @ver_start__: First graphics IP version to match + * @ver_end__: Last graphics IP version to match + * + * Note that the range matching this rule is [ @ver_start__, @ver_end__ ], i.e. + * inclusive on boths sides + * + * Refer to XE_RTP_RULES() for expected usage. + */ +#define XE_RTP_RULE_GRAPHICS_VERSION_RANGE(ver_start__, ver_end__) \ + { .match_type = XE_RTP_MATCH_GRAPHICS_VERSION_RANGE, \ + .ver_start = ver_start__, .ver_end = ver_end__, } + +/** + * XE_RTP_RULE_MEDIA_VERSION - Create rule matching media version + * @ver__: Graphics IP version to match + * + * Refer to XE_RTP_RULES() for expected usage. + */ +#define XE_RTP_RULE_MEDIA_VERSION(ver__) \ + { .match_type = XE_RTP_MATCH_MEDIA_VERSION, \ + .ver_start = ver__, } + +/** + * XE_RTP_RULE_MEDIA_VERSION_RANGE - Create rule matching a range of media version + * @ver_start__: First media IP version to match + * @ver_end__: Last media IP version to match + * + * Note that the range matching this rule is [ @ver_start__, @ver_end__ ], i.e. + * inclusive on boths sides + * + * Refer to XE_RTP_RULES() for expected usage. + */ +#define XE_RTP_RULE_MEDIA_VERSION_RANGE(ver_start__, ver_end__) \ + { .match_type = XE_RTP_MATCH_MEDIA_VERSION_RANGE, \ + .ver_start = ver_start__, .ver_end = ver_end__, } + +/** + * XE_RTP_RULE_IS_INTEGRATED - Create a rule matching integrated graphics devices + * + * Refer to XE_RTP_RULES() for expected usage. + */ +#define XE_RTP_RULE_IS_INTEGRATED \ + { .match_type = XE_RTP_MATCH_INTEGRATED } + +/** + * XE_RTP_RULE_IS_DISCRETE - Create a rule matching discrete graphics devices + * + * Refer to XE_RTP_RULES() for expected usage. + */ +#define XE_RTP_RULE_IS_DISCRETE \ + { .match_type = XE_RTP_MATCH_DISCRETE } + +/** + * XE_RTP_ACTION_WR - Helper to write a value to the register, overriding all + * the bits + * @reg_: Register + * @val_: Value to set + * @...: Additional fields to override in the struct xe_rtp_action entry + * + * The correspondent notation in bspec is: + * + * REGNAME = VALUE + */ +#define XE_RTP_ACTION_WR(reg_, val_, ...) \ + { .reg = XE_RTP_DROP_CAST(reg_), \ + .clr_bits = ~0u, .set_bits = (val_), \ + .read_mask = (~0u), ##__VA_ARGS__ } + +/** + * XE_RTP_ACTION_SET - Set bits from @val_ in the register. + * @reg_: Register + * @val_: Bits to set in the register + * @...: Additional fields to override in the struct xe_rtp_action entry + * + * For masked registers this translates to a single write, while for other + * registers it's a RMW. The correspondent bspec notation is (example for bits 2 + * and 5, but could be any): + * + * REGNAME[2] = 1 + * REGNAME[5] = 1 + */ +#define XE_RTP_ACTION_SET(reg_, val_, ...) \ + { .reg = XE_RTP_DROP_CAST(reg_), \ + .clr_bits = val_, .set_bits = val_, \ + .read_mask = val_, ##__VA_ARGS__ } + +/** + * XE_RTP_ACTION_CLR: Clear bits from @val_ in the register. + * @reg_: Register + * @val_: Bits to clear in the register + * @...: Additional fields to override in the struct xe_rtp_action entry + * + * For masked registers this translates to a single write, while for other + * registers it's a RMW. The correspondent bspec notation is (example for bits 2 + * and 5, but could be any): + * + * REGNAME[2] = 0 + * REGNAME[5] = 0 + */ +#define XE_RTP_ACTION_CLR(reg_, val_, ...) 
\ + { .reg = XE_RTP_DROP_CAST(reg_), \ + .clr_bits = val_, .set_bits = 0, \ + .read_mask = val_, ##__VA_ARGS__ } + +/** + * XE_RTP_ACTION_FIELD_SET: Set a bit range + * @reg_: Register + * @mask_bits_: Mask of bits to be changed in the register, forming a field + * @val_: Value to set in the field denoted by @mask_bits_ + * @...: Additional fields to override in the struct xe_rtp_action entry + * + * For masked registers this translates to a single write, while for other + * registers it's a RMW. The correspondent bspec notation is: + * + * REGNAME[<end>:<start>] = VALUE + */ +#define XE_RTP_ACTION_FIELD_SET(reg_, mask_bits_, val_, ...) \ + { .reg = XE_RTP_DROP_CAST(reg_), \ + .clr_bits = mask_bits_, .set_bits = val_, \ + .read_mask = mask_bits_, ##__VA_ARGS__ } + +#define XE_RTP_ACTION_FIELD_SET_NO_READ_MASK(reg_, mask_bits_, val_, ...) \ + { .reg = XE_RTP_DROP_CAST(reg_), \ + .clr_bits = (mask_bits_), .set_bits = (val_), \ + .read_mask = 0, ##__VA_ARGS__ } + +/** + * XE_RTP_ACTION_WHITELIST - Add register to userspace whitelist + * @reg_: Register + * @val_: Whitelist-specific flags to set + * @...: Additional fields to override in the struct xe_rtp_action entry + * + * Add a register to the whitelist, allowing userspace to modify the ster with + * regular user privileges. + */ +#define XE_RTP_ACTION_WHITELIST(reg_, val_, ...) \ + /* TODO fail build if ((flags) & ~(RING_FORCE_TO_NONPRIV_MASK_VALID)) */\ + { .reg = XE_RTP_DROP_CAST(reg_), \ + .set_bits = val_, \ + .clr_bits = RING_FORCE_TO_NONPRIV_MASK_VALID, \ + ##__VA_ARGS__ } + +/** + * XE_RTP_NAME - Helper to set the name in xe_rtp_entry + * @s_: Name describing this rule, often a HW-specific number + * + * TODO: maybe move this behind a debug config? + */ +#define XE_RTP_NAME(s_) .name = (s_) + +/** + * XE_RTP_ENTRY_FLAG - Helper to add multiple flags to a struct xe_rtp_entry_sr + * @...: Entry flags, without the ``XE_RTP_ENTRY_FLAG_`` prefix + * + * Helper to automatically add a ``XE_RTP_ENTRY_FLAG_`` prefix to the flags + * when defining struct xe_rtp_entry entries. Example: + * + * .. code-block:: c + * + * const struct xe_rtp_entry_sr wa_entries[] = { + * ... + * { XE_RTP_NAME("test-entry"), + * ... + * XE_RTP_ENTRY_FLAG(FOREACH_ENGINE), + * ... + * }, + * ... + * }; + */ +#define XE_RTP_ENTRY_FLAG(...) \ + .flags = (XE_RTP_PASTE_FOREACH(ENTRY_FLAG_, BITWISE_OR, (__VA_ARGS__))) + +/** + * XE_RTP_ACTION_FLAG - Helper to add multiple flags to a struct xe_rtp_action + * @...: Action flags, without the ``XE_RTP_ACTION_FLAG_`` prefix + * + * Helper to automatically add a ``XE_RTP_ACTION_FLAG_`` prefix to the flags + * when defining struct xe_rtp_action entries. Example: + * + * .. code-block:: c + * + * const struct xe_rtp_entry_sr wa_entries[] = { + * ... + * { XE_RTP_NAME("test-entry"), + * ... + * XE_RTP_ACTION_SET(..., XE_RTP_ACTION_FLAG(FOREACH_ENGINE)), + * ... + * }, + * ... + * }; + */ +#define XE_RTP_ACTION_FLAG(...) \ + .flags = (XE_RTP_PASTE_FOREACH(ACTION_FLAG_, BITWISE_OR, (__VA_ARGS__))) + +/** + * XE_RTP_RULES - Helper to set multiple rules to a struct xe_rtp_entry_sr entry + * @...: Rules + * + * At least one rule is needed and up to 4 are supported. Multiple rules are + * AND'ed together, i.e. all the rules must evaluate to true for the entry to + * be processed. See XE_RTP_MATCH_* for the possible match rules. Example: + * + * .. code-block:: c + * + * const struct xe_rtp_entry_sr wa_entries[] = { + * ... + * { XE_RTP_NAME("test-entry"), + * XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, B0)), + * ... 
+ * }, + * ... + * }; + */ +#define XE_RTP_RULES(...) \ + .n_rules = _XE_COUNT_ARGS(__VA_ARGS__), \ + .rules = (const struct xe_rtp_rule[]) { \ + XE_RTP_PASTE_FOREACH(RULE_, COMMA, (__VA_ARGS__)) \ + } + +/** + * XE_RTP_ACTIONS - Helper to set multiple actions to a struct xe_rtp_entry_sr + * @...: Actions to be taken + * + * At least one action is needed and up to 4 are supported. See XE_RTP_ACTION_* + * for the possible actions. Example: + * + * .. code-block:: c + * + * const struct xe_rtp_entry_sr wa_entries[] = { + * ... + * { XE_RTP_NAME("test-entry"), + * XE_RTP_RULES(...), + * XE_RTP_ACTIONS(SET(..), SET(...), CLR(...)), + * ... + * }, + * ... + * }; + */ +#define XE_RTP_ACTIONS(...) \ + .n_actions = _XE_COUNT_ARGS(__VA_ARGS__), \ + .actions = (const struct xe_rtp_action[]) { \ + XE_RTP_PASTE_FOREACH(ACTION_, COMMA, (__VA_ARGS__)) \ + } + +#define XE_RTP_PROCESS_CTX_INITIALIZER(arg__) _Generic((arg__), \ + struct xe_hw_engine * : (struct xe_rtp_process_ctx){ { (void *)(arg__) }, XE_RTP_PROCESS_TYPE_ENGINE }, \ + struct xe_gt * : (struct xe_rtp_process_ctx){ { (void *)(arg__) }, XE_RTP_PROCESS_TYPE_GT }) + +void xe_rtp_process_ctx_enable_active_tracking(struct xe_rtp_process_ctx *ctx, + unsigned long *active_entries, + size_t n_entries); + +void xe_rtp_process_to_sr(struct xe_rtp_process_ctx *ctx, + const struct xe_rtp_entry_sr *entries, + struct xe_reg_sr *sr); + +void xe_rtp_process(struct xe_rtp_process_ctx *ctx, + const struct xe_rtp_entry *entries); + +/* Match functions to be used with XE_RTP_MATCH_FUNC */ + +/** + * xe_rtp_match_even_instance - Match if engine instance is even + * @gt: GT structure + * @hwe: Engine instance + * + * Returns: true if engine instance is even, false otherwise + */ +bool xe_rtp_match_even_instance(const struct xe_gt *gt, + const struct xe_hw_engine *hwe); + +/* + * xe_rtp_match_first_render_or_compute - Match if it's first render or compute + * engine in the GT + * + * @gt: GT structure + * @hwe: Engine instance + * + * Registers on the render reset domain need to have their values re-applied + * when any of those engines are reset. Since the engines reset together, a + * programming can be set to just one of them. For simplicity the first engine + * of either render or compute class can be chosen. + * + * Returns: true if engine id is the first to match the render reset domain, + * false otherwise. + */ +bool xe_rtp_match_first_render_or_compute(const struct xe_gt *gt, + const struct xe_hw_engine *hwe); + +/* + * xe_rtp_match_first_gslice_fused_off - Match when first gslice is fused off + * + * @gt: GT structure + * @hwe: Engine instance + * + * Returns: true if first gslice is fused off, false otherwise. + */ +bool xe_rtp_match_first_gslice_fused_off(const struct xe_gt *gt, + const struct xe_hw_engine *hwe); + +#endif diff --git a/drivers/gpu/drm/xe/xe_rtp_helpers.h b/drivers/gpu/drm/xe/xe_rtp_helpers.h new file mode 100644 index 000000000000..181b6290fac3 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_rtp_helpers.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_RTP_HELPERS_ +#define _XE_RTP_HELPERS_ + +#ifndef _XE_RTP_INCLUDE_PRIVATE_HELPERS +#error "This header is supposed to be included by xe_rtp.h only" +#endif + +/* + * Helper macros - not to be used outside this header. + */ +#define _XE_ESC(...) __VA_ARGS__ +#define _XE_COUNT_ARGS(...) _XE_ESC(__XE_COUNT_ARGS(__VA_ARGS__, 5, 4, 3, 2, 1,)) +#define __XE_COUNT_ARGS(_, _5, _4, _3, _2, X_, ...) X_ + +#define _XE_FIRST(...) 
_XE_ESC(__XE_FIRST(__VA_ARGS__,)) +#define __XE_FIRST(x_, ...) x_ +#define _XE_TUPLE_TAIL(...) _XE_ESC(__XE_TUPLE_TAIL(__VA_ARGS__)) +#define __XE_TUPLE_TAIL(x_, ...) (__VA_ARGS__) + +#define _XE_DROP_FIRST(x_, ...) __VA_ARGS__ + +#define _XE_RTP_CONCAT(a, b) __XE_RTP_CONCAT(a, b) +#define __XE_RTP_CONCAT(a, b) XE_RTP_ ## a ## b + +#define __XE_RTP_PASTE_SEP_COMMA , +#define __XE_RTP_PASTE_SEP_BITWISE_OR | + +/* + * XE_RTP_PASTE_FOREACH - Paste XE_RTP_<@prefix_> on each element of the tuple + * @args, with the end result separated by @sep_. @sep must be one of the + * previously declared macros __XE_RTP_PASTE_SEP_*, or declared with such + * prefix. + * + * Examples: + * + * 1) XE_RTP_PASTE_FOREACH(TEST_, COMMA, (FOO, BAR)) + * expands to: + * + * XE_RTP_TEST_FOO , XE_RTP_TEST_BAR + * + * 2) XE_RTP_PASTE_FOREACH(TEST2_, COMMA, (FOO)) + * expands to: + * + * XE_RTP_TEST2_FOO + * + * 3) XE_RTP_PASTE_FOREACH(TEST3, BITWISE_OR, (FOO, BAR)) + * expands to: + * + * XE_RTP_TEST3_FOO | XE_RTP_TEST3_BAR + * + * 4) #define __XE_RTP_PASTE_SEP_MY_SEP BANANA + * XE_RTP_PASTE_FOREACH(TEST_, MY_SEP, (FOO, BAR)) + * expands to: + * + * XE_RTP_TEST_FOO BANANA XE_RTP_TEST_BAR + */ +#define XE_RTP_PASTE_FOREACH(prefix_, sep_, args_) _XE_ESC(_XE_RTP_CONCAT(PASTE_, _XE_COUNT_ARGS args_)(prefix_, sep_, args_)) +#define XE_RTP_PASTE_1(prefix_, sep_, args_) _XE_RTP_CONCAT(prefix_, _XE_FIRST args_) +#define XE_RTP_PASTE_2(prefix_, sep_, args_) _XE_RTP_CONCAT(prefix_, _XE_FIRST args_) __XE_RTP_PASTE_SEP_ ## sep_ XE_RTP_PASTE_1(prefix_, sep_, _XE_TUPLE_TAIL args_) +#define XE_RTP_PASTE_3(prefix_, sep_, args_) _XE_RTP_CONCAT(prefix_, _XE_FIRST args_) __XE_RTP_PASTE_SEP_ ## sep_ XE_RTP_PASTE_2(prefix_, sep_, _XE_TUPLE_TAIL args_) +#define XE_RTP_PASTE_4(prefix_, sep_, args_) _XE_RTP_CONCAT(prefix_, _XE_FIRST args_) __XE_RTP_PASTE_SEP_ ## sep_ XE_RTP_PASTE_3(prefix_, sep_, _XE_TUPLE_TAIL args_) + +/* + * XE_RTP_DROP_CAST - Drop cast to convert a compound statement to a initializer + * + * Example: + * + * #define foo(a_) ((struct foo){ .a = a_ }) + * XE_RTP_DROP_CAST(foo(10)) + * expands to: + * + * { .a = 10 } + */ +#define XE_RTP_DROP_CAST(...) _XE_ESC(_XE_DROP_FIRST _XE_ESC __VA_ARGS__) + +#endif diff --git a/drivers/gpu/drm/xe/xe_rtp_types.h b/drivers/gpu/drm/xe/xe_rtp_types.h new file mode 100644 index 000000000000..637acc7626a4 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_rtp_types.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_RTP_TYPES_ +#define _XE_RTP_TYPES_ + +#include <linux/types.h> + +#include "regs/xe_reg_defs.h" + +struct xe_hw_engine; +struct xe_gt; + +/** + * struct xe_rtp_action - action to take for any matching rule + * + * This struct records what action should be taken in a register that has a + * matching rule. Example of actions: set/clear bits. + */ +struct xe_rtp_action { + /** @reg: Register */ + struct xe_reg reg; + /** + * @clr_bits: bits to clear when updating register. 
It's always a + * superset of bits being modified + */ + u32 clr_bits; + /** @set_bits: bits to set when updating register */ + u32 set_bits; +#define XE_RTP_NOCHECK .read_mask = 0 + /** @read_mask: mask for bits to consider when reading value back */ + u32 read_mask; +#define XE_RTP_ACTION_FLAG_ENGINE_BASE BIT(0) + /** @flags: flags to apply on rule evaluation or action */ + u8 flags; +}; + +enum { + XE_RTP_MATCH_PLATFORM, + XE_RTP_MATCH_SUBPLATFORM, + XE_RTP_MATCH_GRAPHICS_VERSION, + XE_RTP_MATCH_GRAPHICS_VERSION_RANGE, + XE_RTP_MATCH_GRAPHICS_STEP, + XE_RTP_MATCH_MEDIA_VERSION, + XE_RTP_MATCH_MEDIA_VERSION_RANGE, + XE_RTP_MATCH_MEDIA_STEP, + XE_RTP_MATCH_INTEGRATED, + XE_RTP_MATCH_DISCRETE, + XE_RTP_MATCH_ENGINE_CLASS, + XE_RTP_MATCH_NOT_ENGINE_CLASS, + XE_RTP_MATCH_FUNC, +}; + +/** struct xe_rtp_rule - match rule for processing entry */ +struct xe_rtp_rule { + u8 match_type; + + /* match filters */ + union { + /* MATCH_PLATFORM / MATCH_SUBPLATFORM */ + struct { + u8 platform; + u8 subplatform; + }; + /* + * MATCH_GRAPHICS_VERSION / XE_RTP_MATCH_GRAPHICS_VERSION_RANGE / + * MATCH_MEDIA_VERSION / XE_RTP_MATCH_MEDIA_VERSION_RANGE + */ + struct { + u32 ver_start; +#define XE_RTP_END_VERSION_UNDEFINED U32_MAX + u32 ver_end; + }; + /* MATCH_STEP */ + struct { + u8 step_start; + u8 step_end; + }; + /* MATCH_ENGINE_CLASS / MATCH_NOT_ENGINE_CLASS */ + struct { + u8 engine_class; + }; + /* MATCH_FUNC */ + bool (*match_func)(const struct xe_gt *gt, + const struct xe_hw_engine *hwe); + }; +}; + +/** struct xe_rtp_entry_sr - Entry in an rtp table */ +struct xe_rtp_entry_sr { + const char *name; + const struct xe_rtp_action *actions; + const struct xe_rtp_rule *rules; + u8 n_rules; + u8 n_actions; +#define XE_RTP_ENTRY_FLAG_FOREACH_ENGINE BIT(0) + u8 flags; +}; + +/** struct xe_rtp_entry - Entry in an rtp table, with no action associated */ +struct xe_rtp_entry { + const char *name; + const struct xe_rtp_rule *rules; + u8 n_rules; +}; + +enum xe_rtp_process_type { + XE_RTP_PROCESS_TYPE_GT, + XE_RTP_PROCESS_TYPE_ENGINE, +}; + +struct xe_rtp_process_ctx { + union { + struct xe_gt *gt; + struct xe_hw_engine *hwe; + }; + enum xe_rtp_process_type type; + unsigned long *active_entries; + size_t n_entries; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_sa.c b/drivers/gpu/drm/xe/xe_sa.c new file mode 100644 index 000000000000..2c4632259edd --- /dev/null +++ b/drivers/gpu/drm/xe/xe_sa.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_sa.h" + +#include <linux/kernel.h> + +#include <drm/drm_managed.h> + +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_map.h" + +static void xe_sa_bo_manager_fini(struct drm_device *drm, void *arg) +{ + struct xe_sa_manager *sa_manager = arg; + struct xe_bo *bo = sa_manager->bo; + + if (!bo) { + drm_err(drm, "no bo for sa manager\n"); + return; + } + + drm_suballoc_manager_fini(&sa_manager->base); + + if (bo->vmap.is_iomem) + kvfree(sa_manager->cpu_ptr); + + xe_bo_unpin_map_no_vm(bo); + sa_manager->bo = NULL; +} + +struct xe_sa_manager *xe_sa_bo_manager_init(struct xe_tile *tile, u32 size, u32 align) +{ + struct xe_device *xe = tile_to_xe(tile); + u32 managed_size = size - SZ_4K; + struct xe_bo *bo; + int ret; + + struct xe_sa_manager *sa_manager = drmm_kzalloc(&tile_to_xe(tile)->drm, + sizeof(*sa_manager), + GFP_KERNEL); + if (!sa_manager) + return ERR_PTR(-ENOMEM); + + sa_manager->bo = NULL; + + bo = xe_bo_create_pin_map(xe, tile, NULL, size, ttm_bo_type_kernel, + XE_BO_CREATE_VRAM_IF_DGFX(tile) | + 
XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(bo)) { + drm_err(&xe->drm, "failed to allocate bo for sa manager: %ld\n", + PTR_ERR(bo)); + return (struct xe_sa_manager *)bo; + } + sa_manager->bo = bo; + + drm_suballoc_manager_init(&sa_manager->base, managed_size, align); + sa_manager->gpu_addr = xe_bo_ggtt_addr(bo); + + if (bo->vmap.is_iomem) { + sa_manager->cpu_ptr = kvzalloc(managed_size, GFP_KERNEL); + if (!sa_manager->cpu_ptr) { + xe_bo_unpin_map_no_vm(sa_manager->bo); + sa_manager->bo = NULL; + return ERR_PTR(-ENOMEM); + } + } else { + sa_manager->cpu_ptr = bo->vmap.vaddr; + memset(sa_manager->cpu_ptr, 0, bo->ttm.base.size); + } + + ret = drmm_add_action_or_reset(&xe->drm, xe_sa_bo_manager_fini, + sa_manager); + if (ret) + return ERR_PTR(ret); + + return sa_manager; +} + +struct drm_suballoc *xe_sa_bo_new(struct xe_sa_manager *sa_manager, + unsigned int size) +{ + return drm_suballoc_new(&sa_manager->base, size, GFP_KERNEL, true, 0); +} + +void xe_sa_bo_flush_write(struct drm_suballoc *sa_bo) +{ + struct xe_sa_manager *sa_manager = to_xe_sa_manager(sa_bo->manager); + struct xe_device *xe = tile_to_xe(sa_manager->bo->tile); + + if (!sa_manager->bo->vmap.is_iomem) + return; + + xe_map_memcpy_to(xe, &sa_manager->bo->vmap, drm_suballoc_soffset(sa_bo), + xe_sa_bo_cpu_addr(sa_bo), + drm_suballoc_size(sa_bo)); +} + +void xe_sa_bo_free(struct drm_suballoc *sa_bo, + struct dma_fence *fence) +{ + drm_suballoc_free(sa_bo, fence); +} diff --git a/drivers/gpu/drm/xe/xe_sa.h b/drivers/gpu/drm/xe/xe_sa.h new file mode 100644 index 000000000000..4e96483057d7 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_sa.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ +#ifndef _XE_SA_H_ +#define _XE_SA_H_ + +#include "xe_sa_types.h" + +struct dma_fence; +struct xe_bo; +struct xe_tile; + +struct xe_sa_manager *xe_sa_bo_manager_init(struct xe_tile *tile, u32 size, u32 align); + +struct drm_suballoc *xe_sa_bo_new(struct xe_sa_manager *sa_manager, + u32 size); +void xe_sa_bo_flush_write(struct drm_suballoc *sa_bo); +void xe_sa_bo_free(struct drm_suballoc *sa_bo, + struct dma_fence *fence); + +static inline struct xe_sa_manager * +to_xe_sa_manager(struct drm_suballoc_manager *mng) +{ + return container_of(mng, struct xe_sa_manager, base); +} + +static inline u64 xe_sa_bo_gpu_addr(struct drm_suballoc *sa) +{ + return to_xe_sa_manager(sa->manager)->gpu_addr + + drm_suballoc_soffset(sa); +} + +static inline void *xe_sa_bo_cpu_addr(struct drm_suballoc *sa) +{ + return to_xe_sa_manager(sa->manager)->cpu_ptr + + drm_suballoc_soffset(sa); +} + +#endif diff --git a/drivers/gpu/drm/xe/xe_sa_types.h b/drivers/gpu/drm/xe/xe_sa_types.h new file mode 100644 index 000000000000..2ef896aeca1d --- /dev/null +++ b/drivers/gpu/drm/xe/xe_sa_types.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ +#ifndef _XE_SA_TYPES_H_ +#define _XE_SA_TYPES_H_ + +#include <drm/drm_suballoc.h> + +struct xe_bo; + +struct xe_sa_manager { + struct drm_suballoc_manager base; + struct xe_bo *bo; + u64 gpu_addr; + void *cpu_ptr; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c new file mode 100644 index 000000000000..01106a1156ad --- /dev/null +++ b/drivers/gpu/drm/xe/xe_sched_job.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include "xe_sched_job.h" + +#include <linux/dma-fence-array.h> +#include <linux/slab.h> + +#include "xe_device.h" +#include "xe_exec_queue.h" 
+#include "xe_gt.h" +#include "xe_hw_engine_types.h" +#include "xe_hw_fence.h" +#include "xe_lrc.h" +#include "xe_macros.h" +#include "xe_trace.h" +#include "xe_vm.h" + +static struct kmem_cache *xe_sched_job_slab; +static struct kmem_cache *xe_sched_job_parallel_slab; + +int __init xe_sched_job_module_init(void) +{ + xe_sched_job_slab = + kmem_cache_create("xe_sched_job", + sizeof(struct xe_sched_job) + + sizeof(u64), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!xe_sched_job_slab) + return -ENOMEM; + + xe_sched_job_parallel_slab = + kmem_cache_create("xe_sched_job_parallel", + sizeof(struct xe_sched_job) + + sizeof(u64) * + XE_HW_ENGINE_MAX_INSTANCE, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!xe_sched_job_parallel_slab) { + kmem_cache_destroy(xe_sched_job_slab); + return -ENOMEM; + } + + return 0; +} + +void xe_sched_job_module_exit(void) +{ + kmem_cache_destroy(xe_sched_job_slab); + kmem_cache_destroy(xe_sched_job_parallel_slab); +} + +static struct xe_sched_job *job_alloc(bool parallel) +{ + return kmem_cache_zalloc(parallel ? xe_sched_job_parallel_slab : + xe_sched_job_slab, GFP_KERNEL); +} + +bool xe_sched_job_is_migration(struct xe_exec_queue *q) +{ + return q->vm && (q->vm->flags & XE_VM_FLAG_MIGRATION); +} + +static void job_free(struct xe_sched_job *job) +{ + struct xe_exec_queue *q = job->q; + bool is_migration = xe_sched_job_is_migration(q); + + kmem_cache_free(xe_exec_queue_is_parallel(job->q) || is_migration ? + xe_sched_job_parallel_slab : xe_sched_job_slab, job); +} + +static struct xe_device *job_to_xe(struct xe_sched_job *job) +{ + return gt_to_xe(job->q->gt); +} + +struct xe_sched_job *xe_sched_job_create(struct xe_exec_queue *q, + u64 *batch_addr) +{ + struct xe_sched_job *job; + struct dma_fence **fences; + bool is_migration = xe_sched_job_is_migration(q); + int err; + int i, j; + u32 width; + + /* only a kernel context can submit a vm-less job */ + XE_WARN_ON(!q->vm && !(q->flags & EXEC_QUEUE_FLAG_KERNEL)); + + /* Migration and kernel engines have their own locking */ + if (!(q->flags & (EXEC_QUEUE_FLAG_KERNEL | EXEC_QUEUE_FLAG_VM))) { + lockdep_assert_held(&q->vm->lock); + if (!xe_vm_in_lr_mode(q->vm)) + xe_vm_assert_held(q->vm); + } + + job = job_alloc(xe_exec_queue_is_parallel(q) || is_migration); + if (!job) + return ERR_PTR(-ENOMEM); + + job->q = q; + kref_init(&job->refcount); + xe_exec_queue_get(job->q); + + err = drm_sched_job_init(&job->drm, q->entity, 1, NULL); + if (err) + goto err_free; + + if (!xe_exec_queue_is_parallel(q)) { + job->fence = xe_lrc_create_seqno_fence(q->lrc); + if (IS_ERR(job->fence)) { + err = PTR_ERR(job->fence); + goto err_sched_job; + } + } else { + struct dma_fence_array *cf; + + fences = kmalloc_array(q->width, sizeof(*fences), GFP_KERNEL); + if (!fences) { + err = -ENOMEM; + goto err_sched_job; + } + + for (j = 0; j < q->width; ++j) { + fences[j] = xe_lrc_create_seqno_fence(q->lrc + j); + if (IS_ERR(fences[j])) { + err = PTR_ERR(fences[j]); + goto err_fences; + } + } + + cf = dma_fence_array_create(q->width, fences, + q->parallel.composite_fence_ctx, + q->parallel.composite_fence_seqno++, + false); + if (!cf) { + --q->parallel.composite_fence_seqno; + err = -ENOMEM; + goto err_fences; + } + + /* Sanity check */ + for (j = 0; j < q->width; ++j) + xe_assert(job_to_xe(job), cf->base.seqno == fences[j]->seqno); + + job->fence = &cf->base; + } + + width = q->width; + if (is_migration) + width = 2; + + for (i = 0; i < width; ++i) + job->batch_addr[i] = batch_addr[i]; + + /* All other jobs require a VM to be open which has a ref */ + if 
(unlikely(q->flags & EXEC_QUEUE_FLAG_KERNEL)) + xe_device_mem_access_get(job_to_xe(job)); + xe_device_assert_mem_access(job_to_xe(job)); + + trace_xe_sched_job_create(job); + return job; + +err_fences: + for (j = j - 1; j >= 0; --j) { + --q->lrc[j].fence_ctx.next_seqno; + dma_fence_put(fences[j]); + } + kfree(fences); +err_sched_job: + drm_sched_job_cleanup(&job->drm); +err_free: + xe_exec_queue_put(q); + job_free(job); + return ERR_PTR(err); +} + +/** + * xe_sched_job_destroy - Destroy XE schedule job + * @ref: reference to XE schedule job + * + * Called when ref == 0, drop a reference to job's xe_engine + fence, cleanup + * base DRM schedule job, and free memory for XE schedule job. + */ +void xe_sched_job_destroy(struct kref *ref) +{ + struct xe_sched_job *job = + container_of(ref, struct xe_sched_job, refcount); + + if (unlikely(job->q->flags & EXEC_QUEUE_FLAG_KERNEL)) + xe_device_mem_access_put(job_to_xe(job)); + xe_exec_queue_put(job->q); + dma_fence_put(job->fence); + drm_sched_job_cleanup(&job->drm); + job_free(job); +} + +void xe_sched_job_set_error(struct xe_sched_job *job, int error) +{ + if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags)) + return; + + dma_fence_set_error(job->fence, error); + + if (dma_fence_is_array(job->fence)) { + struct dma_fence_array *array = + to_dma_fence_array(job->fence); + struct dma_fence **child = array->fences; + unsigned int nchild = array->num_fences; + + do { + struct dma_fence *current_fence = *child++; + + if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, + ¤t_fence->flags)) + continue; + dma_fence_set_error(current_fence, error); + } while (--nchild); + } + + trace_xe_sched_job_set_error(job); + + dma_fence_enable_sw_signaling(job->fence); + xe_hw_fence_irq_run(job->q->fence_irq); +} + +bool xe_sched_job_started(struct xe_sched_job *job) +{ + struct xe_lrc *lrc = job->q->lrc; + + return !__dma_fence_is_later(xe_sched_job_seqno(job), + xe_lrc_start_seqno(lrc), + job->fence->ops); +} + +bool xe_sched_job_completed(struct xe_sched_job *job) +{ + struct xe_lrc *lrc = job->q->lrc; + + /* + * Can safely check just LRC[0] seqno as that is last seqno written when + * parallel handshake is done. + */ + + return !__dma_fence_is_later(xe_sched_job_seqno(job), xe_lrc_seqno(lrc), + job->fence->ops); +} + +void xe_sched_job_arm(struct xe_sched_job *job) +{ + drm_sched_job_arm(&job->drm); +} + +void xe_sched_job_push(struct xe_sched_job *job) +{ + xe_sched_job_get(job); + trace_xe_sched_job_exec(job); + drm_sched_entity_push_job(&job->drm); + xe_sched_job_put(job); +} + +/** + * xe_sched_job_last_fence_add_dep - Add last fence dependency to job + * @job:job to add the last fence dependency to + * @vm: virtual memory job belongs to + * + * Returns: + * 0 on success, or an error on failing to expand the array. 
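+ *
+ * Note that an extra reference is taken on the fence returned by
+ * xe_exec_queue_last_fence_get() since drm_sched_job_add_dependency()
+ * consumes a fence reference in both the success and error cases.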
+ */ +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm) +{ + struct dma_fence *fence; + + fence = xe_exec_queue_last_fence_get(job->q, vm); + dma_fence_get(fence); + + return drm_sched_job_add_dependency(&job->drm, fence); +} diff --git a/drivers/gpu/drm/xe/xe_sched_job.h b/drivers/gpu/drm/xe/xe_sched_job.h new file mode 100644 index 000000000000..34f475ba7f50 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_sched_job.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef _XE_SCHED_JOB_H_ +#define _XE_SCHED_JOB_H_ + +#include "xe_sched_job_types.h" + +struct xe_vm; + +#define XE_SCHED_HANG_LIMIT 1 +#define XE_SCHED_JOB_TIMEOUT LONG_MAX + +int xe_sched_job_module_init(void); +void xe_sched_job_module_exit(void); + +struct xe_sched_job *xe_sched_job_create(struct xe_exec_queue *q, + u64 *batch_addr); +void xe_sched_job_destroy(struct kref *ref); + +/** + * xe_sched_job_get - get reference to XE schedule job + * @job: XE schedule job object + * + * Increment XE schedule job's reference count + */ +static inline struct xe_sched_job *xe_sched_job_get(struct xe_sched_job *job) +{ + kref_get(&job->refcount); + return job; +} + +/** + * xe_sched_job_put - put reference to XE schedule job + * @job: XE schedule job object + * + * Decrement XE schedule job's reference count, call xe_sched_job_destroy when + * reference count == 0. + */ +static inline void xe_sched_job_put(struct xe_sched_job *job) +{ + kref_put(&job->refcount, xe_sched_job_destroy); +} + +void xe_sched_job_set_error(struct xe_sched_job *job, int error); +static inline bool xe_sched_job_is_error(struct xe_sched_job *job) +{ + return job->fence->error < 0; +} + +bool xe_sched_job_started(struct xe_sched_job *job); +bool xe_sched_job_completed(struct xe_sched_job *job); + +void xe_sched_job_arm(struct xe_sched_job *job); +void xe_sched_job_push(struct xe_sched_job *job); + +int xe_sched_job_last_fence_add_dep(struct xe_sched_job *job, struct xe_vm *vm); + +static inline struct xe_sched_job * +to_xe_sched_job(struct drm_sched_job *drm) +{ + return container_of(drm, struct xe_sched_job, drm); +} + +static inline u32 xe_sched_job_seqno(struct xe_sched_job *job) +{ + return job->fence->seqno; +} + +static inline void +xe_sched_job_add_migrate_flush(struct xe_sched_job *job, u32 flags) +{ + job->migrate_flush_flags = flags; +} + +bool xe_sched_job_is_migration(struct xe_exec_queue *q); + +#endif diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h new file mode 100644 index 000000000000..71213ba9735b --- /dev/null +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_SCHED_JOB_TYPES_H_ +#define _XE_SCHED_JOB_TYPES_H_ + +#include <linux/kref.h> + +#include <drm/gpu_scheduler.h> + +struct xe_exec_queue; + +/** + * struct xe_sched_job - XE schedule job (batch buffer tracking) + */ +struct xe_sched_job { + /** @drm: base DRM scheduler job */ + struct drm_sched_job drm; + /** @q: Exec queue */ + struct xe_exec_queue *q; + /** @refcount: ref count of this job */ + struct kref refcount; + /** + * @fence: dma fence to indicate completion. 1 way relationship - job + * can safely reference fence, fence cannot safely reference job. 
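+ * The fence may outlive the job (for example while waiters still hold
+ * references to it), which is why it must never point back at the job.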
+ */ +#define JOB_FLAG_SUBMIT DMA_FENCE_FLAG_USER_BITS + struct dma_fence *fence; + /** @user_fence: write back value when BB is complete */ + struct { + /** @used: user fence is used */ + bool used; + /** @addr: address to write to */ + u64 addr; + /** @value: write back value */ + u64 value; + } user_fence; + /** @migrate_flush_flags: Additional flush flags for migration jobs */ + u32 migrate_flush_flags; + /** @batch_addr: batch buffer address of job */ + u64 batch_addr[]; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_sriov.c b/drivers/gpu/drm/xe/xe_sriov.c new file mode 100644 index 000000000000..42a0e0c917a0 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_sriov.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include "xe_assert.h" +#include "xe_sriov.h" + +/** + * xe_sriov_mode_to_string - Convert enum value to string. + * @mode: the &xe_sriov_mode to convert + * + * Returns: SR-IOV mode as a user friendly string. + */ +const char *xe_sriov_mode_to_string(enum xe_sriov_mode mode) +{ + switch (mode) { + case XE_SRIOV_MODE_NONE: + return "none"; + case XE_SRIOV_MODE_PF: + return "SR-IOV PF"; + case XE_SRIOV_MODE_VF: + return "SR-IOV VF"; + default: + return "<invalid>"; + } +} + +/** + * xe_sriov_probe_early - Probe a SR-IOV mode. + * @xe: the &xe_device to probe mode on + * @has_sriov: flag indicating hardware support for SR-IOV + * + * This function should be called only once and as soon as possible during + * driver probe to detect whether we are running a SR-IOV Physical Function + * (PF) or a Virtual Function (VF) device. + * + * SR-IOV PF mode detection is based on PCI @dev_is_pf() function. + * SR-IOV VF mode detection is based on dedicated MMIO register read. + */ +void xe_sriov_probe_early(struct xe_device *xe, bool has_sriov) +{ + enum xe_sriov_mode mode = XE_SRIOV_MODE_NONE; + + /* TODO: replace with proper mode detection */ + xe_assert(xe, !has_sriov); + + xe_assert(xe, !xe->sriov.__mode); + xe->sriov.__mode = mode; + xe_assert(xe, xe->sriov.__mode); + + if (has_sriov) + drm_info(&xe->drm, "Running in %s mode\n", + xe_sriov_mode_to_string(xe_device_sriov_mode(xe))); +} diff --git a/drivers/gpu/drm/xe/xe_sriov.h b/drivers/gpu/drm/xe/xe_sriov.h new file mode 100644 index 000000000000..5af73a3172b0 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_sriov.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_SRIOV_H_ +#define _XE_SRIOV_H_ + +#include "xe_assert.h" +#include "xe_device_types.h" +#include "xe_sriov_types.h" + +const char *xe_sriov_mode_to_string(enum xe_sriov_mode mode); + +void xe_sriov_probe_early(struct xe_device *xe, bool has_sriov); + +static inline enum xe_sriov_mode xe_device_sriov_mode(struct xe_device *xe) +{ + xe_assert(xe, xe->sriov.__mode); + return xe->sriov.__mode; +} + +static inline bool xe_device_is_sriov_pf(struct xe_device *xe) +{ + return xe_device_sriov_mode(xe) == XE_SRIOV_MODE_PF; +} + +static inline bool xe_device_is_sriov_vf(struct xe_device *xe) +{ + return xe_device_sriov_mode(xe) == XE_SRIOV_MODE_VF; +} + +#ifdef CONFIG_PCI_IOV +#define IS_SRIOV_PF(xe) xe_device_is_sriov_pf(xe) +#else +#define IS_SRIOV_PF(xe) (typecheck(struct xe_device *, (xe)) && false) +#endif +#define IS_SRIOV_VF(xe) xe_device_is_sriov_vf(xe) + +#define IS_SRIOV(xe) (IS_SRIOV_PF(xe) || IS_SRIOV_VF(xe)) + +#endif diff --git a/drivers/gpu/drm/xe/xe_sriov_printk.h b/drivers/gpu/drm/xe/xe_sriov_printk.h new file mode 100644 index 000000000000..117e1d541692 --- 
/dev/null +++ b/drivers/gpu/drm/xe/xe_sriov_printk.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_SRIOV_PRINTK_H_ +#define _XE_SRIOV_PRINTK_H_ + +#include <drm/drm_print.h> + +#include "xe_device_types.h" +#include "xe_sriov_types.h" + +#define xe_sriov_printk_prefix(xe) \ + ((xe)->sriov.__mode == XE_SRIOV_MODE_PF ? "PF: " : \ + (xe)->sriov.__mode == XE_SRIOV_MODE_VF ? "VF: " : "") + +#define xe_sriov_printk(xe, _level, fmt, ...) \ + drm_##_level(&(xe)->drm, "%s" fmt, xe_sriov_printk_prefix(xe), ##__VA_ARGS__) + +#define xe_sriov_err(xe, fmt, ...) \ + xe_sriov_printk((xe), err, fmt, ##__VA_ARGS__) + +#define xe_sriov_err_ratelimited(xe, fmt, ...) \ + xe_sriov_printk((xe), err_ratelimited, fmt, ##__VA_ARGS__) + +#define xe_sriov_warn(xe, fmt, ...) \ + xe_sriov_printk((xe), warn, fmt, ##__VA_ARGS__) + +#define xe_sriov_notice(xe, fmt, ...) \ + xe_sriov_printk((xe), notice, fmt, ##__VA_ARGS__) + +#define xe_sriov_info(xe, fmt, ...) \ + xe_sriov_printk((xe), info, fmt, ##__VA_ARGS__) + +#define xe_sriov_dbg(xe, fmt, ...) \ + xe_sriov_printk((xe), dbg, fmt, ##__VA_ARGS__) + +/* for low level noisy debug messages */ +#ifdef CONFIG_DRM_XE_DEBUG_SRIOV +#define xe_sriov_dbg_verbose(xe, fmt, ...) xe_sriov_dbg(xe, fmt, ##__VA_ARGS__) +#else +#define xe_sriov_dbg_verbose(xe, fmt, ...) typecheck(struct xe_device *, (xe)) +#endif + +#endif diff --git a/drivers/gpu/drm/xe/xe_sriov_types.h b/drivers/gpu/drm/xe/xe_sriov_types.h new file mode 100644 index 000000000000..999a4311b98b --- /dev/null +++ b/drivers/gpu/drm/xe/xe_sriov_types.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_SRIOV_TYPES_H_ +#define _XE_SRIOV_TYPES_H_ + +#include <linux/build_bug.h> + +/** + * enum xe_sriov_mode - SR-IOV mode + * @XE_SRIOV_MODE_NONE: bare-metal mode (non-virtualized) + * @XE_SRIOV_MODE_PF: SR-IOV Physical Function (PF) mode + * @XE_SRIOV_MODE_VF: SR-IOV Virtual Function (VF) mode + */ +enum xe_sriov_mode { + /* + * Note: We don't use default enum value 0 to allow catch any too early + * attempt of checking the SR-IOV mode prior to the actual mode probe. + */ + XE_SRIOV_MODE_NONE = 1, + XE_SRIOV_MODE_PF, + XE_SRIOV_MODE_VF, +}; +static_assert(XE_SRIOV_MODE_NONE); + +#endif diff --git a/drivers/gpu/drm/xe/xe_step.c b/drivers/gpu/drm/xe/xe_step.c new file mode 100644 index 000000000000..eaf1b718f26c --- /dev/null +++ b/drivers/gpu/drm/xe/xe_step.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_step.h" + +#include <linux/bitfield.h> + +#include "xe_device.h" +#include "xe_platform_types.h" + +/* + * Provide mapping between PCI's revision ID to the individual GMD + * (Graphics/Media/Display) stepping values that can be compared numerically. + * + * Some platforms may have unusual ways of mapping PCI revision ID to GMD + * steppings. E.g., in some cases a higher PCI revision may translate to a + * lower stepping of the GT and/or display IP. + * + * Also note that some revisions/steppings may have been set aside as + * placeholders but never materialized in real hardware; in those cases there + * may be jumps in the revision IDs or stepping values in the tables below. + */ + +/* + * Some platforms always have the same stepping value for GT and display; + * use a macro to define these to make it easier to identify the platforms + * where the two steppings can deviate. 
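+ *
+ * For example, a table entry written as { COMMON_STEP(A0) } is just
+ * shorthand for setting .graphics, .media and .display all to STEP_A0
+ * (see the macros below).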
+ */ +#define COMMON_GT_MEDIA_STEP(x_) \ + .graphics = STEP_##x_, \ + .media = STEP_##x_ + +#define COMMON_STEP(x_) \ + COMMON_GT_MEDIA_STEP(x_), \ + .graphics = STEP_##x_, \ + .media = STEP_##x_, \ + .display = STEP_##x_ + +__diag_push(); +__diag_ignore_all("-Woverride-init", "Allow field overrides in table"); + +/* Same GT stepping between tgl_uy_revids and tgl_revids don't mean the same HW */ +static const struct xe_step_info tgl_revids[] = { + [0] = { COMMON_GT_MEDIA_STEP(A0), .display = STEP_B0 }, + [1] = { COMMON_GT_MEDIA_STEP(B0), .display = STEP_D0 }, +}; + +static const struct xe_step_info dg1_revids[] = { + [0] = { COMMON_STEP(A0) }, + [1] = { COMMON_STEP(B0) }, +}; + +static const struct xe_step_info adls_revids[] = { + [0x0] = { COMMON_GT_MEDIA_STEP(A0), .display = STEP_A0 }, + [0x1] = { COMMON_GT_MEDIA_STEP(A0), .display = STEP_A2 }, + [0x4] = { COMMON_GT_MEDIA_STEP(B0), .display = STEP_B0 }, + [0x8] = { COMMON_GT_MEDIA_STEP(C0), .display = STEP_B0 }, + [0xC] = { COMMON_GT_MEDIA_STEP(D0), .display = STEP_C0 }, +}; + +static const struct xe_step_info adls_rpls_revids[] = { + [0x4] = { COMMON_GT_MEDIA_STEP(D0), .display = STEP_D0 }, + [0xC] = { COMMON_GT_MEDIA_STEP(D0), .display = STEP_C0 }, +}; + +static const struct xe_step_info adlp_revids[] = { + [0x0] = { COMMON_GT_MEDIA_STEP(A0), .display = STEP_A0 }, + [0x4] = { COMMON_GT_MEDIA_STEP(B0), .display = STEP_B0 }, + [0x8] = { COMMON_GT_MEDIA_STEP(C0), .display = STEP_C0 }, + [0xC] = { COMMON_GT_MEDIA_STEP(C0), .display = STEP_D0 }, +}; + +static const struct xe_step_info adlp_rpl_revids[] = { + [0x4] = { COMMON_GT_MEDIA_STEP(C0), .display = STEP_E0 }, +}; + +static const struct xe_step_info adln_revids[] = { + [0x0] = { COMMON_GT_MEDIA_STEP(A0), .display = STEP_D0 }, +}; + +static const struct xe_step_info dg2_g10_revid_step_tbl[] = { + [0x0] = { COMMON_GT_MEDIA_STEP(A0), .display = STEP_A0 }, + [0x1] = { COMMON_GT_MEDIA_STEP(A1), .display = STEP_A0 }, + [0x4] = { COMMON_GT_MEDIA_STEP(B0), .display = STEP_B0 }, + [0x8] = { COMMON_GT_MEDIA_STEP(C0), .display = STEP_C0 }, +}; + +static const struct xe_step_info dg2_g11_revid_step_tbl[] = { + [0x0] = { COMMON_GT_MEDIA_STEP(A0), .display = STEP_B0 }, + [0x4] = { COMMON_GT_MEDIA_STEP(B0), .display = STEP_C0 }, + [0x5] = { COMMON_GT_MEDIA_STEP(B1), .display = STEP_C0 }, +}; + +static const struct xe_step_info dg2_g12_revid_step_tbl[] = { + [0x0] = { COMMON_GT_MEDIA_STEP(A0), .display = STEP_C0 }, + [0x1] = { COMMON_GT_MEDIA_STEP(A1), .display = STEP_C0 }, +}; + +static const struct xe_step_info pvc_revid_step_tbl[] = { + [0x5] = { .graphics = STEP_B0 }, + [0x6] = { .graphics = STEP_B1 }, + [0x7] = { .graphics = STEP_C0 }, +}; + +static const int pvc_basedie_subids[] = { + [0x3] = STEP_B0, + [0x4] = STEP_B1, + [0x5] = STEP_B3, +}; + +__diag_pop(); + +/** + * xe_step_pre_gmdid_get - Determine IP steppings from PCI revid + * @xe: Xe device + * + * Convert the PCI revid into proper IP steppings. This should only be + * used on platforms that do not have GMD_ID support. 
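+ * + * Returns: the decoded steppings as a &struct xe_step_info (graphics, media, + * display and, on PVC, base die).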
+ */ +struct xe_step_info xe_step_pre_gmdid_get(struct xe_device *xe) +{ + const struct xe_step_info *revids = NULL; + struct xe_step_info step = {}; + u16 revid = xe->info.revid; + int size = 0; + const int *basedie_info = NULL; + int basedie_size = 0; + int baseid = 0; + + if (xe->info.platform == XE_PVC) { + baseid = FIELD_GET(GENMASK(5, 3), xe->info.revid); + revid = FIELD_GET(GENMASK(2, 0), xe->info.revid); + revids = pvc_revid_step_tbl; + size = ARRAY_SIZE(pvc_revid_step_tbl); + basedie_info = pvc_basedie_subids; + basedie_size = ARRAY_SIZE(pvc_basedie_subids); + } else if (xe->info.subplatform == XE_SUBPLATFORM_DG2_G10) { + revids = dg2_g10_revid_step_tbl; + size = ARRAY_SIZE(dg2_g10_revid_step_tbl); + } else if (xe->info.subplatform == XE_SUBPLATFORM_DG2_G11) { + revids = dg2_g11_revid_step_tbl; + size = ARRAY_SIZE(dg2_g11_revid_step_tbl); + } else if (xe->info.subplatform == XE_SUBPLATFORM_DG2_G12) { + revids = dg2_g12_revid_step_tbl; + size = ARRAY_SIZE(dg2_g12_revid_step_tbl); + } else if (xe->info.platform == XE_ALDERLAKE_N) { + revids = adln_revids; + size = ARRAY_SIZE(adln_revids); + } else if (xe->info.subplatform == XE_SUBPLATFORM_ALDERLAKE_S_RPLS) { + revids = adls_rpls_revids; + size = ARRAY_SIZE(adls_rpls_revids); + } else if (xe->info.subplatform == XE_SUBPLATFORM_ALDERLAKE_P_RPLU) { + revids = adlp_rpl_revids; + size = ARRAY_SIZE(adlp_rpl_revids); + } else if (xe->info.platform == XE_ALDERLAKE_P) { + revids = adlp_revids; + size = ARRAY_SIZE(adlp_revids); + } else if (xe->info.platform == XE_ALDERLAKE_S) { + revids = adls_revids; + size = ARRAY_SIZE(adls_revids); + } else if (xe->info.platform == XE_DG1) { + revids = dg1_revids; + size = ARRAY_SIZE(dg1_revids); + } else if (xe->info.platform == XE_TIGERLAKE) { + revids = tgl_revids; + size = ARRAY_SIZE(tgl_revids); + } + + /* Not using the stepping scheme for the platform yet. */ + if (!revids) + return step; + + if (revid < size && revids[revid].graphics != STEP_NONE) { + step = revids[revid]; + } else { + drm_warn(&xe->drm, "Unknown revid 0x%02x\n", revid); + + /* + * If we hit a gap in the revid array, use the information for + * the next revid. + * + * This may be wrong in all sorts of ways, especially if the + * steppings in the array are not monotonically increasing, but + * it's better than defaulting to 0. + */ + while (revid < size && revids[revid].graphics == STEP_NONE) + revid++; + + if (revid < size) { + drm_dbg(&xe->drm, "Using steppings for revid 0x%02x\n", + revid); + step = revids[revid]; + } else { + drm_dbg(&xe->drm, "Using future steppings\n"); + step.graphics = STEP_FUTURE; + step.display = STEP_FUTURE; + } + } + + drm_WARN_ON(&xe->drm, step.graphics == STEP_NONE); + + if (basedie_info && basedie_size) { + if (baseid < basedie_size && basedie_info[baseid] != STEP_NONE) { + step.basedie = basedie_info[baseid]; + } else { + drm_warn(&xe->drm, "Unknown baseid 0x%02x\n", baseid); + step.basedie = STEP_FUTURE; + } + } + + return step; +} + +/** + * xe_step_gmdid_get - Determine IP steppings from GMD_ID revid fields + * @xe: Xe device + * @graphics_gmdid_revid: value of graphics GMD_ID register's revid field + * @media_gmdid_revid: value of media GMD_ID register's revid field + * + * Convert the revid fields of the GMD_ID registers into proper IP steppings. + * + * GMD_ID revid values are currently expected to have consistent meanings on + * all platforms: major steppings (A0, B0, etc.) are 4 apart, with minor + * steppings (A1, A2, etc.) taking the values in between. 
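+ * + * For example, a GMD_ID revid of 0 decodes to A0, 1 to A1, 4 to B0 and 8 to + * C0; values newer than the last known stepping are clamped to STEP_FUTURE.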
+ */ +struct xe_step_info xe_step_gmdid_get(struct xe_device *xe, + u32 graphics_gmdid_revid, + u32 media_gmdid_revid) +{ + struct xe_step_info step = { + .graphics = STEP_A0 + graphics_gmdid_revid, + .media = STEP_A0 + media_gmdid_revid, + }; + + if (step.graphics >= STEP_FUTURE) { + step.graphics = STEP_FUTURE; + drm_dbg(&xe->drm, "Graphics GMD_ID revid value %d treated as future stepping\n", + graphics_gmdid_revid); + } + + if (step.media >= STEP_FUTURE) { + step.media = STEP_FUTURE; + drm_dbg(&xe->drm, "Media GMD_ID revid value %d treated as future stepping\n", + media_gmdid_revid); + } + + return step; +} + +#define STEP_NAME_CASE(name) \ + case STEP_##name: \ + return #name; + +const char *xe_step_name(enum xe_step step) +{ + switch (step) { + STEP_NAME_LIST(STEP_NAME_CASE); + + default: + return "**"; + } +} diff --git a/drivers/gpu/drm/xe/xe_step.h b/drivers/gpu/drm/xe/xe_step.h new file mode 100644 index 000000000000..686cb59200c2 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_step.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_STEP_H_ +#define _XE_STEP_H_ + +#include <linux/types.h> + +#include "xe_step_types.h" + +struct xe_device; + +struct xe_step_info xe_step_pre_gmdid_get(struct xe_device *xe); +struct xe_step_info xe_step_gmdid_get(struct xe_device *xe, + u32 graphics_gmdid_revid, + u32 media_gmdid_revid); +static inline u32 xe_step_to_gmdid(enum xe_step step) { return step - STEP_A0; } + +const char *xe_step_name(enum xe_step step); + +#endif diff --git a/drivers/gpu/drm/xe/xe_step_types.h b/drivers/gpu/drm/xe/xe_step_types.h new file mode 100644 index 000000000000..ccc9b4795e95 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_step_types.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_STEP_TYPES_H_ +#define _XE_STEP_TYPES_H_ + +#include <linux/types.h> + +struct xe_step_info { + u8 graphics; + u8 media; + u8 display; + u8 basedie; +}; + +#define STEP_ENUM_VAL(name) STEP_##name, + +#define STEP_NAME_LIST(func) \ + func(A0) \ + func(A1) \ + func(A2) \ + func(A3) \ + func(B0) \ + func(B1) \ + func(B2) \ + func(B3) \ + func(C0) \ + func(C1) \ + func(C2) \ + func(C3) \ + func(D0) \ + func(D1) \ + func(D2) \ + func(D3) \ + func(E0) + +/* + * Symbolic steppings that do not match the hardware. These are valid both as gt + * and display steppings as symbolic names. 
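+ * + * STEP_FUTURE marks hardware revisions newer than any stepping known to the + * driver, and STEP_FOREVER sorts after every other value so it can serve as + * an open-ended upper bound in stepping range checks.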
+ */ +enum xe_step { + STEP_NONE = 0, + STEP_NAME_LIST(STEP_ENUM_VAL) + STEP_FUTURE, + STEP_FOREVER, +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_sync.c b/drivers/gpu/drm/xe/xe_sync.c new file mode 100644 index 000000000000..e4c220cf9115 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_sync.c @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include "xe_sync.h" + +#include <linux/dma-fence-array.h> +#include <linux/kthread.h> +#include <linux/sched/mm.h> +#include <linux/uaccess.h> + +#include <drm/drm_print.h> +#include <drm/drm_syncobj.h> +#include <drm/xe_drm.h> + +#include "xe_device_types.h" +#include "xe_exec_queue.h" +#include "xe_macros.h" +#include "xe_sched_job_types.h" + +struct user_fence { + struct xe_device *xe; + struct kref refcount; + struct dma_fence_cb cb; + struct work_struct worker; + struct mm_struct *mm; + u64 __user *addr; + u64 value; +}; + +static void user_fence_destroy(struct kref *kref) +{ + struct user_fence *ufence = container_of(kref, struct user_fence, + refcount); + + mmdrop(ufence->mm); + kfree(ufence); +} + +static void user_fence_get(struct user_fence *ufence) +{ + kref_get(&ufence->refcount); +} + +static void user_fence_put(struct user_fence *ufence) +{ + kref_put(&ufence->refcount, user_fence_destroy); +} + +static struct user_fence *user_fence_create(struct xe_device *xe, u64 addr, + u64 value) +{ + struct user_fence *ufence; + + ufence = kmalloc(sizeof(*ufence), GFP_KERNEL); + if (!ufence) + return NULL; + + ufence->xe = xe; + kref_init(&ufence->refcount); + ufence->addr = u64_to_user_ptr(addr); + ufence->value = value; + ufence->mm = current->mm; + mmgrab(ufence->mm); + + return ufence; +} + +static void user_fence_worker(struct work_struct *w) +{ + struct user_fence *ufence = container_of(w, struct user_fence, worker); + + if (mmget_not_zero(ufence->mm)) { + kthread_use_mm(ufence->mm); + if (copy_to_user(ufence->addr, &ufence->value, sizeof(ufence->value))) + XE_WARN_ON("Copy to user failed"); + kthread_unuse_mm(ufence->mm); + mmput(ufence->mm); + } + + wake_up_all(&ufence->xe->ufence_wq); + user_fence_put(ufence); +} + +static void kick_ufence(struct user_fence *ufence, struct dma_fence *fence) +{ + INIT_WORK(&ufence->worker, user_fence_worker); + queue_work(ufence->xe->ordered_wq, &ufence->worker); + dma_fence_put(fence); +} + +static void user_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb) +{ + struct user_fence *ufence = container_of(cb, struct user_fence, cb); + + kick_ufence(ufence, fence); +} + +int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef, + struct xe_sync_entry *sync, + struct drm_xe_sync __user *sync_user, + unsigned int flags) +{ + struct drm_xe_sync sync_in; + int err; + bool exec = flags & SYNC_PARSE_FLAG_EXEC; + bool in_lr_mode = flags & SYNC_PARSE_FLAG_LR_MODE; + bool disallow_user_fence = flags & SYNC_PARSE_FLAG_DISALLOW_USER_FENCE; + bool signal; + + if (copy_from_user(&sync_in, sync_user, sizeof(*sync_user))) + return -EFAULT; + + if (XE_IOCTL_DBG(xe, sync_in.flags & ~DRM_XE_SYNC_FLAG_SIGNAL) || + XE_IOCTL_DBG(xe, sync_in.reserved[0] || sync_in.reserved[1])) + return -EINVAL; + + signal = sync_in.flags & DRM_XE_SYNC_FLAG_SIGNAL; + switch (sync_in.type) { + case DRM_XE_SYNC_TYPE_SYNCOBJ: + if (XE_IOCTL_DBG(xe, in_lr_mode && signal)) + return -EOPNOTSUPP; + + if (XE_IOCTL_DBG(xe, upper_32_bits(sync_in.addr))) + return -EINVAL; + + sync->syncobj = drm_syncobj_find(xef->drm, sync_in.handle); + if (XE_IOCTL_DBG(xe, !sync->syncobj)) + return -ENOENT; + 
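+ /* + * Wait semantics: grab the syncobj's current fence here so it can be + * added as a job dependency; when signalling, the job's fence is + * installed into the syncobj later via xe_sync_entry_signal(). + */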
+ if (!signal) { + sync->fence = drm_syncobj_fence_get(sync->syncobj); + if (XE_IOCTL_DBG(xe, !sync->fence)) + return -EINVAL; + } + break; + + case DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ: + if (XE_IOCTL_DBG(xe, in_lr_mode && signal)) + return -EOPNOTSUPP; + + if (XE_IOCTL_DBG(xe, upper_32_bits(sync_in.addr))) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, sync_in.timeline_value == 0)) + return -EINVAL; + + sync->syncobj = drm_syncobj_find(xef->drm, sync_in.handle); + if (XE_IOCTL_DBG(xe, !sync->syncobj)) + return -ENOENT; + + if (signal) { + sync->chain_fence = dma_fence_chain_alloc(); + if (!sync->chain_fence) + return -ENOMEM; + } else { + sync->fence = drm_syncobj_fence_get(sync->syncobj); + if (XE_IOCTL_DBG(xe, !sync->fence)) + return -EINVAL; + + err = dma_fence_chain_find_seqno(&sync->fence, + sync_in.timeline_value); + if (err) + return err; + } + break; + + case DRM_XE_SYNC_TYPE_USER_FENCE: + if (XE_IOCTL_DBG(xe, disallow_user_fence)) + return -EOPNOTSUPP; + + if (XE_IOCTL_DBG(xe, !signal)) + return -EOPNOTSUPP; + + if (XE_IOCTL_DBG(xe, sync_in.addr & 0x7)) + return -EINVAL; + + if (exec) { + sync->addr = sync_in.addr; + } else { + sync->ufence = user_fence_create(xe, sync_in.addr, + sync_in.timeline_value); + if (XE_IOCTL_DBG(xe, !sync->ufence)) + return -ENOMEM; + } + + break; + + default: + return -EINVAL; + } + + sync->type = sync_in.type; + sync->flags = sync_in.flags; + sync->timeline_value = sync_in.timeline_value; + + return 0; +} + +int xe_sync_entry_wait(struct xe_sync_entry *sync) +{ + if (sync->fence) + dma_fence_wait(sync->fence, true); + + return 0; +} + +int xe_sync_entry_add_deps(struct xe_sync_entry *sync, struct xe_sched_job *job) +{ + int err; + + if (sync->fence) { + err = drm_sched_job_add_dependency(&job->drm, + dma_fence_get(sync->fence)); + if (err) { + dma_fence_put(sync->fence); + return err; + } + } + + return 0; +} + +void xe_sync_entry_signal(struct xe_sync_entry *sync, struct xe_sched_job *job, + struct dma_fence *fence) +{ + if (!(sync->flags & DRM_XE_SYNC_FLAG_SIGNAL)) + return; + + if (sync->chain_fence) { + drm_syncobj_add_point(sync->syncobj, sync->chain_fence, + fence, sync->timeline_value); + /* + * The chain's ownership is transferred to the + * timeline. + */ + sync->chain_fence = NULL; + } else if (sync->syncobj) { + drm_syncobj_replace_fence(sync->syncobj, fence); + } else if (sync->ufence) { + int err; + + dma_fence_get(fence); + user_fence_get(sync->ufence); + err = dma_fence_add_callback(fence, &sync->ufence->cb, + user_fence_cb); + if (err == -ENOENT) { + kick_ufence(sync->ufence, fence); + } else if (err) { + XE_WARN_ON("failed to add user fence"); + user_fence_put(sync->ufence); + dma_fence_put(fence); + } + } else if (sync->type == DRM_XE_SYNC_TYPE_USER_FENCE) { + job->user_fence.used = true; + job->user_fence.addr = sync->addr; + job->user_fence.value = sync->timeline_value; + } +} + +void xe_sync_entry_cleanup(struct xe_sync_entry *sync) +{ + if (sync->syncobj) + drm_syncobj_put(sync->syncobj); + if (sync->fence) + dma_fence_put(sync->fence); + if (sync->chain_fence) + dma_fence_put(&sync->chain_fence->base); + if (sync->ufence) + user_fence_put(sync->ufence); +} + +/** + * xe_sync_in_fence_get() - Get a fence from syncs, exec queue, and VM + * @sync: input syncs + * @num_sync: number of syncs + * @q: exec queue + * @vm: VM + * + * Get a fence from syncs, exec queue, and VM. If syncs contain in-fences create + * and return a composite fence of all in-fences + last fence. If no in-fences + * return last fence on input exec queue. 
Caller must drop reference to + * returned fence. + * + * Return: fence on success, ERR_PTR(-ENOMEM) on failure + */ +struct dma_fence * +xe_sync_in_fence_get(struct xe_sync_entry *sync, int num_sync, + struct xe_exec_queue *q, struct xe_vm *vm) +{ + struct dma_fence **fences = NULL; + struct dma_fence_array *cf = NULL; + struct dma_fence *fence; + int i, num_in_fence = 0, current_fence = 0; + + lockdep_assert_held(&vm->lock); + + /* Count in-fences */ + for (i = 0; i < num_sync; ++i) { + if (sync[i].fence) { + ++num_in_fence; + fence = sync[i].fence; + } + } + + /* Easy case... */ + if (!num_in_fence) { + fence = xe_exec_queue_last_fence_get(q, vm); + dma_fence_get(fence); + return fence; + } + + /* Create composite fence */ + fences = kmalloc_array(num_in_fence + 1, sizeof(*fences), GFP_KERNEL); + if (!fences) + return ERR_PTR(-ENOMEM); + for (i = 0; i < num_sync; ++i) { + if (sync[i].fence) { + dma_fence_get(sync[i].fence); + fences[current_fence++] = sync[i].fence; + } + } + fences[current_fence++] = xe_exec_queue_last_fence_get(q, vm); + dma_fence_get(fences[current_fence - 1]); + cf = dma_fence_array_create(num_in_fence, fences, + vm->composite_fence_ctx, + vm->composite_fence_seqno++, + false); + if (!cf) { + --vm->composite_fence_seqno; + goto err_out; + } + + return &cf->base; + +err_out: + while (current_fence) + dma_fence_put(fences[--current_fence]); + kfree(fences); + kfree(cf); + + return ERR_PTR(-ENOMEM); +} diff --git a/drivers/gpu/drm/xe/xe_sync.h b/drivers/gpu/drm/xe/xe_sync.h new file mode 100644 index 000000000000..d284afbe917c --- /dev/null +++ b/drivers/gpu/drm/xe/xe_sync.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef _XE_SYNC_H_ +#define _XE_SYNC_H_ + +#include "xe_sync_types.h" + +struct xe_device; +struct xe_exec_queue; +struct xe_file; +struct xe_sched_job; +struct xe_vm; + +#define SYNC_PARSE_FLAG_EXEC BIT(0) +#define SYNC_PARSE_FLAG_LR_MODE BIT(1) +#define SYNC_PARSE_FLAG_DISALLOW_USER_FENCE BIT(2) + +int xe_sync_entry_parse(struct xe_device *xe, struct xe_file *xef, + struct xe_sync_entry *sync, + struct drm_xe_sync __user *sync_user, + unsigned int flags); +int xe_sync_entry_wait(struct xe_sync_entry *sync); +int xe_sync_entry_add_deps(struct xe_sync_entry *sync, + struct xe_sched_job *job); +void xe_sync_entry_signal(struct xe_sync_entry *sync, + struct xe_sched_job *job, + struct dma_fence *fence); +void xe_sync_entry_cleanup(struct xe_sync_entry *sync); +struct dma_fence * +xe_sync_in_fence_get(struct xe_sync_entry *sync, int num_sync, + struct xe_exec_queue *q, struct xe_vm *vm); + +#endif diff --git a/drivers/gpu/drm/xe/xe_sync_types.h b/drivers/gpu/drm/xe/xe_sync_types.h new file mode 100644 index 000000000000..852db5e7884f --- /dev/null +++ b/drivers/gpu/drm/xe/xe_sync_types.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_SYNC_TYPES_H_ +#define _XE_SYNC_TYPES_H_ + +#include <linux/types.h> + +struct drm_syncobj; +struct dma_fence; +struct dma_fence_chain; +struct drm_xe_sync; +struct user_fence; + +struct xe_sync_entry { + struct drm_syncobj *syncobj; + struct dma_fence *fence; + struct dma_fence_chain *chain_fence; + struct user_fence *ufence; + u64 addr; + u64 timeline_value; + u32 type; + u32 flags; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_tile.c b/drivers/gpu/drm/xe/xe_tile.c new file mode 100644 index 000000000000..044c20881de7 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_tile.c @@ -0,0 +1,185 @@ +// 
SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include <drm/drm_managed.h> + +#include "xe_device.h" +#include "xe_ggtt.h" +#include "xe_gt.h" +#include "xe_migrate.h" +#include "xe_sa.h" +#include "xe_tile.h" +#include "xe_tile_sysfs.h" +#include "xe_ttm_vram_mgr.h" +#include "xe_wa.h" + +/** + * DOC: Multi-tile Design + * + * Different vendors use the term "tile" a bit differently, but in the Intel + * world, a 'tile' is pretty close to what most people would think of as being + * a complete GPU. When multiple GPUs are placed behind a single PCI device, + * that's what is referred to as a "multi-tile device." In such cases, pretty + * much all hardware is replicated per-tile, although certain responsibilities + * like PCI communication, reporting of interrupts to the OS, etc. are handled + * solely by the "root tile." A multi-tile platform takes care of tying the + * tiles together in a way such that interrupt notifications from remote tiles + * are forwarded to the root tile, the per-tile vram is combined into a single + * address space, etc. + * + * In contrast, a "GT" (which officially stands for "Graphics Technology") is + * the subset of a GPU/tile that is responsible for implementing graphics + * and/or media operations. The GT is where a lot of the driver implementation + * happens since it's where the hardware engines, the execution units, and the + * GuC all reside. + * + * Historically most Intel devices were single-tile devices that contained a + * single GT. PVC is an example of an Intel platform built on a multi-tile + * design (i.e., multiple GPUs behind a single PCI device); each PVC tile only + * has a single GT. In contrast, platforms like MTL that have separate chips + * for render and media IP are still only a single logical GPU, but the + * graphics and media IP blocks are each exposed as a separate GT within that + * single GPU. This is important from a software perspective because multi-GT + * platforms like MTL only replicate a subset of the GPU hardware and behave + * differently than multi-tile platforms like PVC where nearly everything is + * replicated. + * + * Per-tile functionality (shared by all GTs within the tile): + * - Complete 4MB MMIO space (containing SGunit/SoC registers, GT + * registers, display registers, etc.) + * - Global GTT + * - VRAM (if discrete) + * - Interrupt flows + * - Migration context + * - kernel batchbuffer pool + * - Primary GT + * - Media GT (if media version >= 13) + * + * Per-GT functionality: + * - GuC + * - Hardware engines + * - Programmable hardware units (subslices, EUs) + * - GSI subset of registers (multiple copies of these registers reside + * within the complete MMIO space provided by the tile, but at different + * offsets --- 0 for render, 0x380000 for media) + * - Multicast register steering + * - TLBs to cache page table translations + * - Reset capability + * - Low-level power management (e.g., C6) + * - Clock frequency + * - MOCS and PAT programming + */ + +/** + * xe_tile_alloc - Perform per-tile memory allocation + * @tile: Tile to perform allocations for + * + * Allocates various per-tile data structures using DRM-managed allocations. + * Does not touch the hardware. + * + * Returns -ENOMEM if allocations fail, otherwise 0. 
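+ * + * Currently this covers the per-tile GGTT and the VRAM manager; being + * DRM-managed, both are released automatically with the drm_device.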
+ */ +static int xe_tile_alloc(struct xe_tile *tile) +{ + struct drm_device *drm = &tile_to_xe(tile)->drm; + + tile->mem.ggtt = drmm_kzalloc(drm, sizeof(*tile->mem.ggtt), + GFP_KERNEL); + if (!tile->mem.ggtt) + return -ENOMEM; + tile->mem.ggtt->tile = tile; + + tile->mem.vram_mgr = drmm_kzalloc(drm, sizeof(*tile->mem.vram_mgr), GFP_KERNEL); + if (!tile->mem.vram_mgr) + return -ENOMEM; + + return 0; +} + +/** + * xe_tile_init_early - Initialize the tile and primary GT + * @tile: Tile to initialize + * @xe: Parent Xe device + * @id: Tile ID + * + * Initializes per-tile resources that don't require any interactions with the + * hardware or any knowledge about the Graphics/Media IP version. + * + * Returns: 0 on success, negative error code on error. + */ +int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id) +{ + int err; + + tile->xe = xe; + tile->id = id; + + err = xe_tile_alloc(tile); + if (err) + return err; + + tile->primary_gt = xe_gt_alloc(tile); + if (IS_ERR(tile->primary_gt)) + return PTR_ERR(tile->primary_gt); + + return 0; +} + +static int tile_ttm_mgr_init(struct xe_tile *tile) +{ + struct xe_device *xe = tile_to_xe(tile); + int err; + + if (tile->mem.vram.usable_size) { + err = xe_ttm_vram_mgr_init(tile, tile->mem.vram_mgr); + if (err) + return err; + xe->info.mem_region_mask |= BIT(tile->id) << 1; + } + + return 0; +} + +/** + * xe_tile_init_noalloc - Init tile up to the point where allocations can happen. + * @tile: The tile to initialize. + * + * This function prepares the tile to allow memory allocations to VRAM, but is + * not allowed to allocate memory itself. This state is useful for display + * readout, because the inherited display framebuffer will otherwise be + * overwritten as it is usually put at the start of VRAM. + * + * Note that since this is tile initialization, it should not perform any + * GT-specific operations, and thus does not need to hold GT forcewake. + * + * Returns: 0 on success, negative error code on error. 
+ */ +int xe_tile_init_noalloc(struct xe_tile *tile) +{ + int err; + + xe_device_mem_access_get(tile_to_xe(tile)); + + err = tile_ttm_mgr_init(tile); + if (err) + goto err_mem_access; + + tile->mem.kernel_bb_pool = xe_sa_bo_manager_init(tile, SZ_1M, 16); + if (IS_ERR(tile->mem.kernel_bb_pool)) + err = PTR_ERR(tile->mem.kernel_bb_pool); + + xe_wa_apply_tile_workarounds(tile); + + xe_tile_sysfs_init(tile); + +err_mem_access: + xe_device_mem_access_put(tile_to_xe(tile)); + return err; +} + +void xe_tile_migrate_wait(struct xe_tile *tile) +{ + xe_migrate_wait(tile->migrate); +} diff --git a/drivers/gpu/drm/xe/xe_tile.h b/drivers/gpu/drm/xe/xe_tile.h new file mode 100644 index 000000000000..1c9e42ade6b0 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_tile.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_TILE_H_ +#define _XE_TILE_H_ + +#include "xe_device_types.h" + +struct xe_tile; + +int xe_tile_init_early(struct xe_tile *tile, struct xe_device *xe, u8 id); +int xe_tile_init_noalloc(struct xe_tile *tile); + +void xe_tile_migrate_wait(struct xe_tile *tile); + +#endif diff --git a/drivers/gpu/drm/xe/xe_tile_sysfs.c b/drivers/gpu/drm/xe/xe_tile_sysfs.c new file mode 100644 index 000000000000..0f8d3e7fce46 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_tile_sysfs.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <drm/drm_managed.h> + +#include "xe_tile.h" +#include "xe_tile_sysfs.h" + +static void xe_tile_sysfs_kobj_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static const struct kobj_type xe_tile_sysfs_kobj_type = { + .release = xe_tile_sysfs_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, +}; + +static void tile_sysfs_fini(struct drm_device *drm, void *arg) +{ + struct xe_tile *tile = arg; + + kobject_put(tile->sysfs); +} + +void xe_tile_sysfs_init(struct xe_tile *tile) +{ + struct xe_device *xe = tile_to_xe(tile); + struct device *dev = xe->drm.dev; + struct kobj_tile *kt; + int err; + + kt = kzalloc(sizeof(*kt), GFP_KERNEL); + if (!kt) + return; + + kobject_init(&kt->base, &xe_tile_sysfs_kobj_type); + kt->tile = tile; + + err = kobject_add(&kt->base, &dev->kobj, "tile%d", tile->id); + if (err) { + kobject_put(&kt->base); + drm_warn(&xe->drm, "failed to register TILE sysfs directory, err: %d\n", err); + return; + } + + tile->sysfs = &kt->base; + + err = drmm_add_action_or_reset(&xe->drm, tile_sysfs_fini, tile); + if (err) + drm_warn(&xe->drm, "%s: drmm_add_action_or_reset failed, err: %d\n", + __func__, err); +} diff --git a/drivers/gpu/drm/xe/xe_tile_sysfs.h b/drivers/gpu/drm/xe/xe_tile_sysfs.h new file mode 100644 index 000000000000..e4f065039eba --- /dev/null +++ b/drivers/gpu/drm/xe/xe_tile_sysfs.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_TILE_SYSFS_H_ +#define _XE_TILE_SYSFS_H_ + +#include "xe_tile_sysfs_types.h" + +void xe_tile_sysfs_init(struct xe_tile *tile); + +static inline struct xe_tile * +kobj_to_tile(struct kobject *kobj) +{ + return container_of(kobj, struct kobj_tile, base)->tile; +} + +#endif /* _XE_TILE_SYSFS_H_ */ diff --git a/drivers/gpu/drm/xe/xe_tile_sysfs_types.h b/drivers/gpu/drm/xe/xe_tile_sysfs_types.h new file mode 100644 index 000000000000..75906ba11a9e --- /dev/null +++ b/drivers/gpu/drm/xe/xe_tile_sysfs_types.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + 
+#ifndef _XE_TILE_SYSFS_TYPES_H_ +#define _XE_TILE_SYSFS_TYPES_H_ + +#include <linux/kobject.h> + +struct xe_tile; + +/** + * struct kobj_tile - A tile's kobject struct that connects the kobject + * and the TILE + * + * When dealing with multiple TILEs, this struct helps to understand which + * TILE needs to be addressed on a given sysfs call. + */ +struct kobj_tile { + /** @base: The actual kobject */ + struct kobject base; + /** @tile: A pointer to the tile itself */ + struct xe_tile *tile; +}; + +#endif /* _XE_TILE_SYSFS_TYPES_H_ */ diff --git a/drivers/gpu/drm/xe/xe_trace.c b/drivers/gpu/drm/xe/xe_trace.c new file mode 100644 index 000000000000..2527c556bff1 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_trace.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef __CHECKER__ +#define CREATE_TRACE_POINTS +#include "xe_trace.h" +#endif diff --git a/drivers/gpu/drm/xe/xe_trace.h b/drivers/gpu/drm/xe/xe_trace.h new file mode 100644 index 000000000000..95163c303f3e --- /dev/null +++ b/drivers/gpu/drm/xe/xe_trace.h @@ -0,0 +1,608 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright © 2022 Intel Corporation + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM xe + +#if !defined(_XE_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) +#define _XE_TRACE_H_ + +#include <linux/tracepoint.h> +#include <linux/types.h> + +#include "xe_bo_types.h" +#include "xe_exec_queue_types.h" +#include "xe_gpu_scheduler_types.h" +#include "xe_gt_tlb_invalidation_types.h" +#include "xe_gt_types.h" +#include "xe_guc_exec_queue_types.h" +#include "xe_sched_job.h" +#include "xe_vm.h" + +DECLARE_EVENT_CLASS(xe_gt_tlb_invalidation_fence, + TP_PROTO(struct xe_gt_tlb_invalidation_fence *fence), + TP_ARGS(fence), + + TP_STRUCT__entry( + __field(u64, fence) + __field(int, seqno) + ), + + TP_fast_assign( + __entry->fence = (u64)fence; + __entry->seqno = fence->seqno; + ), + + TP_printk("fence=0x%016llx, seqno=%d", + __entry->fence, __entry->seqno) +); + +DEFINE_EVENT(xe_gt_tlb_invalidation_fence, xe_gt_tlb_invalidation_fence_create, + TP_PROTO(struct xe_gt_tlb_invalidation_fence *fence), + TP_ARGS(fence) +); + +DEFINE_EVENT(xe_gt_tlb_invalidation_fence, + xe_gt_tlb_invalidation_fence_work_func, + TP_PROTO(struct xe_gt_tlb_invalidation_fence *fence), + TP_ARGS(fence) +); + +DEFINE_EVENT(xe_gt_tlb_invalidation_fence, xe_gt_tlb_invalidation_fence_cb, + TP_PROTO(struct xe_gt_tlb_invalidation_fence *fence), + TP_ARGS(fence) +); + +DEFINE_EVENT(xe_gt_tlb_invalidation_fence, xe_gt_tlb_invalidation_fence_send, + TP_PROTO(struct xe_gt_tlb_invalidation_fence *fence), + TP_ARGS(fence) +); + +DEFINE_EVENT(xe_gt_tlb_invalidation_fence, xe_gt_tlb_invalidation_fence_recv, + TP_PROTO(struct xe_gt_tlb_invalidation_fence *fence), + TP_ARGS(fence) +); + +DEFINE_EVENT(xe_gt_tlb_invalidation_fence, xe_gt_tlb_invalidation_fence_signal, + TP_PROTO(struct xe_gt_tlb_invalidation_fence *fence), + TP_ARGS(fence) +); + +DEFINE_EVENT(xe_gt_tlb_invalidation_fence, xe_gt_tlb_invalidation_fence_timeout, + TP_PROTO(struct xe_gt_tlb_invalidation_fence *fence), + TP_ARGS(fence) +); + +DECLARE_EVENT_CLASS(xe_bo, + TP_PROTO(struct xe_bo *bo), + TP_ARGS(bo), + + TP_STRUCT__entry( + __field(size_t, size) + __field(u32, flags) + __field(u64, vm) + ), + + TP_fast_assign( + __entry->size = bo->size; + __entry->flags = bo->flags; + __entry->vm = (unsigned long)bo->vm; + ), + + TP_printk("size=%zu, flags=0x%02x, vm=0x%016llx", + __entry->size, __entry->flags, __entry->vm) +); + +DEFINE_EVENT(xe_bo, 
xe_bo_cpu_fault, + TP_PROTO(struct xe_bo *bo), + TP_ARGS(bo) +); + +DEFINE_EVENT(xe_bo, xe_bo_move, + TP_PROTO(struct xe_bo *bo), + TP_ARGS(bo) +); + +DECLARE_EVENT_CLASS(xe_exec_queue, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q), + + TP_STRUCT__entry( + __field(enum xe_engine_class, class) + __field(u32, logical_mask) + __field(u8, gt_id) + __field(u16, width) + __field(u16, guc_id) + __field(u32, guc_state) + __field(u32, flags) + ), + + TP_fast_assign( + __entry->class = q->class; + __entry->logical_mask = q->logical_mask; + __entry->gt_id = q->gt->info.id; + __entry->width = q->width; + __entry->guc_id = q->guc->id; + __entry->guc_state = atomic_read(&q->guc->state); + __entry->flags = q->flags; + ), + + TP_printk("%d:0x%x, gt=%d, width=%d, guc_id=%d, guc_state=0x%x, flags=0x%x", + __entry->class, __entry->logical_mask, + __entry->gt_id, __entry->width, __entry->guc_id, + __entry->guc_state, __entry->flags) +); + +DEFINE_EVENT(xe_exec_queue, xe_exec_queue_create, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q) +); + +DEFINE_EVENT(xe_exec_queue, xe_exec_queue_supress_resume, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q) +); + +DEFINE_EVENT(xe_exec_queue, xe_exec_queue_submit, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q) +); + +DEFINE_EVENT(xe_exec_queue, xe_exec_queue_scheduling_enable, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q) +); + +DEFINE_EVENT(xe_exec_queue, xe_exec_queue_scheduling_disable, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q) +); + +DEFINE_EVENT(xe_exec_queue, xe_exec_queue_scheduling_done, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q) +); + +DEFINE_EVENT(xe_exec_queue, xe_exec_queue_register, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q) +); + +DEFINE_EVENT(xe_exec_queue, xe_exec_queue_deregister, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q) +); + +DEFINE_EVENT(xe_exec_queue, xe_exec_queue_deregister_done, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q) +); + +DEFINE_EVENT(xe_exec_queue, xe_exec_queue_close, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q) +); + +DEFINE_EVENT(xe_exec_queue, xe_exec_queue_kill, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q) +); + +DEFINE_EVENT(xe_exec_queue, xe_exec_queue_cleanup_entity, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q) +); + +DEFINE_EVENT(xe_exec_queue, xe_exec_queue_destroy, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q) +); + +DEFINE_EVENT(xe_exec_queue, xe_exec_queue_reset, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q) +); + +DEFINE_EVENT(xe_exec_queue, xe_exec_queue_memory_cat_error, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q) +); + +DEFINE_EVENT(xe_exec_queue, xe_exec_queue_stop, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q) +); + +DEFINE_EVENT(xe_exec_queue, xe_exec_queue_resubmit, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q) +); + +DEFINE_EVENT(xe_exec_queue, xe_exec_queue_lr_cleanup, + TP_PROTO(struct xe_exec_queue *q), + TP_ARGS(q) +); + +DECLARE_EVENT_CLASS(xe_sched_job, + TP_PROTO(struct xe_sched_job *job), + TP_ARGS(job), + + TP_STRUCT__entry( + __field(u32, seqno) + __field(u16, guc_id) + __field(u32, guc_state) + __field(u32, flags) + __field(int, error) + __field(u64, fence) + __field(u64, batch_addr) + ), + + TP_fast_assign( + __entry->seqno = xe_sched_job_seqno(job); + __entry->guc_id = job->q->guc->id; + __entry->guc_state = + atomic_read(&job->q->guc->state); + __entry->flags = job->q->flags; + __entry->error = job->fence->error; + __entry->fence = (unsigned long)job->fence; + __entry->batch_addr = (u64)job->batch_addr[0]; + ), + + 
TP_printk("fence=0x%016llx, seqno=%u, guc_id=%d, batch_addr=0x%012llx, guc_state=0x%x, flags=0x%x, error=%d", + __entry->fence, __entry->seqno, __entry->guc_id, + __entry->batch_addr, __entry->guc_state, + __entry->flags, __entry->error) +); + +DEFINE_EVENT(xe_sched_job, xe_sched_job_create, + TP_PROTO(struct xe_sched_job *job), + TP_ARGS(job) +); + +DEFINE_EVENT(xe_sched_job, xe_sched_job_exec, + TP_PROTO(struct xe_sched_job *job), + TP_ARGS(job) +); + +DEFINE_EVENT(xe_sched_job, xe_sched_job_run, + TP_PROTO(struct xe_sched_job *job), + TP_ARGS(job) +); + +DEFINE_EVENT(xe_sched_job, xe_sched_job_free, + TP_PROTO(struct xe_sched_job *job), + TP_ARGS(job) +); + +DEFINE_EVENT(xe_sched_job, xe_sched_job_timedout, + TP_PROTO(struct xe_sched_job *job), + TP_ARGS(job) +); + +DEFINE_EVENT(xe_sched_job, xe_sched_job_set_error, + TP_PROTO(struct xe_sched_job *job), + TP_ARGS(job) +); + +DEFINE_EVENT(xe_sched_job, xe_sched_job_ban, + TP_PROTO(struct xe_sched_job *job), + TP_ARGS(job) +); + +DECLARE_EVENT_CLASS(xe_sched_msg, + TP_PROTO(struct xe_sched_msg *msg), + TP_ARGS(msg), + + TP_STRUCT__entry( + __field(u32, opcode) + __field(u16, guc_id) + ), + + TP_fast_assign( + __entry->opcode = msg->opcode; + __entry->guc_id = + ((struct xe_exec_queue *)msg->private_data)->guc->id; + ), + + TP_printk("guc_id=%d, opcode=%u", __entry->guc_id, + __entry->opcode) +); + +DEFINE_EVENT(xe_sched_msg, xe_sched_msg_add, + TP_PROTO(struct xe_sched_msg *msg), + TP_ARGS(msg) +); + +DEFINE_EVENT(xe_sched_msg, xe_sched_msg_recv, + TP_PROTO(struct xe_sched_msg *msg), + TP_ARGS(msg) +); + +DECLARE_EVENT_CLASS(xe_hw_fence, + TP_PROTO(struct xe_hw_fence *fence), + TP_ARGS(fence), + + TP_STRUCT__entry( + __field(u64, ctx) + __field(u32, seqno) + __field(u64, fence) + ), + + TP_fast_assign( + __entry->ctx = fence->dma.context; + __entry->seqno = fence->dma.seqno; + __entry->fence = (unsigned long)fence; + ), + + TP_printk("ctx=0x%016llx, fence=0x%016llx, seqno=%u", + __entry->ctx, __entry->fence, __entry->seqno) +); + +DEFINE_EVENT(xe_hw_fence, xe_hw_fence_create, + TP_PROTO(struct xe_hw_fence *fence), + TP_ARGS(fence) +); + +DEFINE_EVENT(xe_hw_fence, xe_hw_fence_signal, + TP_PROTO(struct xe_hw_fence *fence), + TP_ARGS(fence) +); + +DEFINE_EVENT(xe_hw_fence, xe_hw_fence_try_signal, + TP_PROTO(struct xe_hw_fence *fence), + TP_ARGS(fence) +); + +DEFINE_EVENT(xe_hw_fence, xe_hw_fence_free, + TP_PROTO(struct xe_hw_fence *fence), + TP_ARGS(fence) +); + +DECLARE_EVENT_CLASS(xe_vma, + TP_PROTO(struct xe_vma *vma), + TP_ARGS(vma), + + TP_STRUCT__entry( + __field(u64, vma) + __field(u32, asid) + __field(u64, start) + __field(u64, end) + __field(u64, ptr) + ), + + TP_fast_assign( + __entry->vma = (unsigned long)vma; + __entry->asid = xe_vma_vm(vma)->usm.asid; + __entry->start = xe_vma_start(vma); + __entry->end = xe_vma_end(vma) - 1; + __entry->ptr = xe_vma_userptr(vma); + ), + + TP_printk("vma=0x%016llx, asid=0x%05x, start=0x%012llx, end=0x%012llx, ptr=0x%012llx,", + __entry->vma, __entry->asid, __entry->start, + __entry->end, __entry->ptr) +) + +DEFINE_EVENT(xe_vma, xe_vma_flush, + TP_PROTO(struct xe_vma *vma), + TP_ARGS(vma) +); + +DEFINE_EVENT(xe_vma, xe_vma_pagefault, + TP_PROTO(struct xe_vma *vma), + TP_ARGS(vma) +); + +DEFINE_EVENT(xe_vma, xe_vma_acc, + TP_PROTO(struct xe_vma *vma), + TP_ARGS(vma) +); + +DEFINE_EVENT(xe_vma, xe_vma_fail, + TP_PROTO(struct xe_vma *vma), + TP_ARGS(vma) +); + +DEFINE_EVENT(xe_vma, xe_vma_bind, + TP_PROTO(struct xe_vma *vma), + TP_ARGS(vma) +); + +DEFINE_EVENT(xe_vma, xe_vma_pf_bind, + 
TP_PROTO(struct xe_vma *vma), + TP_ARGS(vma) +); + +DEFINE_EVENT(xe_vma, xe_vma_unbind, + TP_PROTO(struct xe_vma *vma), + TP_ARGS(vma) +); + +DEFINE_EVENT(xe_vma, xe_vma_userptr_rebind_worker, + TP_PROTO(struct xe_vma *vma), + TP_ARGS(vma) +); + +DEFINE_EVENT(xe_vma, xe_vma_userptr_rebind_exec, + TP_PROTO(struct xe_vma *vma), + TP_ARGS(vma) +); + +DEFINE_EVENT(xe_vma, xe_vma_rebind_worker, + TP_PROTO(struct xe_vma *vma), + TP_ARGS(vma) +); + +DEFINE_EVENT(xe_vma, xe_vma_rebind_exec, + TP_PROTO(struct xe_vma *vma), + TP_ARGS(vma) +); + +DEFINE_EVENT(xe_vma, xe_vma_userptr_invalidate, + TP_PROTO(struct xe_vma *vma), + TP_ARGS(vma) +); + +DEFINE_EVENT(xe_vma, xe_vma_usm_invalidate, + TP_PROTO(struct xe_vma *vma), + TP_ARGS(vma) +); + +DEFINE_EVENT(xe_vma, xe_vma_evict, + TP_PROTO(struct xe_vma *vma), + TP_ARGS(vma) +); + +DEFINE_EVENT(xe_vma, xe_vma_userptr_invalidate_complete, + TP_PROTO(struct xe_vma *vma), + TP_ARGS(vma) +); + +DECLARE_EVENT_CLASS(xe_vm, + TP_PROTO(struct xe_vm *vm), + TP_ARGS(vm), + + TP_STRUCT__entry( + __field(u64, vm) + __field(u32, asid) + ), + + TP_fast_assign( + __entry->vm = (unsigned long)vm; + __entry->asid = vm->usm.asid; + ), + + TP_printk("vm=0x%016llx, asid=0x%05x", __entry->vm, + __entry->asid) +); + +DEFINE_EVENT(xe_vm, xe_vm_kill, + TP_PROTO(struct xe_vm *vm), + TP_ARGS(vm) +); + +DEFINE_EVENT(xe_vm, xe_vm_create, + TP_PROTO(struct xe_vm *vm), + TP_ARGS(vm) +); + +DEFINE_EVENT(xe_vm, xe_vm_free, + TP_PROTO(struct xe_vm *vm), + TP_ARGS(vm) +); + +DEFINE_EVENT(xe_vm, xe_vm_cpu_bind, + TP_PROTO(struct xe_vm *vm), + TP_ARGS(vm) +); + +DEFINE_EVENT(xe_vm, xe_vm_restart, + TP_PROTO(struct xe_vm *vm), + TP_ARGS(vm) +); + +DEFINE_EVENT(xe_vm, xe_vm_rebind_worker_enter, + TP_PROTO(struct xe_vm *vm), + TP_ARGS(vm) +); + +DEFINE_EVENT(xe_vm, xe_vm_rebind_worker_retry, + TP_PROTO(struct xe_vm *vm), + TP_ARGS(vm) +); + +DEFINE_EVENT(xe_vm, xe_vm_rebind_worker_exit, + TP_PROTO(struct xe_vm *vm), + TP_ARGS(vm) +); + +/* GuC */ +DECLARE_EVENT_CLASS(xe_guc_ct_flow_control, + TP_PROTO(u32 _head, u32 _tail, u32 size, u32 space, u32 len), + TP_ARGS(_head, _tail, size, space, len), + + TP_STRUCT__entry( + __field(u32, _head) + __field(u32, _tail) + __field(u32, size) + __field(u32, space) + __field(u32, len) + ), + + TP_fast_assign( + __entry->_head = _head; + __entry->_tail = _tail; + __entry->size = size; + __entry->space = space; + __entry->len = len; + ), + + TP_printk("h2g flow control: head=%u, tail=%u, size=%u, space=%u, len=%u", + __entry->_head, __entry->_tail, __entry->size, + __entry->space, __entry->len) +); + +DEFINE_EVENT(xe_guc_ct_flow_control, xe_guc_ct_h2g_flow_control, + TP_PROTO(u32 _head, u32 _tail, u32 size, u32 space, u32 len), + TP_ARGS(_head, _tail, size, space, len) +); + +DEFINE_EVENT_PRINT(xe_guc_ct_flow_control, xe_guc_ct_g2h_flow_control, + TP_PROTO(u32 _head, u32 _tail, u32 size, u32 space, u32 len), + TP_ARGS(_head, _tail, size, space, len), + + TP_printk("g2h flow control: head=%u, tail=%u, size=%u, space=%u, len=%u", + __entry->_head, __entry->_tail, __entry->size, + __entry->space, __entry->len) +); + +DECLARE_EVENT_CLASS(xe_guc_ctb, + TP_PROTO(u8 gt_id, u32 action, u32 len, u32 _head, u32 tail), + TP_ARGS(gt_id, action, len, _head, tail), + + TP_STRUCT__entry( + __field(u8, gt_id) + __field(u32, action) + __field(u32, len) + __field(u32, tail) + __field(u32, _head) + ), + + TP_fast_assign( + __entry->gt_id = gt_id; + __entry->action = action; + __entry->len = len; + __entry->tail = tail; + __entry->_head = _head; + ), + + TP_printk("gt%d: H2G 
CTB: action=0x%x, len=%d, tail=%d, head=%d\n", + __entry->gt_id, __entry->action, __entry->len, + __entry->tail, __entry->_head) +); + +DEFINE_EVENT(xe_guc_ctb, xe_guc_ctb_h2g, + TP_PROTO(u8 gt_id, u32 action, u32 len, u32 _head, u32 tail), + TP_ARGS(gt_id, action, len, _head, tail) +); + +DEFINE_EVENT_PRINT(xe_guc_ctb, xe_guc_ctb_g2h, + TP_PROTO(u8 gt_id, u32 action, u32 len, u32 _head, u32 tail), + TP_ARGS(gt_id, action, len, _head, tail), + + TP_printk("gt%d: G2H CTB: action=0x%x, len=%d, tail=%d, head=%d\n", + __entry->gt_id, __entry->action, __entry->len, + __entry->tail, __entry->_head) + +); + +#endif + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH ../../drivers/gpu/drm/xe +#define TRACE_INCLUDE_FILE xe_trace +#include <trace/define_trace.h> diff --git a/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c b/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c new file mode 100644 index 000000000000..d2b00d0bf1e2 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c @@ -0,0 +1,334 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021-2023 Intel Corporation + * Copyright (C) 2021-2002 Red Hat + */ + +#include <drm/drm_managed.h> +#include <drm/drm_mm.h> + +#include <drm/ttm/ttm_device.h> +#include <drm/ttm/ttm_placement.h> +#include <drm/ttm/ttm_range_manager.h> + +#include "generated/xe_wa_oob.h" +#include "regs/xe_gt_regs.h" +#include "regs/xe_regs.h" +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_mmio.h" +#include "xe_res_cursor.h" +#include "xe_ttm_stolen_mgr.h" +#include "xe_ttm_vram_mgr.h" +#include "xe_wa.h" + +struct xe_ttm_stolen_mgr { + struct xe_ttm_vram_mgr base; + + /* PCI base offset */ + resource_size_t io_base; + /* GPU base offset */ + resource_size_t stolen_base; + + void *__iomem mapping; +}; + +static inline struct xe_ttm_stolen_mgr * +to_stolen_mgr(struct ttm_resource_manager *man) +{ + return container_of(man, struct xe_ttm_stolen_mgr, base.manager); +} + +/** + * xe_ttm_stolen_cpu_access_needs_ggtt() - If we can't directly CPU access + * stolen, can we then fallback to mapping through the GGTT. + * @xe: xe device + * + * Some older integrated platforms don't support reliable CPU access for stolen, + * however on such hardware we can always use the mappable part of the GGTT for + * CPU access. Check if that's the case for this device. + */ +bool xe_ttm_stolen_cpu_access_needs_ggtt(struct xe_device *xe) +{ + return GRAPHICS_VERx100(xe) < 1270 && !IS_DGFX(xe); +} + +static s64 detect_bar2_dgfx(struct xe_device *xe, struct xe_ttm_stolen_mgr *mgr) +{ + struct xe_tile *tile = xe_device_get_root_tile(xe); + struct xe_gt *mmio = xe_root_mmio_gt(xe); + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + u64 stolen_size; + u64 tile_offset; + u64 tile_size; + + tile_offset = tile->mem.vram.io_start - xe->mem.vram.io_start; + tile_size = tile->mem.vram.actual_physical_size; + + /* Use DSM base address instead for stolen memory */ + mgr->stolen_base = (xe_mmio_read64_2x32(mmio, DSMBASE) & BDSM_MASK) - tile_offset; + if (drm_WARN_ON(&xe->drm, tile_size < mgr->stolen_base)) + return 0; + + stolen_size = tile_size - mgr->stolen_base; + + /* Verify usage fits in the actual resource available */ + if (mgr->stolen_base + stolen_size <= pci_resource_len(pdev, LMEM_BAR)) + mgr->io_base = tile->mem.vram.io_start + mgr->stolen_base; + + /* + * There may be few KB of platform dependent reserved memory at the end + * of vram which is not part of the DSM. 
Such reserved memory portion is + * always less then DSM granularity so align down the stolen_size to DSM + * granularity to accommodate such reserve vram portion. + */ + return ALIGN_DOWN(stolen_size, SZ_1M); +} + +static u32 get_wopcm_size(struct xe_device *xe) +{ + u32 wopcm_size; + u64 val; + + val = xe_mmio_read64_2x32(xe_root_mmio_gt(xe), STOLEN_RESERVED); + val = REG_FIELD_GET64(WOPCM_SIZE_MASK, val); + + switch (val) { + case 0x5 ... 0x6: + val--; + fallthrough; + case 0x0 ... 0x3: + wopcm_size = (1U << val) * SZ_1M; + break; + default: + WARN(1, "Missing case wopcm_size=%llx\n", val); + wopcm_size = 0; + } + + return wopcm_size; +} + +static u32 detect_bar2_integrated(struct xe_device *xe, struct xe_ttm_stolen_mgr *mgr) +{ + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + struct xe_gt *media_gt = xe_device_get_root_tile(xe)->media_gt; + u32 stolen_size, wopcm_size; + u32 ggc, gms; + + ggc = xe_mmio_read32(xe_root_mmio_gt(xe), GGC); + + /* + * Check GGMS: it should be fixed 0x3 (8MB), which corresponds to the + * GTT size + */ + if (drm_WARN_ON(&xe->drm, (ggc & GGMS_MASK) != GGMS_MASK)) + return 0; + + /* + * Graphics >= 1270 uses the offset to the GSMBASE as address in the + * PTEs, together with the DM flag being set. Previously there was no + * such flag so the address was the io_base. + * + * DSMBASE = GSMBASE + 8MB + */ + mgr->stolen_base = SZ_8M; + mgr->io_base = pci_resource_start(pdev, 2) + mgr->stolen_base; + + /* return valid GMS value, -EIO if invalid */ + gms = REG_FIELD_GET(GMS_MASK, ggc); + switch (gms) { + case 0x0 ... 0x04: + stolen_size = gms * 32 * SZ_1M; + break; + case 0xf0 ... 0xfe: + stolen_size = (gms - 0xf0 + 1) * 4 * SZ_1M; + break; + default: + return 0; + } + + /* Carve out the top of DSM as it contains the reserved WOPCM region */ + wopcm_size = get_wopcm_size(xe); + if (drm_WARN_ON(&xe->drm, !wopcm_size)) + return 0; + + stolen_size -= wopcm_size; + + if (media_gt && XE_WA(media_gt, 14019821291)) { + u64 gscpsmi_base = xe_mmio_read64_2x32(media_gt, GSCPSMI_BASE) + & ~GENMASK_ULL(5, 0); + + /* + * This workaround is primarily implemented by the BIOS. We + * just need to figure out whether the BIOS has applied the + * workaround (meaning the programmed address falls within + * the DSM) and, if so, reserve that part of the DSM to + * prevent accidental reuse. The DSM location should be just + * below the WOPCM. 
+ */ + if (gscpsmi_base >= mgr->io_base && + gscpsmi_base < mgr->io_base + stolen_size) { + xe_gt_dbg(media_gt, + "Reserving %llu bytes of DSM for Wa_14019821291\n", + mgr->io_base + stolen_size - gscpsmi_base); + stolen_size = gscpsmi_base - mgr->io_base; + } + } + + if (drm_WARN_ON(&xe->drm, stolen_size + SZ_8M > pci_resource_len(pdev, 2))) + return 0; + + return stolen_size; +} + +extern struct resource intel_graphics_stolen_res; + +static u64 detect_stolen(struct xe_device *xe, struct xe_ttm_stolen_mgr *mgr) +{ +#ifdef CONFIG_X86 + /* Map into GGTT */ + mgr->io_base = pci_resource_start(to_pci_dev(xe->drm.dev), 2); + + /* Stolen memory is x86 only */ + mgr->stolen_base = intel_graphics_stolen_res.start; + return resource_size(&intel_graphics_stolen_res); +#else + return 0; +#endif +} + +void xe_ttm_stolen_mgr_init(struct xe_device *xe) +{ + struct xe_ttm_stolen_mgr *mgr = drmm_kzalloc(&xe->drm, sizeof(*mgr), GFP_KERNEL); + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + u64 stolen_size, io_size, pgsize; + int err; + + if (IS_DGFX(xe)) + stolen_size = detect_bar2_dgfx(xe, mgr); + else if (GRAPHICS_VERx100(xe) >= 1270) + stolen_size = detect_bar2_integrated(xe, mgr); + else + stolen_size = detect_stolen(xe, mgr); + + if (!stolen_size) { + drm_dbg_kms(&xe->drm, "No stolen memory support\n"); + return; + } + + pgsize = xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K ? SZ_64K : SZ_4K; + if (pgsize < PAGE_SIZE) + pgsize = PAGE_SIZE; + + /* + * We don't try to attempt partial visible support for stolen vram, + * since stolen is always at the end of vram, and the BAR size is pretty + * much always 256M, with small-bar. + */ + io_size = 0; + if (mgr->io_base && !xe_ttm_stolen_cpu_access_needs_ggtt(xe)) + io_size = stolen_size; + + err = __xe_ttm_vram_mgr_init(xe, &mgr->base, XE_PL_STOLEN, stolen_size, + io_size, pgsize); + if (err) { + drm_dbg_kms(&xe->drm, "Stolen mgr init failed: %i\n", err); + return; + } + + drm_dbg_kms(&xe->drm, "Initialized stolen memory support with %llu bytes\n", + stolen_size); + + if (io_size) + mgr->mapping = devm_ioremap_wc(&pdev->dev, mgr->io_base, io_size); +} + +u64 xe_ttm_stolen_io_offset(struct xe_bo *bo, u32 offset) +{ + struct xe_device *xe = xe_bo_device(bo); + struct ttm_resource_manager *ttm_mgr = ttm_manager_type(&xe->ttm, XE_PL_STOLEN); + struct xe_ttm_stolen_mgr *mgr = to_stolen_mgr(ttm_mgr); + struct xe_res_cursor cur; + + XE_WARN_ON(!mgr->io_base); + + if (xe_ttm_stolen_cpu_access_needs_ggtt(xe)) + return mgr->io_base + xe_bo_ggtt_addr(bo) + offset; + + xe_res_first(bo->ttm.resource, offset, 4096, &cur); + return mgr->io_base + cur.start; +} + +static int __xe_ttm_stolen_io_mem_reserve_bar2(struct xe_device *xe, + struct xe_ttm_stolen_mgr *mgr, + struct ttm_resource *mem) +{ + struct xe_res_cursor cur; + + if (!mgr->io_base) + return -EIO; + + xe_res_first(mem, 0, 4096, &cur); + mem->bus.offset = cur.start; + + drm_WARN_ON(&xe->drm, !(mem->placement & TTM_PL_FLAG_CONTIGUOUS)); + + if (mem->placement & TTM_PL_FLAG_CONTIGUOUS && mgr->mapping) + mem->bus.addr = (u8 *)mgr->mapping + mem->bus.offset; + + mem->bus.offset += mgr->io_base; + mem->bus.is_iomem = true; + mem->bus.caching = ttm_write_combined; + + return 0; +} + +static int __xe_ttm_stolen_io_mem_reserve_stolen(struct xe_device *xe, + struct xe_ttm_stolen_mgr *mgr, + struct ttm_resource *mem) +{ +#ifdef CONFIG_X86 + struct xe_bo *bo = ttm_to_xe_bo(mem->bo); + + XE_WARN_ON(IS_DGFX(xe)); + + /* XXX: Require BO to be mapped to GGTT? 
*/ + if (drm_WARN_ON(&xe->drm, !(bo->flags & XE_BO_CREATE_GGTT_BIT))) + return -EIO; + + /* GGTT is always contiguously mapped */ + mem->bus.offset = xe_bo_ggtt_addr(bo) + mgr->io_base; + + mem->bus.is_iomem = true; + mem->bus.caching = ttm_write_combined; + + return 0; +#else + /* How is it even possible to get here without gen12 stolen? */ + drm_WARN_ON(&xe->drm, 1); + return -EIO; +#endif +} + +int xe_ttm_stolen_io_mem_reserve(struct xe_device *xe, struct ttm_resource *mem) +{ + struct ttm_resource_manager *ttm_mgr = ttm_manager_type(&xe->ttm, XE_PL_STOLEN); + struct xe_ttm_stolen_mgr *mgr = ttm_mgr ? to_stolen_mgr(ttm_mgr) : NULL; + + if (!mgr || !mgr->io_base) + return -EIO; + + if (xe_ttm_stolen_cpu_access_needs_ggtt(xe)) + return __xe_ttm_stolen_io_mem_reserve_stolen(xe, mgr, mem); + else + return __xe_ttm_stolen_io_mem_reserve_bar2(xe, mgr, mem); +} + +u64 xe_ttm_stolen_gpu_offset(struct xe_device *xe) +{ + struct xe_ttm_stolen_mgr *mgr = + to_stolen_mgr(ttm_manager_type(&xe->ttm, XE_PL_STOLEN)); + + return mgr->stolen_base; +} diff --git a/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.h b/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.h new file mode 100644 index 000000000000..1777245ff810 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_TTM_STOLEN_MGR_H_ +#define _XE_TTM_STOLEN_MGR_H_ + +#include <linux/types.h> + +struct ttm_resource; +struct xe_bo; +struct xe_device; + +void xe_ttm_stolen_mgr_init(struct xe_device *xe); +int xe_ttm_stolen_io_mem_reserve(struct xe_device *xe, struct ttm_resource *mem); +bool xe_ttm_stolen_cpu_access_needs_ggtt(struct xe_device *xe); +u64 xe_ttm_stolen_io_offset(struct xe_bo *bo, u32 offset); +u64 xe_ttm_stolen_gpu_offset(struct xe_device *xe); + +#endif diff --git a/drivers/gpu/drm/xe/xe_ttm_sys_mgr.c b/drivers/gpu/drm/xe/xe_ttm_sys_mgr.c new file mode 100644 index 000000000000..3e1fa0c832ca --- /dev/null +++ b/drivers/gpu/drm/xe/xe_ttm_sys_mgr.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021-2022 Intel Corporation + * Copyright (C) 2021-2002 Red Hat + */ + +#include "xe_ttm_sys_mgr.h" + +#include <drm/drm_managed.h> + +#include <drm/ttm/ttm_placement.h> +#include <drm/ttm/ttm_range_manager.h> +#include <drm/ttm/ttm_tt.h> + +#include "xe_bo.h" +#include "xe_gt.h" + +struct xe_ttm_sys_node { + struct ttm_buffer_object *tbo; + struct ttm_range_mgr_node base; +}; + +static inline struct xe_ttm_sys_node * +to_xe_ttm_sys_node(struct ttm_resource *res) +{ + return container_of(res, struct xe_ttm_sys_node, base.base); +} + +static int xe_ttm_sys_mgr_new(struct ttm_resource_manager *man, + struct ttm_buffer_object *tbo, + const struct ttm_place *place, + struct ttm_resource **res) +{ + struct xe_ttm_sys_node *node; + int r; + + node = kzalloc(struct_size(node, base.mm_nodes, 1), GFP_KERNEL); + if (!node) + return -ENOMEM; + + node->tbo = tbo; + ttm_resource_init(tbo, place, &node->base.base); + + if (!(place->flags & TTM_PL_FLAG_TEMPORARY) && + ttm_resource_manager_usage(man) > (man->size << PAGE_SHIFT)) { + r = -ENOSPC; + goto err_fini; + } + + node->base.mm_nodes[0].start = 0; + node->base.mm_nodes[0].size = PFN_UP(node->base.base.size); + node->base.base.start = XE_BO_INVALID_OFFSET; + + *res = &node->base.base; + + return 0; + +err_fini: + ttm_resource_fini(man, &node->base.base); + kfree(node); + return r; +} + +static void xe_ttm_sys_mgr_del(struct ttm_resource_manager *man, + struct ttm_resource *res) +{ + struct 
xe_ttm_sys_node *node = to_xe_ttm_sys_node(res); + + ttm_resource_fini(man, res); + kfree(node); +} + +static void xe_ttm_sys_mgr_debug(struct ttm_resource_manager *man, + struct drm_printer *printer) +{ + +} + +static const struct ttm_resource_manager_func xe_ttm_sys_mgr_func = { + .alloc = xe_ttm_sys_mgr_new, + .free = xe_ttm_sys_mgr_del, + .debug = xe_ttm_sys_mgr_debug +}; + +static void ttm_sys_mgr_fini(struct drm_device *drm, void *arg) +{ + struct xe_device *xe = (struct xe_device *)arg; + struct ttm_resource_manager *man = &xe->mem.sys_mgr; + int err; + + ttm_resource_manager_set_used(man, false); + + err = ttm_resource_manager_evict_all(&xe->ttm, man); + if (err) + return; + + ttm_resource_manager_cleanup(man); + ttm_set_driver_manager(&xe->ttm, XE_PL_TT, NULL); +} + +int xe_ttm_sys_mgr_init(struct xe_device *xe) +{ + struct ttm_resource_manager *man = &xe->mem.sys_mgr; + struct sysinfo si; + u64 gtt_size; + + si_meminfo(&si); + gtt_size = (u64)si.totalram * si.mem_unit; + /* TTM limits allocation of all TTM devices by 50% of system memory */ + gtt_size /= 2; + + man->use_tt = true; + man->func = &xe_ttm_sys_mgr_func; + ttm_resource_manager_init(man, &xe->ttm, gtt_size >> PAGE_SHIFT); + ttm_set_driver_manager(&xe->ttm, XE_PL_TT, man); + ttm_resource_manager_set_used(man, true); + return drmm_add_action_or_reset(&xe->drm, ttm_sys_mgr_fini, xe); +} diff --git a/drivers/gpu/drm/xe/xe_ttm_sys_mgr.h b/drivers/gpu/drm/xe/xe_ttm_sys_mgr.h new file mode 100644 index 000000000000..e8f5cd395b28 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_ttm_sys_mgr.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_TTM_SYS_MGR_H_ +#define _XE_TTM_SYS_MGR_H_ + +struct xe_device; + +int xe_ttm_sys_mgr_init(struct xe_device *xe); + +#endif diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c new file mode 100644 index 000000000000..115ec745e502 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c @@ -0,0 +1,480 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021-2022 Intel Corporation + * Copyright (C) 2021-2002 Red Hat + */ + +#include <drm/drm_managed.h> + +#include <drm/ttm/ttm_placement.h> +#include <drm/ttm/ttm_range_manager.h> + +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_res_cursor.h" +#include "xe_ttm_vram_mgr.h" + +static inline struct drm_buddy_block * +xe_ttm_vram_mgr_first_block(struct list_head *list) +{ + return list_first_entry_or_null(list, struct drm_buddy_block, link); +} + +static inline bool xe_is_vram_mgr_blocks_contiguous(struct drm_buddy *mm, + struct list_head *head) +{ + struct drm_buddy_block *block; + u64 start, size; + + block = xe_ttm_vram_mgr_first_block(head); + if (!block) + return false; + + while (head != block->link.next) { + start = drm_buddy_block_offset(block); + size = drm_buddy_block_size(mm, block); + + block = list_entry(block->link.next, struct drm_buddy_block, + link); + if (start + size != drm_buddy_block_offset(block)) + return false; + } + + return true; +} + +static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, + struct ttm_buffer_object *tbo, + const struct ttm_place *place, + struct ttm_resource **res) +{ + struct xe_ttm_vram_mgr *mgr = to_xe_ttm_vram_mgr(man); + struct xe_ttm_vram_mgr_resource *vres; + struct drm_buddy *mm = &mgr->mm; + u64 size, remaining_size, min_page_size; + unsigned long lpfn; + int err; + + lpfn = place->lpfn; + if (!lpfn || lpfn > man->size >> PAGE_SHIFT) + lpfn = man->size >> 
PAGE_SHIFT; + + if (tbo->base.size >> PAGE_SHIFT > (lpfn - place->fpfn)) + return -E2BIG; /* don't trigger eviction for the impossible */ + + vres = kzalloc(sizeof(*vres), GFP_KERNEL); + if (!vres) + return -ENOMEM; + + ttm_resource_init(tbo, place, &vres->base); + + /* bail out quickly if there's likely not enough VRAM for this BO */ + if (ttm_resource_manager_usage(man) > man->size) { + err = -ENOSPC; + goto error_fini; + } + + INIT_LIST_HEAD(&vres->blocks); + + if (place->flags & TTM_PL_FLAG_TOPDOWN) + vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION; + + if (place->fpfn || lpfn != man->size >> PAGE_SHIFT) + vres->flags |= DRM_BUDDY_RANGE_ALLOCATION; + + if (WARN_ON(!vres->base.size)) { + err = -EINVAL; + goto error_fini; + } + size = vres->base.size; + + min_page_size = mgr->default_page_size; + if (tbo->page_alignment) + min_page_size = tbo->page_alignment << PAGE_SHIFT; + + if (WARN_ON(min_page_size < mm->chunk_size)) { + err = -EINVAL; + goto error_fini; + } + + if (WARN_ON(min_page_size > SZ_2G)) { /* FIXME: sg limit */ + err = -EINVAL; + goto error_fini; + } + + if (WARN_ON((size > SZ_2G && + (vres->base.placement & TTM_PL_FLAG_CONTIGUOUS)))) { + err = -EINVAL; + goto error_fini; + } + + if (WARN_ON(!IS_ALIGNED(size, min_page_size))) { + err = -EINVAL; + goto error_fini; + } + + mutex_lock(&mgr->lock); + if (lpfn <= mgr->visible_size >> PAGE_SHIFT && size > mgr->visible_avail) { + mutex_unlock(&mgr->lock); + err = -ENOSPC; + goto error_fini; + } + + if (place->fpfn + (size >> PAGE_SHIFT) != place->lpfn && + place->flags & TTM_PL_FLAG_CONTIGUOUS) { + size = roundup_pow_of_two(size); + min_page_size = size; + + lpfn = max_t(unsigned long, place->fpfn + (size >> PAGE_SHIFT), lpfn); + } + + remaining_size = size; + do { + /* + * Limit maximum size to 2GiB due to SG table limitations. + * FIXME: Should maybe be handled as part of sg construction. + */ + u64 alloc_size = min_t(u64, remaining_size, SZ_2G); + + err = drm_buddy_alloc_blocks(mm, (u64)place->fpfn << PAGE_SHIFT, + (u64)lpfn << PAGE_SHIFT, + alloc_size, + min_page_size, + &vres->blocks, + vres->flags); + if (err) + goto error_free_blocks; + + remaining_size -= alloc_size; + } while (remaining_size); + + if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { + if (!drm_buddy_block_trim(mm, vres->base.size, &vres->blocks)) + size = vres->base.size; + } + + if (lpfn <= mgr->visible_size >> PAGE_SHIFT) { + vres->used_visible_size = size; + } else { + struct drm_buddy_block *block; + + list_for_each_entry(block, &vres->blocks, link) { + u64 start = drm_buddy_block_offset(block); + + if (start < mgr->visible_size) { + u64 end = start + drm_buddy_block_size(mm, block); + + vres->used_visible_size += + min(end, mgr->visible_size) - start; + } + } + } + + mgr->visible_avail -= vres->used_visible_size; + mutex_unlock(&mgr->lock); + + if (!(vres->base.placement & TTM_PL_FLAG_CONTIGUOUS) && + xe_is_vram_mgr_blocks_contiguous(mm, &vres->blocks)) + vres->base.placement |= TTM_PL_FLAG_CONTIGUOUS; + + /* + * For some kernel objects we still rely on the start when io mapping + * the object. 
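+	 * Contiguous placements therefore record the offset of their first
+	 * buddy block as the resource start below, while non-contiguous
+	 * placements fall back to XE_BO_INVALID_OFFSET.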
+ */ + if (vres->base.placement & TTM_PL_FLAG_CONTIGUOUS) { + struct drm_buddy_block *block = list_first_entry(&vres->blocks, + typeof(*block), + link); + + vres->base.start = drm_buddy_block_offset(block) >> PAGE_SHIFT; + } else { + vres->base.start = XE_BO_INVALID_OFFSET; + } + + *res = &vres->base; + return 0; + +error_free_blocks: + drm_buddy_free_list(mm, &vres->blocks); + mutex_unlock(&mgr->lock); +error_fini: + ttm_resource_fini(man, &vres->base); + kfree(vres); + + return err; +} + +static void xe_ttm_vram_mgr_del(struct ttm_resource_manager *man, + struct ttm_resource *res) +{ + struct xe_ttm_vram_mgr_resource *vres = + to_xe_ttm_vram_mgr_resource(res); + struct xe_ttm_vram_mgr *mgr = to_xe_ttm_vram_mgr(man); + struct drm_buddy *mm = &mgr->mm; + + mutex_lock(&mgr->lock); + drm_buddy_free_list(mm, &vres->blocks); + mgr->visible_avail += vres->used_visible_size; + mutex_unlock(&mgr->lock); + + ttm_resource_fini(man, res); + + kfree(vres); +} + +static void xe_ttm_vram_mgr_debug(struct ttm_resource_manager *man, + struct drm_printer *printer) +{ + struct xe_ttm_vram_mgr *mgr = to_xe_ttm_vram_mgr(man); + struct drm_buddy *mm = &mgr->mm; + + mutex_lock(&mgr->lock); + drm_printf(printer, "default_page_size: %lluKiB\n", + mgr->default_page_size >> 10); + drm_printf(printer, "visible_avail: %lluMiB\n", + (u64)mgr->visible_avail >> 20); + drm_printf(printer, "visible_size: %lluMiB\n", + (u64)mgr->visible_size >> 20); + + drm_buddy_print(mm, printer); + mutex_unlock(&mgr->lock); + drm_printf(printer, "man size:%llu\n", man->size); +} + +static bool xe_ttm_vram_mgr_intersects(struct ttm_resource_manager *man, + struct ttm_resource *res, + const struct ttm_place *place, + size_t size) +{ + struct xe_ttm_vram_mgr *mgr = to_xe_ttm_vram_mgr(man); + struct xe_ttm_vram_mgr_resource *vres = + to_xe_ttm_vram_mgr_resource(res); + struct drm_buddy *mm = &mgr->mm; + struct drm_buddy_block *block; + + if (!place->fpfn && !place->lpfn) + return true; + + if (!place->fpfn && place->lpfn == mgr->visible_size >> PAGE_SHIFT) + return vres->used_visible_size > 0; + + list_for_each_entry(block, &vres->blocks, link) { + unsigned long fpfn = + drm_buddy_block_offset(block) >> PAGE_SHIFT; + unsigned long lpfn = fpfn + + (drm_buddy_block_size(mm, block) >> PAGE_SHIFT); + + if (place->fpfn < lpfn && place->lpfn > fpfn) + return true; + } + + return false; +} + +static bool xe_ttm_vram_mgr_compatible(struct ttm_resource_manager *man, + struct ttm_resource *res, + const struct ttm_place *place, + size_t size) +{ + struct xe_ttm_vram_mgr *mgr = to_xe_ttm_vram_mgr(man); + struct xe_ttm_vram_mgr_resource *vres = + to_xe_ttm_vram_mgr_resource(res); + struct drm_buddy *mm = &mgr->mm; + struct drm_buddy_block *block; + + if (!place->fpfn && !place->lpfn) + return true; + + if (!place->fpfn && place->lpfn == mgr->visible_size >> PAGE_SHIFT) + return vres->used_visible_size == size; + + list_for_each_entry(block, &vres->blocks, link) { + unsigned long fpfn = + drm_buddy_block_offset(block) >> PAGE_SHIFT; + unsigned long lpfn = fpfn + + (drm_buddy_block_size(mm, block) >> PAGE_SHIFT); + + if (fpfn < place->fpfn || lpfn > place->lpfn) + return false; + } + + return true; +} + +static const struct ttm_resource_manager_func xe_ttm_vram_mgr_func = { + .alloc = xe_ttm_vram_mgr_new, + .free = xe_ttm_vram_mgr_del, + .intersects = xe_ttm_vram_mgr_intersects, + .compatible = xe_ttm_vram_mgr_compatible, + .debug = xe_ttm_vram_mgr_debug +}; + +static void ttm_vram_mgr_fini(struct drm_device *dev, void *arg) +{ + struct xe_device *xe = 
to_xe_device(dev); + struct xe_ttm_vram_mgr *mgr = arg; + struct ttm_resource_manager *man = &mgr->manager; + + ttm_resource_manager_set_used(man, false); + + if (ttm_resource_manager_evict_all(&xe->ttm, man)) + return; + + WARN_ON_ONCE(mgr->visible_avail != mgr->visible_size); + + drm_buddy_fini(&mgr->mm); + + ttm_resource_manager_cleanup(&mgr->manager); + + ttm_set_driver_manager(&xe->ttm, mgr->mem_type, NULL); + + mutex_destroy(&mgr->lock); +} + +int __xe_ttm_vram_mgr_init(struct xe_device *xe, struct xe_ttm_vram_mgr *mgr, + u32 mem_type, u64 size, u64 io_size, + u64 default_page_size) +{ + struct ttm_resource_manager *man = &mgr->manager; + int err; + + man->func = &xe_ttm_vram_mgr_func; + mgr->mem_type = mem_type; + mutex_init(&mgr->lock); + mgr->default_page_size = default_page_size; + mgr->visible_size = io_size; + mgr->visible_avail = io_size; + + ttm_resource_manager_init(man, &xe->ttm, size); + err = drm_buddy_init(&mgr->mm, man->size, default_page_size); + if (err) + return err; + + ttm_set_driver_manager(&xe->ttm, mem_type, &mgr->manager); + ttm_resource_manager_set_used(&mgr->manager, true); + + return drmm_add_action_or_reset(&xe->drm, ttm_vram_mgr_fini, mgr); +} + +int xe_ttm_vram_mgr_init(struct xe_tile *tile, struct xe_ttm_vram_mgr *mgr) +{ + struct xe_device *xe = tile_to_xe(tile); + struct xe_mem_region *vram = &tile->mem.vram; + + mgr->vram = vram; + return __xe_ttm_vram_mgr_init(xe, mgr, XE_PL_VRAM0 + tile->id, + vram->usable_size, vram->io_size, + PAGE_SIZE); +} + +int xe_ttm_vram_mgr_alloc_sgt(struct xe_device *xe, + struct ttm_resource *res, + u64 offset, u64 length, + struct device *dev, + enum dma_data_direction dir, + struct sg_table **sgt) +{ + struct xe_tile *tile = &xe->tiles[res->mem_type - XE_PL_VRAM0]; + struct xe_ttm_vram_mgr_resource *vres = to_xe_ttm_vram_mgr_resource(res); + struct xe_res_cursor cursor; + struct scatterlist *sg; + int num_entries = 0; + int i, r; + + if (vres->used_visible_size < res->size) + return -EOPNOTSUPP; + + *sgt = kmalloc(sizeof(**sgt), GFP_KERNEL); + if (!*sgt) + return -ENOMEM; + + /* Determine the number of DRM_BUDDY blocks to export */ + xe_res_first(res, offset, length, &cursor); + while (cursor.remaining) { + num_entries++; + xe_res_next(&cursor, cursor.size); + } + + r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL); + if (r) + goto error_free; + + /* Initialize scatterlist nodes of sg_table */ + for_each_sgtable_sg((*sgt), sg, i) + sg->length = 0; + + /* + * Walk down DRM_BUDDY blocks to populate scatterlist nodes + * @note: Use iterator api to get first the DRM_BUDDY block + * and the number of bytes from it. 
Access the following + * DRM_BUDDY block(s) if more buffer needs to exported + */ + xe_res_first(res, offset, length, &cursor); + for_each_sgtable_sg((*sgt), sg, i) { + phys_addr_t phys = cursor.start + tile->mem.vram.io_start; + size_t size = cursor.size; + dma_addr_t addr; + + addr = dma_map_resource(dev, phys, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + r = dma_mapping_error(dev, addr); + if (r) + goto error_unmap; + + sg_set_page(sg, NULL, size, 0); + sg_dma_address(sg) = addr; + sg_dma_len(sg) = size; + + xe_res_next(&cursor, cursor.size); + } + + return 0; + +error_unmap: + for_each_sgtable_sg((*sgt), sg, i) { + if (!sg->length) + continue; + + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); + } + sg_free_table(*sgt); + +error_free: + kfree(*sgt); + return r; +} + +void xe_ttm_vram_mgr_free_sgt(struct device *dev, enum dma_data_direction dir, + struct sg_table *sgt) +{ + struct scatterlist *sg; + int i; + + for_each_sgtable_sg(sgt, sg, i) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); + sg_free_table(sgt); + kfree(sgt); +} + +u64 xe_ttm_vram_get_cpu_visible_size(struct ttm_resource_manager *man) +{ + struct xe_ttm_vram_mgr *mgr = to_xe_ttm_vram_mgr(man); + + return mgr->visible_size; +} + +void xe_ttm_vram_get_used(struct ttm_resource_manager *man, + u64 *used, u64 *used_visible) +{ + struct xe_ttm_vram_mgr *mgr = to_xe_ttm_vram_mgr(man); + + mutex_lock(&mgr->lock); + *used = mgr->mm.size - mgr->mm.avail; + *used_visible = mgr->visible_size - mgr->visible_avail; + mutex_unlock(&mgr->lock); +} diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.h b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.h new file mode 100644 index 000000000000..d184e19a9230 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_TTM_VRAM_MGR_H_ +#define _XE_TTM_VRAM_MGR_H_ + +#include "xe_ttm_vram_mgr_types.h" + +enum dma_data_direction; +struct xe_device; +struct xe_tile; + +int __xe_ttm_vram_mgr_init(struct xe_device *xe, struct xe_ttm_vram_mgr *mgr, + u32 mem_type, u64 size, u64 io_size, + u64 default_page_size); +int xe_ttm_vram_mgr_init(struct xe_tile *tile, struct xe_ttm_vram_mgr *mgr); +int xe_ttm_vram_mgr_alloc_sgt(struct xe_device *xe, + struct ttm_resource *res, + u64 offset, u64 length, + struct device *dev, + enum dma_data_direction dir, + struct sg_table **sgt); +void xe_ttm_vram_mgr_free_sgt(struct device *dev, enum dma_data_direction dir, + struct sg_table *sgt); + +u64 xe_ttm_vram_get_cpu_visible_size(struct ttm_resource_manager *man); +void xe_ttm_vram_get_used(struct ttm_resource_manager *man, + u64 *used, u64 *used_visible); + +static inline struct xe_ttm_vram_mgr_resource * +to_xe_ttm_vram_mgr_resource(struct ttm_resource *res) +{ + return container_of(res, struct xe_ttm_vram_mgr_resource, base); +} + +static inline struct xe_ttm_vram_mgr * +to_xe_ttm_vram_mgr(struct ttm_resource_manager *man) +{ + return container_of(man, struct xe_ttm_vram_mgr, manager); +} + +#endif diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h b/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h new file mode 100644 index 000000000000..2d75cf126289 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_TTM_VRAM_MGR_TYPES_H_ +#define _XE_TTM_VRAM_MGR_TYPES_H_ + +#include <drm/drm_buddy.h> +#include <drm/ttm/ttm_device.h> 
+ +struct xe_mem_region; + +/** + * struct xe_ttm_vram_mgr - XE TTM VRAM manager + * + * Manages placement of TTM resource in VRAM. + */ +struct xe_ttm_vram_mgr { + /** @manager: Base TTM resource manager */ + struct ttm_resource_manager manager; + /** @mm: DRM buddy allocator which manages the VRAM */ + struct drm_buddy mm; + /** @vram: ptr to details of associated VRAM region */ + struct xe_mem_region *vram; + /** @visible_size: Proped size of the CPU visible portion */ + u64 visible_size; + /** @visible_avail: CPU visible portion still unallocated */ + u64 visible_avail; + /** @default_page_size: default page size */ + u64 default_page_size; + /** @lock: protects allocations of VRAM */ + struct mutex lock; + /** @mem_type: The TTM memory type */ + u32 mem_type; +}; + +/** + * struct xe_ttm_vram_mgr_resource - XE TTM VRAM resource + */ +struct xe_ttm_vram_mgr_resource { + /** @base: Base TTM resource */ + struct ttm_resource base; + /** @blocks: list of DRM buddy blocks */ + struct list_head blocks; + /** @used_visible_size: How many CPU visible bytes this resource is using */ + u64 used_visible_size; + /** @flags: flags associated with the resource */ + unsigned long flags; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_tuning.c b/drivers/gpu/drm/xe/xe_tuning.c new file mode 100644 index 000000000000..53ccd338fd8c --- /dev/null +++ b/drivers/gpu/drm/xe/xe_tuning.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_tuning.h" + +#include <kunit/visibility.h> + +#include "regs/xe_gt_regs.h" +#include "xe_gt_types.h" +#include "xe_platform_types.h" +#include "xe_rtp.h" + +#undef XE_REG_MCR +#define XE_REG_MCR(...) XE_REG(__VA_ARGS__, .mcr = 1) + +static const struct xe_rtp_entry_sr gt_tunings[] = { + { XE_RTP_NAME("Tuning: Blend Fill Caching Optimization Disable"), + XE_RTP_RULES(PLATFORM(DG2)), + XE_RTP_ACTIONS(SET(XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS)) + }, + { XE_RTP_NAME("Tuning: 32B Access Enable"), + XE_RTP_RULES(PLATFORM(DG2)), + XE_RTP_ACTIONS(SET(XEHP_SQCM, EN_32B_ACCESS)) + }, + + /* Xe2 */ + + { XE_RTP_NAME("Tuning: L3 cache"), + XE_RTP_RULES(GRAPHICS_VERSION(2004)), + XE_RTP_ACTIONS(FIELD_SET(XEHP_L3SQCREG5, L3_PWM_TIMER_INIT_VAL_MASK, + REG_FIELD_PREP(L3_PWM_TIMER_INIT_VAL_MASK, 0x7f))) + }, + { XE_RTP_NAME("Tuning: L3 cache - media"), + XE_RTP_RULES(MEDIA_VERSION(2000)), + XE_RTP_ACTIONS(FIELD_SET(XE2LPM_L3SQCREG5, L3_PWM_TIMER_INIT_VAL_MASK, + REG_FIELD_PREP(L3_PWM_TIMER_INIT_VAL_MASK, 0x7f))) + }, + + {} +}; + +static const struct xe_rtp_entry_sr engine_tunings[] = { + { XE_RTP_NAME("Tuning: Set Indirect State Override"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1271), + ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(SAMPLER_MODE, INDIRECT_STATE_BASE_ADDR_OVERRIDE)) + }, + {} +}; + +static const struct xe_rtp_entry_sr lrc_tunings[] = { + { XE_RTP_NAME("Tuning: ganged timer, also known as 16011163337"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1210), ENGINE_CLASS(RENDER)), + /* read verification is ignored due to 1608008084. 
*/ + XE_RTP_ACTIONS(FIELD_SET_NO_READ_MASK(FF_MODE2, + FF_MODE2_GS_TIMER_MASK, + FF_MODE2_GS_TIMER_224)) + }, + + /* DG2 */ + + { XE_RTP_NAME("Tuning: L3 cache"), + XE_RTP_RULES(PLATFORM(DG2), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(FIELD_SET(XEHP_L3SQCREG5, L3_PWM_TIMER_INIT_VAL_MASK, + REG_FIELD_PREP(L3_PWM_TIMER_INIT_VAL_MASK, 0x7f))) + }, + { XE_RTP_NAME("Tuning: TDS gang timer"), + XE_RTP_RULES(PLATFORM(DG2), ENGINE_CLASS(RENDER)), + /* read verification is ignored as in i915 - need to check enabling */ + XE_RTP_ACTIONS(FIELD_SET_NO_READ_MASK(XEHP_FF_MODE2, + FF_MODE2_TDS_TIMER_MASK, + FF_MODE2_TDS_TIMER_128)) + }, + { XE_RTP_NAME("Tuning: TBIMR fast clip"), + XE_RTP_RULES(PLATFORM(DG2), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(CHICKEN_RASTER_2, TBIMR_FAST_CLIP)) + }, + + /* Xe_LPG */ + + { XE_RTP_NAME("Tuning: L3 cache"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1270, 1271), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(FIELD_SET(XEHP_L3SQCREG5, L3_PWM_TIMER_INIT_VAL_MASK, + REG_FIELD_PREP(L3_PWM_TIMER_INIT_VAL_MASK, 0x7f))) + }, + + {} +}; + +void xe_tuning_process_gt(struct xe_gt *gt) +{ + struct xe_rtp_process_ctx ctx = XE_RTP_PROCESS_CTX_INITIALIZER(gt); + + xe_rtp_process_to_sr(&ctx, gt_tunings, >->reg_sr); +} +EXPORT_SYMBOL_IF_KUNIT(xe_tuning_process_gt); + +void xe_tuning_process_engine(struct xe_hw_engine *hwe) +{ + struct xe_rtp_process_ctx ctx = XE_RTP_PROCESS_CTX_INITIALIZER(hwe); + + xe_rtp_process_to_sr(&ctx, engine_tunings, &hwe->reg_sr); +} +EXPORT_SYMBOL_IF_KUNIT(xe_tuning_process_engine); + +/** + * xe_tuning_process_lrc - process lrc tunings + * @hwe: engine instance to process tunings for + * + * Process LRC table for this platform, saving in @hwe all the tunings that need + * to be applied on context restore. These are tunings touching registers that + * are part of the HW context image. + */ +void xe_tuning_process_lrc(struct xe_hw_engine *hwe) +{ + struct xe_rtp_process_ctx ctx = XE_RTP_PROCESS_CTX_INITIALIZER(hwe); + + xe_rtp_process_to_sr(&ctx, lrc_tunings, &hwe->reg_lrc); +} diff --git a/drivers/gpu/drm/xe/xe_tuning.h b/drivers/gpu/drm/xe/xe_tuning.h new file mode 100644 index 000000000000..4f9c3ac3b516 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_tuning.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_TUNING_ +#define _XE_TUNING_ + +struct xe_gt; +struct xe_hw_engine; + +void xe_tuning_process_gt(struct xe_gt *gt); +void xe_tuning_process_engine(struct xe_hw_engine *hwe); +void xe_tuning_process_lrc(struct xe_hw_engine *hwe); + +#endif diff --git a/drivers/gpu/drm/xe/xe_uc.c b/drivers/gpu/drm/xe/xe_uc.c new file mode 100644 index 000000000000..25e1ddfd2f86 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_uc.c @@ -0,0 +1,258 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_uc.h" + +#include "xe_device.h" +#include "xe_gsc.h" +#include "xe_gt.h" +#include "xe_guc.h" +#include "xe_guc_pc.h" +#include "xe_guc_submit.h" +#include "xe_huc.h" +#include "xe_uc_fw.h" +#include "xe_wopcm.h" + +static struct xe_gt * +uc_to_gt(struct xe_uc *uc) +{ + return container_of(uc, struct xe_gt, uc); +} + +static struct xe_device * +uc_to_xe(struct xe_uc *uc) +{ + return gt_to_xe(uc_to_gt(uc)); +} + +/* Should be called once at driver load only */ +int xe_uc_init(struct xe_uc *uc) +{ + int ret; + + /* + * We call the GuC/HuC/GSC init functions even if GuC submission is off + * to correctly move our tracking of the FW state to "disabled". 
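+	 * Only the WOPCM and GuC submission setup further down is skipped
+	 * when xe_device_uc_enabled() reports that uC usage is disabled.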
+ */ + + ret = xe_guc_init(&uc->guc); + if (ret) + goto err; + + ret = xe_huc_init(&uc->huc); + if (ret) + goto err; + + ret = xe_gsc_init(&uc->gsc); + if (ret) + goto err; + + if (!xe_device_uc_enabled(uc_to_xe(uc))) + return 0; + + ret = xe_wopcm_init(&uc->wopcm); + if (ret) + goto err; + + ret = xe_guc_submit_init(&uc->guc); + if (ret) + goto err; + + return 0; + +err: + return ret; +} + +/** + * xe_uc_init_post_hwconfig - init Uc post hwconfig load + * @uc: The UC object + * + * Return: 0 on success, negative error code on error. + */ +int xe_uc_init_post_hwconfig(struct xe_uc *uc) +{ + int err; + + /* GuC submission not enabled, nothing to do */ + if (!xe_device_uc_enabled(uc_to_xe(uc))) + return 0; + + err = xe_uc_sanitize_reset(uc); + if (err) + return err; + + err = xe_guc_init_post_hwconfig(&uc->guc); + if (err) + return err; + + return xe_gsc_init_post_hwconfig(&uc->gsc); +} + +static int uc_reset(struct xe_uc *uc) +{ + struct xe_device *xe = uc_to_xe(uc); + int ret; + + ret = xe_guc_reset(&uc->guc); + if (ret) { + drm_err(&xe->drm, "Failed to reset GuC, ret = %d\n", ret); + return ret; + } + + return 0; +} + +static void xe_uc_sanitize(struct xe_uc *uc) +{ + xe_huc_sanitize(&uc->huc); + xe_guc_sanitize(&uc->guc); +} + +int xe_uc_sanitize_reset(struct xe_uc *uc) +{ + xe_uc_sanitize(uc); + + return uc_reset(uc); +} + +/** + * xe_uc_init_hwconfig - minimally init Uc, read and parse hwconfig + * @uc: The UC object + * + * Return: 0 on success, negative error code on error. + */ +int xe_uc_init_hwconfig(struct xe_uc *uc) +{ + int ret; + + /* GuC submission not enabled, nothing to do */ + if (!xe_device_uc_enabled(uc_to_xe(uc))) + return 0; + + ret = xe_guc_min_load_for_hwconfig(&uc->guc); + if (ret) + return ret; + + return 0; +} + +/* + * Should be called during driver load, after every GT reset, and after every + * suspend to reload / auth the firmwares. 
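+ * The sequence below is: HuC upload, GuC upload, GuC communication
+ * enable, default LRC recording, GuC post-load init, GuC PC start,
+ * HuC authentication via GuC and, finally, the asynchronous GSC load.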
+ */ +int xe_uc_init_hw(struct xe_uc *uc) +{ + int ret; + + /* GuC submission not enabled, nothing to do */ + if (!xe_device_uc_enabled(uc_to_xe(uc))) + return 0; + + ret = xe_huc_upload(&uc->huc); + if (ret) + return ret; + + ret = xe_guc_upload(&uc->guc); + if (ret) + return ret; + + ret = xe_guc_enable_communication(&uc->guc); + if (ret) + return ret; + + ret = xe_gt_record_default_lrcs(uc_to_gt(uc)); + if (ret) + return ret; + + ret = xe_guc_post_load_init(&uc->guc); + if (ret) + return ret; + + ret = xe_guc_pc_start(&uc->guc.pc); + if (ret) + return ret; + + /* We don't fail the driver load if HuC fails to auth, but let's warn */ + ret = xe_huc_auth(&uc->huc, XE_HUC_AUTH_VIA_GUC); + xe_gt_assert(uc_to_gt(uc), !ret); + + /* GSC load is async */ + xe_gsc_load_start(&uc->gsc); + + return 0; +} + +int xe_uc_fini_hw(struct xe_uc *uc) +{ + return xe_uc_sanitize_reset(uc); +} + +int xe_uc_reset_prepare(struct xe_uc *uc) +{ + /* GuC submission not enabled, nothing to do */ + if (!xe_device_uc_enabled(uc_to_xe(uc))) + return 0; + + return xe_guc_reset_prepare(&uc->guc); +} + +void xe_uc_gucrc_disable(struct xe_uc *uc) +{ + XE_WARN_ON(xe_guc_pc_gucrc_disable(&uc->guc.pc)); +} + +void xe_uc_stop_prepare(struct xe_uc *uc) +{ + xe_gsc_wait_for_worker_completion(&uc->gsc); + xe_guc_stop_prepare(&uc->guc); +} + +int xe_uc_stop(struct xe_uc *uc) +{ + /* GuC submission not enabled, nothing to do */ + if (!xe_device_uc_enabled(uc_to_xe(uc))) + return 0; + + return xe_guc_stop(&uc->guc); +} + +int xe_uc_start(struct xe_uc *uc) +{ + /* GuC submission not enabled, nothing to do */ + if (!xe_device_uc_enabled(uc_to_xe(uc))) + return 0; + + return xe_guc_start(&uc->guc); +} + +static void uc_reset_wait(struct xe_uc *uc) +{ + int ret; + +again: + xe_guc_reset_wait(&uc->guc); + + ret = xe_uc_reset_prepare(uc); + if (ret) + goto again; +} + +int xe_uc_suspend(struct xe_uc *uc) +{ + int ret; + + /* GuC submission not enabled, nothing to do */ + if (!xe_device_uc_enabled(uc_to_xe(uc))) + return 0; + + uc_reset_wait(uc); + + ret = xe_uc_stop(uc); + if (ret) + return ret; + + return xe_guc_suspend(&uc->guc); +} diff --git a/drivers/gpu/drm/xe/xe_uc.h b/drivers/gpu/drm/xe/xe_uc.h new file mode 100644 index 000000000000..5d5110c0c834 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_uc.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_UC_H_ +#define _XE_UC_H_ + +#include "xe_uc_types.h" + +int xe_uc_init(struct xe_uc *uc); +int xe_uc_init_hwconfig(struct xe_uc *uc); +int xe_uc_init_post_hwconfig(struct xe_uc *uc); +int xe_uc_init_hw(struct xe_uc *uc); +int xe_uc_fini_hw(struct xe_uc *uc); +void xe_uc_gucrc_disable(struct xe_uc *uc); +int xe_uc_reset_prepare(struct xe_uc *uc); +void xe_uc_stop_prepare(struct xe_uc *uc); +int xe_uc_stop(struct xe_uc *uc); +int xe_uc_start(struct xe_uc *uc); +int xe_uc_suspend(struct xe_uc *uc); +int xe_uc_sanitize_reset(struct xe_uc *uc); + +#endif diff --git a/drivers/gpu/drm/xe/xe_uc_debugfs.c b/drivers/gpu/drm/xe/xe_uc_debugfs.c new file mode 100644 index 000000000000..0a39ec5a6e99 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_uc_debugfs.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include <drm/drm_debugfs.h> + +#include "xe_gt.h" +#include "xe_guc_debugfs.h" +#include "xe_huc_debugfs.h" +#include "xe_macros.h" +#include "xe_uc_debugfs.h" + +void xe_uc_debugfs_register(struct xe_uc *uc, struct dentry *parent) +{ + struct dentry *root; + + root = 
debugfs_create_dir("uc", parent); + if (IS_ERR(root)) { + XE_WARN_ON("Create UC directory failed"); + return; + } + + xe_guc_debugfs_register(&uc->guc, root); + xe_huc_debugfs_register(&uc->huc, root); +} diff --git a/drivers/gpu/drm/xe/xe_uc_debugfs.h b/drivers/gpu/drm/xe/xe_uc_debugfs.h new file mode 100644 index 000000000000..a13382df2bd7 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_uc_debugfs.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_UC_DEBUGFS_H_ +#define _XE_UC_DEBUGFS_H_ + +struct dentry; +struct xe_uc; + +void xe_uc_debugfs_register(struct xe_uc *uc, struct dentry *parent); + +#endif diff --git a/drivers/gpu/drm/xe/xe_uc_fw.c b/drivers/gpu/drm/xe/xe_uc_fw.c new file mode 100644 index 000000000000..9dff96dfe455 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_uc_fw.c @@ -0,0 +1,882 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include <linux/bitfield.h> +#include <linux/firmware.h> + +#include <drm/drm_managed.h> + +#include "regs/xe_guc_regs.h" +#include "xe_bo.h" +#include "xe_device_types.h" +#include "xe_force_wake.h" +#include "xe_gsc.h" +#include "xe_gt.h" +#include "xe_map.h" +#include "xe_mmio.h" +#include "xe_module.h" +#include "xe_uc_fw.h" + +/* + * List of required GuC and HuC binaries per-platform. They must be ordered + * based on platform, from newer to older. + * + * Versioning follows the guidelines from + * Documentation/driver-api/firmware/firmware-usage-guidelines.rst. There is a + * distinction for platforms being officially supported by the driver or not. + * Platforms not available publicly or not yet officially supported by the + * driver (under force-probe), use the mmp_ver(): the firmware autoselect logic + * will select the firmware from disk with filename that matches the full + * "mpp version", i.e. major.minor.patch. mmp_ver() should only be used for + * this case. + * + * For platforms officially supported by the driver, the filename always only + * ever contains the major version (GuC) or no version at all (HuC). + * + * After loading the file, the driver parses the versions embedded in the blob. + * The major version needs to match a major version supported by the driver (if + * any). The minor version is also checked and a notice emitted to the log if + * the version found is smaller than the version wanted. This is done only for + * informational purposes so users may have a chance to upgrade, but the driver + * still loads and use the older firmware. + * + * Examples: + * + * 1) Platform officially supported by i915 - using Tigerlake as example. + * Driver loads the following firmware blobs from disk: + * + * - i915/tgl_guc_<major>.bin + * - i915/tgl_huc.bin + * + * <major> number for GuC is checked that it matches the version inside + * the blob. <minor> version is checked and if smaller than the expected + * an info message is emitted about that. + * + * 1) XE_<FUTUREINTELPLATFORM>, still under require_force_probe. Using + * "wipplat" as a short-name. Driver loads the following firmware blobs + * from disk: + * + * - xe/wipplat_guc_<major>.<minor>.<patch>.bin + * - xe/wipplat_huc_<major>.<minor>.<patch>.bin + * + * <major> and <minor> are checked that they match the version inside + * the blob. Both of them need to match exactly what the driver is + * expecting, otherwise it fails. + * + * 3) Platform officially supported by xe and out of force-probe. Using + * "plat" as a short-name. 
Except for the different directory, the + * behavior is the same as (1). Driver loads the following firmware + * blobs from disk: + * + * - xe/plat_guc_<major>.bin + * - xe/plat_huc.bin + * + * <major> number for GuC is checked that it matches the version inside + * the blob. <minor> version is checked and if smaller than the expected + * an info message is emitted about that. + * + * For the platforms already released with a major version, they should never be + * removed from the table. Instead new entries with newer versions may be added + * before them, so they take precedence. + * + * TODO: Currently there's no fallback on major version. That's because xe + * driver only supports the one major version of each firmware in the table. + * This needs to be fixed when the major version of GuC is updated. + */ + +struct uc_fw_entry { + enum xe_platform platform; + struct { + const char *path; + u16 major; + u16 minor; + bool full_ver_required; + }; +}; + +struct fw_blobs_by_type { + const struct uc_fw_entry *entries; + u32 count; +}; + +#define XE_GUC_FIRMWARE_DEFS(fw_def, mmp_ver, major_ver) \ + fw_def(METEORLAKE, major_ver(i915, guc, mtl, 70, 7)) \ + fw_def(DG2, major_ver(i915, guc, dg2, 70, 5)) \ + fw_def(DG1, major_ver(i915, guc, dg1, 70, 5)) \ + fw_def(ALDERLAKE_N, major_ver(i915, guc, tgl, 70, 5)) \ + fw_def(ALDERLAKE_P, major_ver(i915, guc, adlp, 70, 5)) \ + fw_def(ALDERLAKE_S, major_ver(i915, guc, tgl, 70, 5)) \ + fw_def(ROCKETLAKE, major_ver(i915, guc, tgl, 70, 5)) \ + fw_def(TIGERLAKE, major_ver(i915, guc, tgl, 70, 5)) + +#define XE_HUC_FIRMWARE_DEFS(fw_def, mmp_ver, no_ver) \ + fw_def(METEORLAKE, no_ver(i915, huc_gsc, mtl)) \ + fw_def(DG1, no_ver(i915, huc, dg1)) \ + fw_def(ALDERLAKE_P, no_ver(i915, huc, tgl)) \ + fw_def(ALDERLAKE_S, no_ver(i915, huc, tgl)) \ + fw_def(ROCKETLAKE, no_ver(i915, huc, tgl)) \ + fw_def(TIGERLAKE, no_ver(i915, huc, tgl)) + +/* for the GSC FW we match the compatibility version and not the release one */ +#define XE_GSC_FIRMWARE_DEFS(fw_def, major_ver) \ + fw_def(METEORLAKE, major_ver(i915, gsc, mtl, 1, 0)) + +#define MAKE_FW_PATH(dir__, uc__, shortname__, version__) \ + __stringify(dir__) "/" __stringify(shortname__) "_" __stringify(uc__) version__ ".bin" + +#define fw_filename_mmp_ver(dir_, uc_, shortname_, a, b, c) \ + MAKE_FW_PATH(dir_, uc_, shortname_, "_" __stringify(a ## . ## b ## . 
## c)) +#define fw_filename_major_ver(dir_, uc_, shortname_, a, b) \ + MAKE_FW_PATH(dir_, uc_, shortname_, "_" __stringify(a)) +#define fw_filename_no_ver(dir_, uc_, shortname_) \ + MAKE_FW_PATH(dir_, uc_, shortname_, "") + +#define uc_fw_entry_mmp_ver(dir_, uc_, shortname_, a, b, c) \ + { fw_filename_mmp_ver(dir_, uc_, shortname_, a, b, c), \ + a, b, true } +#define uc_fw_entry_major_ver(dir_, uc_, shortname_, a, b) \ + { fw_filename_major_ver(dir_, uc_, shortname_, a, b), \ + a, b } +#define uc_fw_entry_no_ver(dir_, uc_, shortname_) \ + { fw_filename_no_ver(dir_, uc_, shortname_), \ + 0, 0 } + +/* All blobs need to be declared via MODULE_FIRMWARE() */ +#define XE_UC_MODULE_FIRMWARE(platform__, fw_filename) \ + MODULE_FIRMWARE(fw_filename); + +#define XE_UC_FW_ENTRY(platform__, entry__) \ + { \ + .platform = XE_ ## platform__, \ + entry__, \ + }, + +XE_GUC_FIRMWARE_DEFS(XE_UC_MODULE_FIRMWARE, + fw_filename_mmp_ver, fw_filename_major_ver) +XE_HUC_FIRMWARE_DEFS(XE_UC_MODULE_FIRMWARE, + fw_filename_mmp_ver, fw_filename_no_ver) +XE_GSC_FIRMWARE_DEFS(XE_UC_MODULE_FIRMWARE, fw_filename_major_ver) + +static struct xe_gt * +__uc_fw_to_gt(struct xe_uc_fw *uc_fw, enum xe_uc_fw_type type) +{ + XE_WARN_ON(type >= XE_UC_FW_NUM_TYPES); + + switch (type) { + case XE_UC_FW_TYPE_GUC: + return container_of(uc_fw, struct xe_gt, uc.guc.fw); + case XE_UC_FW_TYPE_HUC: + return container_of(uc_fw, struct xe_gt, uc.huc.fw); + case XE_UC_FW_TYPE_GSC: + return container_of(uc_fw, struct xe_gt, uc.gsc.fw); + default: + return NULL; + } +} + +static struct xe_gt *uc_fw_to_gt(struct xe_uc_fw *uc_fw) +{ + return __uc_fw_to_gt(uc_fw, uc_fw->type); +} + +static struct xe_device *uc_fw_to_xe(struct xe_uc_fw *uc_fw) +{ + return gt_to_xe(uc_fw_to_gt(uc_fw)); +} + +static void +uc_fw_auto_select(struct xe_device *xe, struct xe_uc_fw *uc_fw) +{ + static const struct uc_fw_entry entries_guc[] = { + XE_GUC_FIRMWARE_DEFS(XE_UC_FW_ENTRY, + uc_fw_entry_mmp_ver, + uc_fw_entry_major_ver) + }; + static const struct uc_fw_entry entries_huc[] = { + XE_HUC_FIRMWARE_DEFS(XE_UC_FW_ENTRY, + uc_fw_entry_mmp_ver, + uc_fw_entry_no_ver) + }; + static const struct uc_fw_entry entries_gsc[] = { + XE_GSC_FIRMWARE_DEFS(XE_UC_FW_ENTRY, uc_fw_entry_major_ver) + }; + static const struct fw_blobs_by_type blobs_all[XE_UC_FW_NUM_TYPES] = { + [XE_UC_FW_TYPE_GUC] = { entries_guc, ARRAY_SIZE(entries_guc) }, + [XE_UC_FW_TYPE_HUC] = { entries_huc, ARRAY_SIZE(entries_huc) }, + [XE_UC_FW_TYPE_GSC] = { entries_gsc, ARRAY_SIZE(entries_gsc) }, + }; + static const struct uc_fw_entry *entries; + enum xe_platform p = xe->info.platform; + u32 count; + int i; + + xe_assert(xe, uc_fw->type < ARRAY_SIZE(blobs_all)); + entries = blobs_all[uc_fw->type].entries; + count = blobs_all[uc_fw->type].count; + + for (i = 0; i < count && p <= entries[i].platform; i++) { + if (p == entries[i].platform) { + uc_fw->path = entries[i].path; + uc_fw->versions.wanted.major = entries[i].major; + uc_fw->versions.wanted.minor = entries[i].minor; + uc_fw->full_ver_required = entries[i].full_ver_required; + + if (uc_fw->type == XE_UC_FW_TYPE_GSC) + uc_fw->versions.wanted_type = XE_UC_FW_VER_COMPATIBILITY; + else + uc_fw->versions.wanted_type = XE_UC_FW_VER_RELEASE; + + break; + } + } +} + +static void +uc_fw_override(struct xe_uc_fw *uc_fw) +{ + char *path_override = NULL; + + /* empty string disables, but it's not allowed for GuC */ + switch (uc_fw->type) { + case XE_UC_FW_TYPE_GUC: + if (xe_modparam.guc_firmware_path && *xe_modparam.guc_firmware_path) + path_override = 
xe_modparam.guc_firmware_path; + break; + case XE_UC_FW_TYPE_HUC: + path_override = xe_modparam.huc_firmware_path; + break; + case XE_UC_FW_TYPE_GSC: + path_override = xe_modparam.gsc_firmware_path; + break; + default: + break; + } + + if (path_override) { + uc_fw->path = path_override; + uc_fw->user_overridden = true; + } +} + +/** + * xe_uc_fw_copy_rsa - copy fw RSA to buffer + * + * @uc_fw: uC firmware + * @dst: dst buffer + * @max_len: max number of bytes to copy + * + * Return: number of copied bytes. + */ +size_t xe_uc_fw_copy_rsa(struct xe_uc_fw *uc_fw, void *dst, u32 max_len) +{ + struct xe_device *xe = uc_fw_to_xe(uc_fw); + u32 size = min_t(u32, uc_fw->rsa_size, max_len); + + xe_assert(xe, !(size % 4)); + xe_assert(xe, xe_uc_fw_is_available(uc_fw)); + + xe_map_memcpy_from(xe, dst, &uc_fw->bo->vmap, + xe_uc_fw_rsa_offset(uc_fw), size); + + return size; +} + +static void uc_fw_fini(struct drm_device *drm, void *arg) +{ + struct xe_uc_fw *uc_fw = arg; + + if (!xe_uc_fw_is_available(uc_fw)) + return; + + xe_uc_fw_change_status(uc_fw, XE_UC_FIRMWARE_SELECTED); +} + +static void guc_read_css_info(struct xe_uc_fw *uc_fw, struct uc_css_header *css) +{ + struct xe_gt *gt = uc_fw_to_gt(uc_fw); + struct xe_uc_fw_version *release = &uc_fw->versions.found[XE_UC_FW_VER_RELEASE]; + struct xe_uc_fw_version *compatibility = &uc_fw->versions.found[XE_UC_FW_VER_COMPATIBILITY]; + + xe_gt_assert(gt, uc_fw->type == XE_UC_FW_TYPE_GUC); + xe_gt_assert(gt, release->major >= 70); + + if (release->major > 70 || release->minor >= 6) { + /* v70.6.0 adds CSS header support */ + compatibility->major = FIELD_GET(CSS_SW_VERSION_UC_MAJOR, + css->submission_version); + compatibility->minor = FIELD_GET(CSS_SW_VERSION_UC_MINOR, + css->submission_version); + compatibility->patch = FIELD_GET(CSS_SW_VERSION_UC_PATCH, + css->submission_version); + } else if (release->minor >= 3) { + /* v70.3.0 introduced v1.1.0 */ + compatibility->major = 1; + compatibility->minor = 1; + compatibility->patch = 0; + } else { + /* v70.0.0 introduced v1.0.0 */ + compatibility->major = 1; + compatibility->minor = 0; + compatibility->patch = 0; + } + + uc_fw->private_data_size = css->private_data_size; +} + +int xe_uc_fw_check_version_requirements(struct xe_uc_fw *uc_fw) +{ + struct xe_device *xe = uc_fw_to_xe(uc_fw); + struct xe_uc_fw_version *wanted = &uc_fw->versions.wanted; + struct xe_uc_fw_version *found = &uc_fw->versions.found[uc_fw->versions.wanted_type]; + + /* Driver has no requirement on any version, any is good. */ + if (!wanted->major) + return 0; + + /* + * If full version is required, both major and minor should match. + * Otherwise, at least the major version. 
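+	 * For example, with a wanted version of 70.5: a 70.7 blob is accepted,
+	 * a 70.3 blob is accepted with a notice recommending an update, a 69.x
+	 * blob is rejected, and with full_ver_required any major.minor other
+	 * than 70.5 is rejected (unless the path was user-overridden).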
+ */ + if (wanted->major != found->major || + (uc_fw->full_ver_required && wanted->minor != found->minor)) { + drm_notice(&xe->drm, "%s firmware %s: unexpected version: %u.%u != %u.%u\n", + xe_uc_fw_type_repr(uc_fw->type), uc_fw->path, + found->major, found->minor, + wanted->major, wanted->minor); + goto fail; + } + + if (wanted->minor > found->minor) { + drm_notice(&xe->drm, "%s firmware (%u.%u) is recommended, but only (%u.%u) was found in %s\n", + xe_uc_fw_type_repr(uc_fw->type), + wanted->major, wanted->minor, + found->major, found->minor, + uc_fw->path); + drm_info(&xe->drm, "Consider updating your linux-firmware pkg or downloading from %s\n", + XE_UC_FIRMWARE_URL); + } + + return 0; + +fail: + if (xe_uc_fw_is_overridden(uc_fw)) + return 0; + + return -ENOEXEC; +} + +/* Refer to the "CSS-based Firmware Layout" documentation entry for details */ +static int parse_css_header(struct xe_uc_fw *uc_fw, const void *fw_data, size_t fw_size) +{ + struct xe_device *xe = uc_fw_to_xe(uc_fw); + struct xe_uc_fw_version *release = &uc_fw->versions.found[XE_UC_FW_VER_RELEASE]; + struct uc_css_header *css; + size_t size; + + /* Check the size of the blob before examining buffer contents */ + if (unlikely(fw_size < sizeof(struct uc_css_header))) { + drm_warn(&xe->drm, "%s firmware %s: invalid size: %zu < %zu\n", + xe_uc_fw_type_repr(uc_fw->type), uc_fw->path, + fw_size, sizeof(struct uc_css_header)); + return -ENODATA; + } + + css = (struct uc_css_header *)fw_data; + + /* Check integrity of size values inside CSS header */ + size = (css->header_size_dw - css->key_size_dw - css->modulus_size_dw - + css->exponent_size_dw) * sizeof(u32); + if (unlikely(size != sizeof(struct uc_css_header))) { + drm_warn(&xe->drm, + "%s firmware %s: unexpected header size: %zu != %zu\n", + xe_uc_fw_type_repr(uc_fw->type), uc_fw->path, + fw_size, sizeof(struct uc_css_header)); + return -EPROTO; + } + + /* uCode size must calculated from other sizes */ + uc_fw->ucode_size = (css->size_dw - css->header_size_dw) * sizeof(u32); + + /* now RSA */ + uc_fw->rsa_size = css->key_size_dw * sizeof(u32); + + /* At least, it should have header, uCode and RSA. Size of all three. 
*/ + size = sizeof(struct uc_css_header) + uc_fw->ucode_size + + uc_fw->rsa_size; + if (unlikely(fw_size < size)) { + drm_warn(&xe->drm, "%s firmware %s: invalid size: %zu < %zu\n", + xe_uc_fw_type_repr(uc_fw->type), uc_fw->path, + fw_size, size); + return -ENOEXEC; + } + + /* Get version numbers from the CSS header */ + release->major = FIELD_GET(CSS_SW_VERSION_UC_MAJOR, css->sw_version); + release->minor = FIELD_GET(CSS_SW_VERSION_UC_MINOR, css->sw_version); + release->patch = FIELD_GET(CSS_SW_VERSION_UC_PATCH, css->sw_version); + + if (uc_fw->type == XE_UC_FW_TYPE_GUC) + guc_read_css_info(uc_fw, css); + + return 0; +} + +static bool is_cpd_header(const void *data) +{ + const u32 *marker = data; + + return *marker == GSC_CPD_HEADER_MARKER; +} + +static u32 entry_offset(const struct gsc_cpd_header_v2 *header, const char *name) +{ + const struct gsc_cpd_entry *entry; + int i; + + entry = (void *)header + header->header_length; + + for (i = 0; i < header->num_of_entries; i++, entry++) + if (strcmp(entry->name, name) == 0) + return entry->offset & GSC_CPD_ENTRY_OFFSET_MASK; + + return 0; +} + +/* Refer to the "GSC-based Firmware Layout" documentation entry for details */ +static int parse_cpd_header(struct xe_uc_fw *uc_fw, const void *data, size_t size, + const char *manifest_entry, const char *css_entry) +{ + struct xe_gt *gt = uc_fw_to_gt(uc_fw); + struct xe_device *xe = gt_to_xe(gt); + const struct gsc_cpd_header_v2 *header = data; + struct xe_uc_fw_version *release = &uc_fw->versions.found[XE_UC_FW_VER_RELEASE]; + const struct gsc_manifest_header *manifest; + size_t min_size = sizeof(*header); + u32 offset; + + /* manifest_entry is mandatory, css_entry is optional */ + xe_assert(xe, manifest_entry); + + if (size < min_size || !is_cpd_header(header)) + return -ENOENT; + + if (header->header_length < sizeof(struct gsc_cpd_header_v2)) { + xe_gt_err(gt, "invalid CPD header length %u!\n", header->header_length); + return -EINVAL; + } + + min_size = header->header_length + sizeof(struct gsc_cpd_entry) * header->num_of_entries; + if (size < min_size) { + xe_gt_err(gt, "FW too small! %zu < %zu\n", size, min_size); + return -ENODATA; + } + + /* Look for the manifest first */ + offset = entry_offset(header, manifest_entry); + if (!offset) { + xe_gt_err(gt, "Failed to find %s manifest!\n", + xe_uc_fw_type_repr(uc_fw->type)); + return -ENODATA; + } + + min_size = offset + sizeof(struct gsc_manifest_header); + if (size < min_size) { + xe_gt_err(gt, "FW too small! %zu < %zu\n", size, min_size); + return -ENODATA; + } + + manifest = data + offset; + + release->major = manifest->fw_version.major; + release->minor = manifest->fw_version.minor; + release->patch = manifest->fw_version.hotfix; + + if (uc_fw->type == XE_UC_FW_TYPE_GSC) { + struct xe_gsc *gsc = container_of(uc_fw, struct xe_gsc, fw); + + release->build = manifest->fw_version.build; + gsc->security_version = manifest->security_version; + } + + /* then optionally look for the css header */ + if (css_entry) { + int ret; + + /* + * This section does not contain a CSS entry on DG2. We + * don't support DG2 HuC right now, so no need to handle + * it, just add a reminder in case that changes. + */ + xe_assert(xe, xe->info.platform != XE_DG2); + + offset = entry_offset(header, css_entry); + + /* the CSS header parser will check that the CSS header fits */ + if (offset > size) { + xe_gt_err(gt, "FW too small! 
%zu < %u\n", size, offset); + return -ENODATA; + } + + ret = parse_css_header(uc_fw, data + offset, size - offset); + if (ret) + return ret; + + uc_fw->css_offset = offset; + } + + uc_fw->has_gsc_headers = true; + + return 0; +} + +static int parse_gsc_layout(struct xe_uc_fw *uc_fw, const void *data, size_t size) +{ + struct xe_gt *gt = uc_fw_to_gt(uc_fw); + const struct gsc_layout_pointers *layout = data; + const struct gsc_bpdt_header *bpdt_header = NULL; + const struct gsc_bpdt_entry *bpdt_entry = NULL; + size_t min_size = sizeof(*layout); + int i; + + if (size < min_size) { + xe_gt_err(gt, "GSC FW too small! %zu < %zu\n", size, min_size); + return -ENODATA; + } + + min_size = layout->boot1.offset + layout->boot1.size; + if (size < min_size) { + xe_gt_err(gt, "GSC FW too small for boot section! %zu < %zu\n", + size, min_size); + return -ENODATA; + } + + min_size = sizeof(*bpdt_header); + if (layout->boot1.size < min_size) { + xe_gt_err(gt, "GSC FW boot section too small for BPDT header: %u < %zu\n", + layout->boot1.size, min_size); + return -ENODATA; + } + + bpdt_header = data + layout->boot1.offset; + if (bpdt_header->signature != GSC_BPDT_HEADER_SIGNATURE) { + xe_gt_err(gt, "invalid signature for BPDT header: 0x%08x!\n", + bpdt_header->signature); + return -EINVAL; + } + + min_size += sizeof(*bpdt_entry) * bpdt_header->descriptor_count; + if (layout->boot1.size < min_size) { + xe_gt_err(gt, "GSC FW boot section too small for BPDT entries: %u < %zu\n", + layout->boot1.size, min_size); + return -ENODATA; + } + + bpdt_entry = (void *)bpdt_header + sizeof(*bpdt_header); + for (i = 0; i < bpdt_header->descriptor_count; i++, bpdt_entry++) { + if ((bpdt_entry->type & GSC_BPDT_ENTRY_TYPE_MASK) != + GSC_BPDT_ENTRY_TYPE_GSC_RBE) + continue; + + min_size = bpdt_entry->sub_partition_offset; + + /* the CPD header parser will check that the CPD header fits */ + if (layout->boot1.size < min_size) { + xe_gt_err(gt, "GSC FW boot section too small for CPD offset: %u < %zu\n", + layout->boot1.size, min_size); + return -ENODATA; + } + + return parse_cpd_header(uc_fw, + (void *)bpdt_header + min_size, + layout->boot1.size - min_size, + "RBEP.man", NULL); + } + + xe_gt_err(gt, "couldn't find CPD header in GSC binary!\n"); + return -ENODATA; +} + +static int parse_headers(struct xe_uc_fw *uc_fw, const struct firmware *fw) +{ + int ret; + + /* + * All GuC releases and older HuC ones use CSS headers, while newer HuC + * releases use GSC CPD headers. + */ + switch (uc_fw->type) { + case XE_UC_FW_TYPE_GSC: + return parse_gsc_layout(uc_fw, fw->data, fw->size); + case XE_UC_FW_TYPE_HUC: + ret = parse_cpd_header(uc_fw, fw->data, fw->size, "HUCP.man", "huc_fw"); + if (!ret || ret != -ENOENT) + return ret; + fallthrough; + case XE_UC_FW_TYPE_GUC: + return parse_css_header(uc_fw, fw->data, fw->size); + default: + return -EINVAL; + } + + return 0; +} + +#define print_uc_fw_version(p_, version_, prefix_, ...) 
\ +do { \ + struct xe_uc_fw_version *ver_ = (version_); \ + if (ver_->build) \ + drm_printf(p_, prefix_ " version %u.%u.%u.%u\n", ##__VA_ARGS__, \ + ver_->major, ver_->minor, \ + ver_->patch, ver_->build); \ + else \ + drm_printf(p_, prefix_ " version %u.%u.%u\n", ##__VA_ARGS__, \ + ver_->major, ver_->minor, ver_->patch); \ +} while (0) + +static int uc_fw_request(struct xe_uc_fw *uc_fw, const struct firmware **firmware_p) +{ + struct xe_device *xe = uc_fw_to_xe(uc_fw); + struct device *dev = xe->drm.dev; + struct drm_printer p = drm_info_printer(dev); + const struct firmware *fw = NULL; + int err; + + /* + * we use FIRMWARE_UNINITIALIZED to detect checks against uc_fw->status + * before we're looked at the HW caps to see if we have uc support + */ + BUILD_BUG_ON(XE_UC_FIRMWARE_UNINITIALIZED); + xe_assert(xe, !uc_fw->status); + xe_assert(xe, !uc_fw->path); + + uc_fw_auto_select(xe, uc_fw); + xe_uc_fw_change_status(uc_fw, uc_fw->path ? + XE_UC_FIRMWARE_SELECTED : + XE_UC_FIRMWARE_NOT_SUPPORTED); + + if (!xe_uc_fw_is_supported(uc_fw)) + return 0; + + uc_fw_override(uc_fw); + + /* an empty path means the firmware is disabled */ + if (!xe_device_uc_enabled(xe) || !(*uc_fw->path)) { + xe_uc_fw_change_status(uc_fw, XE_UC_FIRMWARE_DISABLED); + drm_dbg(&xe->drm, "%s disabled", xe_uc_fw_type_repr(uc_fw->type)); + return 0; + } + + err = request_firmware(&fw, uc_fw->path, dev); + if (err) + goto fail; + + err = parse_headers(uc_fw, fw); + if (err) + goto fail; + + print_uc_fw_version(&p, + &uc_fw->versions.found[XE_UC_FW_VER_RELEASE], + "Using %s firmware from %s", + xe_uc_fw_type_repr(uc_fw->type), uc_fw->path); + + /* for GSC FW we want the compatibility version, which we query after load */ + if (uc_fw->type != XE_UC_FW_TYPE_GSC) { + err = xe_uc_fw_check_version_requirements(uc_fw); + if (err) + goto fail; + } + + *firmware_p = fw; + + return 0; + +fail: + xe_uc_fw_change_status(uc_fw, err == -ENOENT ? 
+ XE_UC_FIRMWARE_MISSING : + XE_UC_FIRMWARE_ERROR); + + drm_notice(&xe->drm, "%s firmware %s: fetch failed with error %d\n", + xe_uc_fw_type_repr(uc_fw->type), uc_fw->path, err); + drm_info(&xe->drm, "%s firmware(s) can be downloaded from %s\n", + xe_uc_fw_type_repr(uc_fw->type), XE_UC_FIRMWARE_URL); + + release_firmware(fw); /* OK even if fw is NULL */ + + return err; +} + +static void uc_fw_release(const struct firmware *fw) +{ + release_firmware(fw); +} + +static int uc_fw_copy(struct xe_uc_fw *uc_fw, const void *data, size_t size, u32 flags) +{ + struct xe_device *xe = uc_fw_to_xe(uc_fw); + struct xe_gt *gt = uc_fw_to_gt(uc_fw); + struct xe_tile *tile = gt_to_tile(gt); + struct xe_bo *obj; + int err; + + obj = xe_managed_bo_create_from_data(xe, tile, data, size, flags); + if (IS_ERR(obj)) { + drm_notice(&xe->drm, "%s firmware %s: failed to create / populate bo", + xe_uc_fw_type_repr(uc_fw->type), uc_fw->path); + err = PTR_ERR(obj); + goto fail; + } + + uc_fw->bo = obj; + uc_fw->size = size; + + xe_uc_fw_change_status(uc_fw, XE_UC_FIRMWARE_AVAILABLE); + + err = drmm_add_action_or_reset(&xe->drm, uc_fw_fini, uc_fw); + if (err) + goto fail; + + return 0; + +fail: + xe_uc_fw_change_status(uc_fw, XE_UC_FIRMWARE_ERROR); + drm_notice(&xe->drm, "%s firmware %s: copy failed with error %d\n", + xe_uc_fw_type_repr(uc_fw->type), uc_fw->path, err); + + return err; +} + +int xe_uc_fw_init(struct xe_uc_fw *uc_fw) +{ + const struct firmware *fw = NULL; + int err; + + err = uc_fw_request(uc_fw, &fw); + if (err) + return err; + + /* no error and no firmware means nothing to copy */ + if (!fw) + return 0; + + err = uc_fw_copy(uc_fw, fw->data, fw->size, + XE_BO_CREATE_SYSTEM_BIT | XE_BO_CREATE_GGTT_BIT); + + uc_fw_release(fw); + + return err; +} + +static u32 uc_fw_ggtt_offset(struct xe_uc_fw *uc_fw) +{ + return xe_bo_ggtt_addr(uc_fw->bo); +} + +static int uc_fw_xfer(struct xe_uc_fw *uc_fw, u32 offset, u32 dma_flags) +{ + struct xe_device *xe = uc_fw_to_xe(uc_fw); + struct xe_gt *gt = uc_fw_to_gt(uc_fw); + u32 src_offset, dma_ctrl; + int ret; + + xe_force_wake_assert_held(gt_to_fw(gt), XE_FW_GT); + + /* Set the source address for the uCode */ + src_offset = uc_fw_ggtt_offset(uc_fw) + uc_fw->css_offset; + xe_mmio_write32(gt, DMA_ADDR_0_LOW, lower_32_bits(src_offset)); + xe_mmio_write32(gt, DMA_ADDR_0_HIGH, + upper_32_bits(src_offset) | DMA_ADDRESS_SPACE_GGTT); + + /* Set the DMA destination */ + xe_mmio_write32(gt, DMA_ADDR_1_LOW, offset); + xe_mmio_write32(gt, DMA_ADDR_1_HIGH, DMA_ADDRESS_SPACE_WOPCM); + + /* + * Set the transfer size. 
The header plus uCode will be copied to WOPCM + * via DMA, excluding any other components + */ + xe_mmio_write32(gt, DMA_COPY_SIZE, + sizeof(struct uc_css_header) + uc_fw->ucode_size); + + /* Start the DMA */ + xe_mmio_write32(gt, DMA_CTRL, + _MASKED_BIT_ENABLE(dma_flags | START_DMA)); + + /* Wait for DMA to finish */ + ret = xe_mmio_wait32(gt, DMA_CTRL, START_DMA, 0, 100000, &dma_ctrl, + false); + if (ret) + drm_err(&xe->drm, "DMA for %s fw failed, DMA_CTRL=%u\n", + xe_uc_fw_type_repr(uc_fw->type), dma_ctrl); + + /* Disable the bits once DMA is over */ + xe_mmio_write32(gt, DMA_CTRL, _MASKED_BIT_DISABLE(dma_flags)); + + return ret; +} + +int xe_uc_fw_upload(struct xe_uc_fw *uc_fw, u32 offset, u32 dma_flags) +{ + struct xe_device *xe = uc_fw_to_xe(uc_fw); + int err; + + /* make sure the status was cleared the last time we reset the uc */ + xe_assert(xe, !xe_uc_fw_is_loaded(uc_fw)); + + if (!xe_uc_fw_is_loadable(uc_fw)) + return -ENOEXEC; + + /* Call custom loader */ + err = uc_fw_xfer(uc_fw, offset, dma_flags); + if (err) + goto fail; + + xe_uc_fw_change_status(uc_fw, XE_UC_FIRMWARE_TRANSFERRED); + return 0; + +fail: + drm_err(&xe->drm, "Failed to load %s firmware %s (%d)\n", + xe_uc_fw_type_repr(uc_fw->type), uc_fw->path, + err); + xe_uc_fw_change_status(uc_fw, XE_UC_FIRMWARE_LOAD_FAIL); + return err; +} + +static const char *version_type_repr(enum xe_uc_fw_version_types type) +{ + switch (type) { + case XE_UC_FW_VER_RELEASE: + return "release"; + case XE_UC_FW_VER_COMPATIBILITY: + return "compatibility"; + default: + return "Unknown version type"; + } +} + +void xe_uc_fw_print(struct xe_uc_fw *uc_fw, struct drm_printer *p) +{ + int i; + + drm_printf(p, "%s firmware: %s\n", + xe_uc_fw_type_repr(uc_fw->type), uc_fw->path); + drm_printf(p, "\tstatus: %s\n", + xe_uc_fw_status_repr(uc_fw->status)); + + print_uc_fw_version(p, &uc_fw->versions.wanted, "\twanted %s", + version_type_repr(uc_fw->versions.wanted_type)); + + for (i = 0; i < XE_UC_FW_VER_TYPE_COUNT; i++) { + struct xe_uc_fw_version *ver = &uc_fw->versions.found[i]; + + if (ver->major) + print_uc_fw_version(p, ver, "\tfound %s", + version_type_repr(i)); + } + + if (uc_fw->ucode_size) + drm_printf(p, "\tuCode: %u bytes\n", uc_fw->ucode_size); + if (uc_fw->rsa_size) + drm_printf(p, "\tRSA: %u bytes\n", uc_fw->rsa_size); +} diff --git a/drivers/gpu/drm/xe/xe_uc_fw.h b/drivers/gpu/drm/xe/xe_uc_fw.h new file mode 100644 index 000000000000..85c20795d1f8 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_uc_fw.h @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_UC_FW_H_ +#define _XE_UC_FW_H_ + +#include <linux/errno.h> + +#include "xe_macros.h" +#include "xe_uc_fw_abi.h" +#include "xe_uc_fw_types.h" + +struct drm_printer; + +int xe_uc_fw_init(struct xe_uc_fw *uc_fw); +size_t xe_uc_fw_copy_rsa(struct xe_uc_fw *uc_fw, void *dst, u32 max_len); +int xe_uc_fw_upload(struct xe_uc_fw *uc_fw, u32 offset, u32 dma_flags); +int xe_uc_fw_check_version_requirements(struct xe_uc_fw *uc_fw); +void xe_uc_fw_print(struct xe_uc_fw *uc_fw, struct drm_printer *p); + +static inline u32 xe_uc_fw_rsa_offset(struct xe_uc_fw *uc_fw) +{ + return sizeof(struct uc_css_header) + uc_fw->ucode_size + uc_fw->css_offset; +} + +static inline void xe_uc_fw_change_status(struct xe_uc_fw *uc_fw, + enum xe_uc_fw_status status) +{ + uc_fw->__status = status; +} + +static inline +const char *xe_uc_fw_status_repr(enum xe_uc_fw_status status) +{ + switch (status) { + case XE_UC_FIRMWARE_NOT_SUPPORTED: + return "N/A"; + case 
XE_UC_FIRMWARE_UNINITIALIZED: + return "UNINITIALIZED"; + case XE_UC_FIRMWARE_DISABLED: + return "DISABLED"; + case XE_UC_FIRMWARE_SELECTED: + return "SELECTED"; + case XE_UC_FIRMWARE_MISSING: + return "MISSING"; + case XE_UC_FIRMWARE_ERROR: + return "ERROR"; + case XE_UC_FIRMWARE_AVAILABLE: + return "AVAILABLE"; + case XE_UC_FIRMWARE_INIT_FAIL: + return "INIT FAIL"; + case XE_UC_FIRMWARE_LOADABLE: + return "LOADABLE"; + case XE_UC_FIRMWARE_LOAD_FAIL: + return "LOAD FAIL"; + case XE_UC_FIRMWARE_TRANSFERRED: + return "TRANSFERRED"; + case XE_UC_FIRMWARE_RUNNING: + return "RUNNING"; + } + return "<invalid>"; +} + +static inline int xe_uc_fw_status_to_error(enum xe_uc_fw_status status) +{ + switch (status) { + case XE_UC_FIRMWARE_NOT_SUPPORTED: + return -ENODEV; + case XE_UC_FIRMWARE_UNINITIALIZED: + return -EACCES; + case XE_UC_FIRMWARE_DISABLED: + return -EPERM; + case XE_UC_FIRMWARE_MISSING: + return -ENOENT; + case XE_UC_FIRMWARE_ERROR: + return -ENOEXEC; + case XE_UC_FIRMWARE_INIT_FAIL: + case XE_UC_FIRMWARE_LOAD_FAIL: + return -EIO; + case XE_UC_FIRMWARE_SELECTED: + return -ESTALE; + case XE_UC_FIRMWARE_AVAILABLE: + case XE_UC_FIRMWARE_LOADABLE: + case XE_UC_FIRMWARE_TRANSFERRED: + case XE_UC_FIRMWARE_RUNNING: + return 0; + } + return -EINVAL; +} + +static inline const char *xe_uc_fw_type_repr(enum xe_uc_fw_type type) +{ + switch (type) { + case XE_UC_FW_TYPE_GUC: + return "GuC"; + case XE_UC_FW_TYPE_HUC: + return "HuC"; + case XE_UC_FW_TYPE_GSC: + return "GSC"; + default: + return "uC"; + } +} + +static inline enum xe_uc_fw_status +__xe_uc_fw_status(struct xe_uc_fw *uc_fw) +{ + /* shouldn't call this before checking hw/blob availability */ + XE_WARN_ON(uc_fw->status == XE_UC_FIRMWARE_UNINITIALIZED); + return uc_fw->status; +} + +static inline bool xe_uc_fw_is_supported(struct xe_uc_fw *uc_fw) +{ + return __xe_uc_fw_status(uc_fw) != XE_UC_FIRMWARE_NOT_SUPPORTED; +} + +static inline bool xe_uc_fw_is_enabled(struct xe_uc_fw *uc_fw) +{ + return __xe_uc_fw_status(uc_fw) > XE_UC_FIRMWARE_DISABLED; +} + +static inline bool xe_uc_fw_is_disabled(struct xe_uc_fw *uc_fw) +{ + return __xe_uc_fw_status(uc_fw) == XE_UC_FIRMWARE_DISABLED; +} + +static inline bool xe_uc_fw_is_available(struct xe_uc_fw *uc_fw) +{ + return __xe_uc_fw_status(uc_fw) >= XE_UC_FIRMWARE_AVAILABLE; +} + +static inline bool xe_uc_fw_is_loadable(struct xe_uc_fw *uc_fw) +{ + return __xe_uc_fw_status(uc_fw) >= XE_UC_FIRMWARE_LOADABLE; +} + +static inline bool xe_uc_fw_is_loaded(struct xe_uc_fw *uc_fw) +{ + return __xe_uc_fw_status(uc_fw) >= XE_UC_FIRMWARE_TRANSFERRED; +} + +static inline bool xe_uc_fw_is_running(struct xe_uc_fw *uc_fw) +{ + return __xe_uc_fw_status(uc_fw) == XE_UC_FIRMWARE_RUNNING; +} + +static inline bool xe_uc_fw_is_overridden(const struct xe_uc_fw *uc_fw) +{ + return uc_fw->user_overridden; +} + +static inline void xe_uc_fw_sanitize(struct xe_uc_fw *uc_fw) +{ + if (xe_uc_fw_is_loaded(uc_fw)) + xe_uc_fw_change_status(uc_fw, XE_UC_FIRMWARE_LOADABLE); +} + +static inline u32 __xe_uc_fw_get_upload_size(struct xe_uc_fw *uc_fw) +{ + return sizeof(struct uc_css_header) + uc_fw->ucode_size; +} + +/** + * xe_uc_fw_get_upload_size() - Get size of firmware needed to be uploaded. + * @uc_fw: uC firmware. + * + * Get the size of the firmware and header that will be uploaded to WOPCM. + * + * Return: Upload firmware size, or zero on firmware fetch failure. 
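+ * The size covers the CSS header plus the uCode payload, i.e. the same
+ * amount that uc_fw_xfer() programs into DMA_COPY_SIZE.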
+ */ +static inline u32 xe_uc_fw_get_upload_size(struct xe_uc_fw *uc_fw) +{ + if (!xe_uc_fw_is_available(uc_fw)) + return 0; + + return __xe_uc_fw_get_upload_size(uc_fw); +} + +#define XE_UC_FIRMWARE_URL "https://git.kernel.org/pub/scm/linux/kernel/git/firmware/linux-firmware.git" + +#endif diff --git a/drivers/gpu/drm/xe/xe_uc_fw_abi.h b/drivers/gpu/drm/xe/xe_uc_fw_abi.h new file mode 100644 index 000000000000..87ade41209d0 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_uc_fw_abi.h @@ -0,0 +1,321 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_UC_FW_ABI_H +#define _XE_UC_FW_ABI_H + +#include <linux/build_bug.h> +#include <linux/types.h> + +/** + * DOC: CSS-based Firmware Layout + * + * The CSS-based firmware structure is used for GuC releases on all platforms + * and for HuC releases up to DG1. Starting from DG2/MTL the HuC uses the GSC + * layout instead. + * The CSS firmware layout looks like this:: + * + * +======================================================================+ + * | Firmware blob | + * +===============+===============+============+============+============+ + * | CSS header | uCode | RSA key | modulus | exponent | + * +===============+===============+============+============+============+ + * <-header size-> <---header size continued -----------> + * <--- size -----------------------------------------------------------> + * <-key size-> + * <-mod size-> + * <-exp size-> + * + * The firmware may or may not have modulus key and exponent data. The header, + * uCode and RSA signature are must-have components that will be used by driver. + * Length of each components, which is all in dwords, can be found in header. + * In the case that modulus and exponent are not present in fw, a.k.a truncated + * image, the length value still appears in header. + * + * Driver will do some basic fw size validation based on the following rules: + * + * 1. Header, uCode and RSA are must-have components. + * 2. All firmware components, if they present, are in the sequence illustrated + * in the layout table above. + * 3. Length info of each component can be found in header, in dwords. + * 4. Modulus and exponent key are not required by driver. They may not appear + * in fw. So driver will load a truncated firmware in this case. + */ + +struct uc_css_header { + u32 module_type; + /* + * header_size includes all non-uCode bits, including css_header, rsa + * key, modulus key and exponent data. 
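+	 * parse_css_header() relies on this: subtracting key_size_dw,
+	 * modulus_size_dw and exponent_size_dw from header_size_dw must leave
+	 * exactly sizeof(struct uc_css_header), i.e. 128 bytes (32 dwords).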
+ */ + u32 header_size_dw; + u32 header_version; + u32 module_id; + u32 module_vendor; + u32 date; +#define CSS_DATE_DAY (0xFF << 0) +#define CSS_DATE_MONTH (0xFF << 8) +#define CSS_DATE_YEAR (0xFFFF << 16) + u32 size_dw; /* uCode plus header_size_dw */ + u32 key_size_dw; + u32 modulus_size_dw; + u32 exponent_size_dw; + u32 time; +#define CSS_TIME_HOUR (0xFF << 0) +#define CSS_DATE_MIN (0xFF << 8) +#define CSS_DATE_SEC (0xFFFF << 16) + char username[8]; + char buildnumber[12]; + u32 sw_version; +#define CSS_SW_VERSION_UC_MAJOR (0xFF << 16) +#define CSS_SW_VERSION_UC_MINOR (0xFF << 8) +#define CSS_SW_VERSION_UC_PATCH (0xFF << 0) + union { + u32 submission_version; /* only applies to GuC */ + u32 reserved2; + }; + u32 reserved0[12]; + union { + u32 private_data_size; /* only applies to GuC */ + u32 reserved1; + }; + u32 header_info; +} __packed; +static_assert(sizeof(struct uc_css_header) == 128); + +/** + * DOC: GSC-based Firmware Layout + * + * The GSC-based firmware structure is used for GSC releases on all platforms + * and for HuC releases starting from DG2/MTL. Older HuC releases use the + * CSS-based layout instead. Differently from the CSS headers, the GSC headers + * uses a directory + entries structure (i.e., there is array of addresses + * pointing to specific header extensions identified by a name). Although the + * header structures are the same, some of the entries are specific to GSC while + * others are specific to HuC. The manifest header entry, which includes basic + * information about the binary (like the version) is always present, but it is + * named differently based on the binary type. + * + * The HuC binary starts with a Code Partition Directory (CPD) header. The + * entries we're interested in for use in the driver are: + * + * 1. "HUCP.man": points to the manifest header for the HuC. + * 2. "huc_fw": points to the FW code. On platforms that support load via DMA + * and 2-step HuC authentication (i.e. MTL+) this is a full CSS-based binary, + * while if the GSC is the one doing the load (which only happens on DG2) + * this section only contains the uCode. + * + * The GSC-based HuC firmware layout looks like this:: + * + * +================================================+ + * | CPD Header | + * +================================================+ + * | CPD entries[] | + * | entry1 | + * | ... | + * | entryX | + * | "HUCP.man" | + * | ... | + * | offset >----------------------------|------o + * | ... | | + * | entryY | | + * | "huc_fw" | | + * | ... | | + * | offset >----------------------------|----------o + * +================================================+ | | + * | | + * +================================================+ | | + * | Manifest Header |<-----o | + * | ... | | + * | FW version | | + * | ... | | + * +================================================+ | + * | + * +================================================+ | + * | FW binary |<---------o + * | CSS (MTL+ only) | + * | uCode | + * | RSA Key (MTL+ only) | + * | ... | + * +================================================+ + * + * The GSC binary starts instead with a layout header, which contains the + * locations of the various partitions of the binary. The one we're interested + * in is the boot1 partition, where we can find a BPDT header followed by + * entries, one of which points to the RBE sub-section of the partition, which + * contains the CPD. The GSC blob does not contain a CSS-based binary, so we + * only need to look for the manifest, which is under the "RBEP.man" CPD entry. 
+ * Note that we have no need to find where the actual FW code is inside the + * image because the GSC ROM will itself parse the headers to find it and load + * it. + * The GSC firmware header layout looks like this:: + * + * +================================================+ + * | Layout Pointers | + * | ... | + * | Boot1 offset >---------------------------|------o + * | ... | | + * +================================================+ | + * | + * +================================================+ | + * | BPDT header |<-----o + * +================================================+ + * | BPDT entries[] | + * | entry1 | + * | ... | + * | entryX | + * | type == GSC_RBE | + * | offset >-----------------------------|------o + * | ... | | + * +================================================+ | + * | + * +================================================+ | + * | CPD Header |<-----o + * +================================================+ + * | CPD entries[] | + * | entry1 | + * | ... | + * | entryX | + * | "RBEP.man" | + * | ... | + * | offset >----------------------------|------o + * | ... | | + * +================================================+ | + * | + * +================================================+ | + * | Manifest Header |<-----o + * | ... | + * | FW version | + * | ... | + * | Security version | + * | ... | + * +================================================+ + */ + +struct gsc_version { + u16 major; + u16 minor; + u16 hotfix; + u16 build; +} __packed; + +struct gsc_partition { + u32 offset; + u32 size; +} __packed; + +struct gsc_layout_pointers { + u8 rom_bypass_vector[16]; + + /* size of this header section, not including ROM bypass vector */ + u16 size; + + /* + * bit0: Backup copy of layout pointers exists + * bits1-15: reserved + */ + u8 flags; + + u8 reserved; + + u32 crc32; + + struct gsc_partition datap; + struct gsc_partition boot1; + struct gsc_partition boot2; + struct gsc_partition boot3; + struct gsc_partition boot4; + struct gsc_partition boot5; + struct gsc_partition temp_pages; +} __packed; + +/* Boot partition structures */ +struct gsc_bpdt_header { + u32 signature; +#define GSC_BPDT_HEADER_SIGNATURE 0x000055AA + + u16 descriptor_count; /* num of entries after the header */ + + u8 version; + u8 configuration; + + u32 crc32; + + u32 build_version; + struct gsc_version tool_version; +} __packed; + +struct gsc_bpdt_entry { + /* + * Bits 0-15: BPDT entry type + * Bits 16-17: reserved + * Bit 18: code sub-partition + * Bits 19-31: reserved + */ + u32 type; +#define GSC_BPDT_ENTRY_TYPE_MASK GENMASK(15, 0) +#define GSC_BPDT_ENTRY_TYPE_GSC_RBE 0x1 + + u32 sub_partition_offset; /* from the base of the BPDT header */ + u32 sub_partition_size; +} __packed; + +/* Code partition directory (CPD) structures */ +struct gsc_cpd_header_v2 { + u32 header_marker; +#define GSC_CPD_HEADER_MARKER 0x44504324 + + u32 num_of_entries; + u8 header_version; + u8 entry_version; + u8 header_length; /* in bytes */ + u8 flags; + u32 partition_name; + u32 crc32; +} __packed; + +struct gsc_cpd_entry { + u8 name[12]; + + /* + * Bits 0-24: offset from the beginning of the code partition + * Bit 25: huffman compressed + * Bits 26-31: reserved + */ + u32 offset; +#define GSC_CPD_ENTRY_OFFSET_MASK GENMASK(24, 0) +#define GSC_CPD_ENTRY_HUFFMAN_COMP BIT(25) + + /* + * Module/Item length, in bytes. For Huffman-compressed modules, this + * refers to the uncompressed size. For software-compressed modules, + * this refers to the compressed size. 
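The CPD structures above are enough to locate a named directory entry such as "HUCP.man" or "RBEP.man". A hedged sketch of that lookup (illustrative only; the driver's real parser additionally range-checks every offset and length against the blob size):

```c
/*
 * Illustrative CPD walk: find a named entry and return its offset within the
 * code partition. Not the driver's actual parser.
 */
static int cpd_find_entry_offset(const void *cpd, size_t cpd_size,
				 const char *name, u32 *offset)
{
	const struct gsc_cpd_header_v2 *hdr = cpd;
	const struct gsc_cpd_entry *entry;
	u32 i;

	if (cpd_size < sizeof(*hdr) ||
	    hdr->header_marker != GSC_CPD_HEADER_MARKER)
		return -EINVAL;

	/* Entries start right after the header; header_length is in bytes. */
	entry = cpd + hdr->header_length;

	for (i = 0; i < hdr->num_of_entries; i++, entry++) {
		if (!strncmp((const char *)entry->name, name,
			     sizeof(entry->name))) {
			*offset = entry->offset & GSC_CPD_ENTRY_OFFSET_MASK;
			return 0;
		}
	}

	return -ENOENT;
}
```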
+ */ + u32 length; + + u8 reserved[4]; +} __packed; + +struct gsc_manifest_header { + u32 header_type; /* 0x4 for manifest type */ + u32 header_length; /* in dwords */ + u32 header_version; + u32 flags; + u32 vendor; + u32 date; + u32 size; /* In dwords, size of entire manifest (header + extensions) */ + u32 header_id; + u32 internal_data; + struct gsc_version fw_version; + u32 security_version; + struct gsc_version meu_kit_version; + u32 meu_manifest_version; + u8 general_data[4]; + u8 reserved3[56]; + u32 modulus_size; /* in dwords */ + u32 exponent_size; /* in dwords */ +} __packed; + +#endif diff --git a/drivers/gpu/drm/xe/xe_uc_fw_types.h b/drivers/gpu/drm/xe/xe_uc_fw_types.h new file mode 100644 index 000000000000..ee914a5d8523 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_uc_fw_types.h @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_UC_FW_TYPES_H_ +#define _XE_UC_FW_TYPES_H_ + +#include <linux/types.h> + +struct xe_bo; + +/* + * +------------+---------------------------------------------------+ + * | PHASE | FIRMWARE STATUS TRANSITIONS | + * +============+===================================================+ + * | | UNINITIALIZED | + * +------------+- / | \ -+ + * | | DISABLED <--/ | \--> NOT_SUPPORTED | + * | init_early | V | + * | | SELECTED | + * +------------+- / | \ -+ + * | | MISSING <--/ | \--> ERROR | + * | fetch | V | + * | | AVAILABLE | + * +------------+- | \ -+ + * | | | \--> INIT FAIL | + * | init | V | + * | | /------> LOADABLE <----<-----------\ | + * +------------+- \ / \ \ \ -+ + * | | LOAD FAIL <--< \--> TRANSFERRED \ | + * | upload | \ / \ / | + * | | \---------/ \--> RUNNING | + * +------------+---------------------------------------------------+ + */ + +/* + * FIXME: Ported from the i915 and this is state machine is way too complicated. + * Circle back and simplify this. 
+ */ +enum xe_uc_fw_status { + XE_UC_FIRMWARE_NOT_SUPPORTED = -1, /* no uc HW */ + XE_UC_FIRMWARE_UNINITIALIZED = 0, /* used to catch checks done too early */ + XE_UC_FIRMWARE_DISABLED, /* disabled */ + XE_UC_FIRMWARE_SELECTED, /* selected the blob we want to load */ + XE_UC_FIRMWARE_MISSING, /* blob not found on the system */ + XE_UC_FIRMWARE_ERROR, /* invalid format or version */ + XE_UC_FIRMWARE_AVAILABLE, /* blob found and copied in mem */ + XE_UC_FIRMWARE_INIT_FAIL, /* failed to prepare fw objects for load */ + XE_UC_FIRMWARE_LOADABLE, /* all fw-required objects are ready */ + XE_UC_FIRMWARE_LOAD_FAIL, /* failed to xfer or init/auth the fw */ + XE_UC_FIRMWARE_TRANSFERRED, /* dma xfer done */ + XE_UC_FIRMWARE_RUNNING /* init/auth done */ +}; + +enum xe_uc_fw_type { + XE_UC_FW_TYPE_GUC = 0, + XE_UC_FW_TYPE_HUC, + XE_UC_FW_TYPE_GSC, + XE_UC_FW_NUM_TYPES +}; + +/** + * struct xe_uc_fw_version - Version for XE micro controller firmware + */ +struct xe_uc_fw_version { + /** @major: major version of the FW */ + u16 major; + /** @minor: minor version of the FW */ + u16 minor; + /** @patch: patch version of the FW */ + u16 patch; + /** @build: build version of the FW (not always available) */ + u16 build; +}; + +enum xe_uc_fw_version_types { + XE_UC_FW_VER_RELEASE, + XE_UC_FW_VER_COMPATIBILITY, + XE_UC_FW_VER_TYPE_COUNT +}; + +/** + * struct xe_uc_fw - XE micro controller firmware + */ +struct xe_uc_fw { + /** @type: type of uC firmware */ + enum xe_uc_fw_type type; + union { + /** @status: firmware load status */ + const enum xe_uc_fw_status status; + /** + * @__status: private firmware load status - only to be used + * by firmware loading code + */ + enum xe_uc_fw_status __status; + }; + /** @path: path to uC firmware */ + const char *path; + /** @user_overridden: user provided path to uC firmware via modparam */ + bool user_overridden; + /** + * @full_ver_required: driver still under development and not ready + * for backward-compatible firmware. To be used only for **new** + * platforms, i.e. still under require_force_probe protection and not + * supported by i915. + */ + bool full_ver_required; + /** @size: size of uC firmware including css header */ + size_t size; + + /** @bo: XE BO for uC firmware */ + struct xe_bo *bo; + + /** @has_gsc_headers: whether the FW image starts with GSC headers */ + bool has_gsc_headers; + + /* + * The firmware build process will generate a version header file with + * major and minor version defined. The versions are built into the CSS + * header of the firmware. The xe kernel driver sets the minimal firmware + * version required per platform.
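The anonymous union above is what keeps this state machine writable only from the loading path: generic code reads the const @status alias, while transitions go through the non-const @__status, normally via the xe_uc_fw_change_status() helper that xe_uc_fw_sanitize() calls earlier in xe_uc_fw.h. A stripped-down sketch of what such a setter amounts to (the real helper may also log the transition; this is only to illustrate the const/__status split):

```c
/* Minimal sketch only: all status writes funnel through one helper. */
static inline void uc_fw_set_status(struct xe_uc_fw *uc_fw,
				    enum xe_uc_fw_status status)
{
	uc_fw->__status = status;
}
```

Keeping the writable alias double-underscored makes accidental writes from generic code stand out in review.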
+ */ + + /** @versions: FW versions wanted and found */ + struct { + /** @wanted: firmware version wanted by platform */ + struct xe_uc_fw_version wanted; + /** @wanted_type: type of firmware version wanted (release vs compatibility) */ + enum xe_uc_fw_version_types wanted_type; + /** @found: fw versions found in firmware blob */ + struct xe_uc_fw_version found[XE_UC_FW_VER_TYPE_COUNT]; + } versions; + + /** @rsa_size: RSA size */ + u32 rsa_size; + /** @ucode_size: micro kernel size */ + u32 ucode_size; + /** @css_offset: offset within the blob at which the CSS is located */ + u32 css_offset; + + /** @private_data_size: size of private data found in uC css header */ + u32 private_data_size; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_uc_types.h b/drivers/gpu/drm/xe/xe_uc_types.h new file mode 100644 index 000000000000..9924e4484866 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_uc_types.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_UC_TYPES_H_ +#define _XE_UC_TYPES_H_ + +#include "xe_gsc_types.h" +#include "xe_guc_types.h" +#include "xe_huc_types.h" +#include "xe_wopcm_types.h" + +/** + * struct xe_uc - XE micro controllers + */ +struct xe_uc { + /** @guc: Graphics micro controller */ + struct xe_guc guc; + /** @huc: HuC */ + struct xe_huc huc; + /** @gsc: Graphics Security Controller */ + struct xe_gsc gsc; + /** @wopcm: WOPCM */ + struct xe_wopcm wopcm; +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c new file mode 100644 index 000000000000..0cfe7289b97e --- /dev/null +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -0,0 +1,3209 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include "xe_vm.h" + +#include <linux/dma-fence-array.h> +#include <linux/nospec.h> + +#include <drm/drm_exec.h> +#include <drm/drm_print.h> +#include <drm/ttm/ttm_execbuf_util.h> +#include <drm/ttm/ttm_tt.h> +#include <drm/xe_drm.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/mm.h> +#include <linux/swap.h> + +#include "xe_assert.h" +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_drm_client.h" +#include "xe_exec_queue.h" +#include "xe_gt.h" +#include "xe_gt_pagefault.h" +#include "xe_gt_tlb_invalidation.h" +#include "xe_migrate.h" +#include "xe_pat.h" +#include "xe_pm.h" +#include "xe_preempt_fence.h" +#include "xe_pt.h" +#include "xe_res_cursor.h" +#include "xe_sync.h" +#include "xe_trace.h" +#include "generated/xe_wa_oob.h" +#include "xe_wa.h" + +#define TEST_VM_ASYNC_OPS_ERROR + +static struct drm_gem_object *xe_vm_obj(struct xe_vm *vm) +{ + return vm->gpuvm.r_obj; +} + +/** + * xe_vma_userptr_check_repin() - Advisory check for repin needed + * @vma: The userptr vma + * + * Check if the userptr vma has been invalidated since last successful + * repin. The check is advisory only and can the function can be called + * without the vm->userptr.notifier_lock held. There is no guarantee that the + * vma userptr will remain valid after a lockless check, so typically + * the call needs to be followed by a proper check under the notifier_lock. + * + * Return: 0 if userptr vma is valid, -EAGAIN otherwise; repin recommended. + */ +int xe_vma_userptr_check_repin(struct xe_vma *vma) +{ + return mmu_interval_check_retry(&vma->userptr.notifier, + vma->userptr.notifier_seq) ? 
+ -EAGAIN : 0; +} + +int xe_vma_userptr_pin_pages(struct xe_vma *vma) +{ + struct xe_vm *vm = xe_vma_vm(vma); + struct xe_device *xe = vm->xe; + const unsigned long num_pages = xe_vma_size(vma) >> PAGE_SHIFT; + struct page **pages; + bool in_kthread = !current->mm; + unsigned long notifier_seq; + int pinned, ret, i; + bool read_only = xe_vma_read_only(vma); + + lockdep_assert_held(&vm->lock); + xe_assert(xe, xe_vma_is_userptr(vma)); +retry: + if (vma->gpuva.flags & XE_VMA_DESTROYED) + return 0; + + notifier_seq = mmu_interval_read_begin(&vma->userptr.notifier); + if (notifier_seq == vma->userptr.notifier_seq) + return 0; + + pages = kvmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + if (vma->userptr.sg) { + dma_unmap_sgtable(xe->drm.dev, + vma->userptr.sg, + read_only ? DMA_TO_DEVICE : + DMA_BIDIRECTIONAL, 0); + sg_free_table(vma->userptr.sg); + vma->userptr.sg = NULL; + } + + pinned = ret = 0; + if (in_kthread) { + if (!mmget_not_zero(vma->userptr.notifier.mm)) { + ret = -EFAULT; + goto mm_closed; + } + kthread_use_mm(vma->userptr.notifier.mm); + } + + while (pinned < num_pages) { + ret = get_user_pages_fast(xe_vma_userptr(vma) + + pinned * PAGE_SIZE, + num_pages - pinned, + read_only ? 0 : FOLL_WRITE, + &pages[pinned]); + if (ret < 0) { + if (in_kthread) + ret = 0; + break; + } + + pinned += ret; + ret = 0; + } + + if (in_kthread) { + kthread_unuse_mm(vma->userptr.notifier.mm); + mmput(vma->userptr.notifier.mm); + } +mm_closed: + if (ret) + goto out; + + ret = sg_alloc_table_from_pages_segment(&vma->userptr.sgt, pages, + pinned, 0, + (u64)pinned << PAGE_SHIFT, + xe_sg_segment_size(xe->drm.dev), + GFP_KERNEL); + if (ret) { + vma->userptr.sg = NULL; + goto out; + } + vma->userptr.sg = &vma->userptr.sgt; + + ret = dma_map_sgtable(xe->drm.dev, vma->userptr.sg, + read_only ? DMA_TO_DEVICE : + DMA_BIDIRECTIONAL, + DMA_ATTR_SKIP_CPU_SYNC | + DMA_ATTR_NO_KERNEL_MAPPING); + if (ret) { + sg_free_table(vma->userptr.sg); + vma->userptr.sg = NULL; + goto out; + } + + for (i = 0; i < pinned; ++i) { + if (!read_only) { + lock_page(pages[i]); + set_page_dirty(pages[i]); + unlock_page(pages[i]); + } + + mark_page_accessed(pages[i]); + } + +out: + release_pages(pages, pinned); + kvfree(pages); + + if (!(ret < 0)) { + vma->userptr.notifier_seq = notifier_seq; + if (xe_vma_userptr_check_repin(vma) == -EAGAIN) + goto retry; + } + + return ret < 0 ? 
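Putting the two functions above together: the kerneldoc for xe_vma_userptr_check_repin() means the lockless check is advisory and may be done without holding vm->userptr.notifier_lock, but the authoritative recheck has to happen under that lock before any binding is published. A hedged, condensed sketch of that pattern (the wrapper name is illustrative; the real consumers, the exec path and the rebind worker below, follow the same shape):

```c
/*
 * Illustrative only: lockless advisory check, repin, then the authoritative
 * recheck under vm->userptr.notifier_lock.
 */
static int userptr_revalidate(struct xe_vm *vm, struct xe_vma *vma)
{
	int err = 0;

	if (xe_vma_userptr_check_repin(vma)) {
		err = xe_vma_userptr_pin_pages(vma);
		if (err)
			return err;
	}

	down_read(&vm->userptr.notifier_lock);
	if (mmu_interval_read_retry(&vma->userptr.notifier,
				    vma->userptr.notifier_seq))
		err = -EAGAIN;	/* invalidated again, caller should retry */
	up_read(&vm->userptr.notifier_lock);

	return err;
}
```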
ret : 0; +} + +static bool preempt_fences_waiting(struct xe_vm *vm) +{ + struct xe_exec_queue *q; + + lockdep_assert_held(&vm->lock); + xe_vm_assert_held(vm); + + list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) { + if (!q->compute.pfence || + (q->compute.pfence && test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, + &q->compute.pfence->flags))) { + return true; + } + } + + return false; +} + +static void free_preempt_fences(struct list_head *list) +{ + struct list_head *link, *next; + + list_for_each_safe(link, next, list) + xe_preempt_fence_free(to_preempt_fence_from_link(link)); +} + +static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list, + unsigned int *count) +{ + lockdep_assert_held(&vm->lock); + xe_vm_assert_held(vm); + + if (*count >= vm->preempt.num_exec_queues) + return 0; + + for (; *count < vm->preempt.num_exec_queues; ++(*count)) { + struct xe_preempt_fence *pfence = xe_preempt_fence_alloc(); + + if (IS_ERR(pfence)) + return PTR_ERR(pfence); + + list_move_tail(xe_preempt_fence_link(pfence), list); + } + + return 0; +} + +static int wait_for_existing_preempt_fences(struct xe_vm *vm) +{ + struct xe_exec_queue *q; + + xe_vm_assert_held(vm); + + list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) { + if (q->compute.pfence) { + long timeout = dma_fence_wait(q->compute.pfence, false); + + if (timeout < 0) + return -ETIME; + dma_fence_put(q->compute.pfence); + q->compute.pfence = NULL; + } + } + + return 0; +} + +static bool xe_vm_is_idle(struct xe_vm *vm) +{ + struct xe_exec_queue *q; + + xe_vm_assert_held(vm); + list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) { + if (!xe_exec_queue_is_idle(q)) + return false; + } + + return true; +} + +static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list) +{ + struct list_head *link; + struct xe_exec_queue *q; + + list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) { + struct dma_fence *fence; + + link = list->next; + xe_assert(vm->xe, link != list); + + fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link), + q, q->compute.context, + ++q->compute.seqno); + dma_fence_put(q->compute.pfence); + q->compute.pfence = fence; + } +} + +static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo) +{ + struct xe_exec_queue *q; + int err; + + if (!vm->preempt.num_exec_queues) + return 0; + + err = xe_bo_lock(bo, true); + if (err) + return err; + + err = dma_resv_reserve_fences(bo->ttm.base.resv, vm->preempt.num_exec_queues); + if (err) + goto out_unlock; + + list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) + if (q->compute.pfence) { + dma_resv_add_fence(bo->ttm.base.resv, + q->compute.pfence, + DMA_RESV_USAGE_BOOKKEEP); + } + +out_unlock: + xe_bo_unlock(bo); + return err; +} + +static void resume_and_reinstall_preempt_fences(struct xe_vm *vm, + struct drm_exec *exec) +{ + struct xe_exec_queue *q; + + lockdep_assert_held(&vm->lock); + xe_vm_assert_held(vm); + + list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) { + q->ops->resume(q); + + drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, q->compute.pfence, + DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP); + } +} + +int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q) +{ + struct drm_gpuvm_exec vm_exec = { + .vm = &vm->gpuvm, + .flags = DRM_EXEC_INTERRUPTIBLE_WAIT, + .num_fences = 1, + }; + struct drm_exec *exec = &vm_exec.exec; + struct dma_fence *pfence; + int err; + bool wait; + + xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm)); + + down_write(&vm->lock); + err = 
drm_gpuvm_exec_lock(&vm_exec); + if (err) + return err; + + pfence = xe_preempt_fence_create(q, q->compute.context, + ++q->compute.seqno); + if (!pfence) { + err = -ENOMEM; + goto out_unlock; + } + + list_add(&q->compute.link, &vm->preempt.exec_queues); + ++vm->preempt.num_exec_queues; + q->compute.pfence = pfence; + + down_read(&vm->userptr.notifier_lock); + + drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, pfence, + DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP); + + /* + * Check to see if a preemption on VM is in flight or userptr + * invalidation, if so trigger this preempt fence to sync state with + * other preempt fences on the VM. + */ + wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm); + if (wait) + dma_fence_enable_sw_signaling(pfence); + + up_read(&vm->userptr.notifier_lock); + +out_unlock: + drm_exec_fini(exec); + up_write(&vm->lock); + + return err; +} + +/** + * xe_vm_remove_compute_exec_queue() - Remove compute exec queue from VM + * @vm: The VM. + * @q: The exec_queue + */ +void xe_vm_remove_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q) +{ + if (!xe_vm_in_preempt_fence_mode(vm)) + return; + + down_write(&vm->lock); + list_del(&q->compute.link); + --vm->preempt.num_exec_queues; + if (q->compute.pfence) { + dma_fence_enable_sw_signaling(q->compute.pfence); + dma_fence_put(q->compute.pfence); + q->compute.pfence = NULL; + } + up_write(&vm->lock); +} + +/** + * __xe_vm_userptr_needs_repin() - Check whether the VM does have userptrs + * that need repinning. + * @vm: The VM. + * + * This function checks for whether the VM has userptrs that need repinning, + * and provides a release-type barrier on the userptr.notifier_lock after + * checking. + * + * Return: 0 if there are no userptrs needing repinning, -EAGAIN if there are. + */ +int __xe_vm_userptr_needs_repin(struct xe_vm *vm) +{ + lockdep_assert_held_read(&vm->userptr.notifier_lock); + + return (list_empty(&vm->userptr.repin_list) && + list_empty(&vm->userptr.invalidated)) ? 0 : -EAGAIN; +} + +#define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000 + +static void xe_vm_kill(struct xe_vm *vm) +{ + struct xe_exec_queue *q; + + lockdep_assert_held(&vm->lock); + + xe_vm_lock(vm, false); + vm->flags |= XE_VM_FLAG_BANNED; + trace_xe_vm_kill(vm); + + list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) + q->ops->kill(q); + xe_vm_unlock(vm); + + /* TODO: Inform user the VM is banned */ +} + +/** + * xe_vm_validate_should_retry() - Whether to retry after a validate error. + * @exec: The drm_exec object used for locking before validation. + * @err: The error returned from ttm_bo_validate(). + * @end: A ktime_t cookie that should be set to 0 before first use and + * that should be reused on subsequent calls. + * + * With multiple active VMs, under memory pressure, it is possible that + * ttm_bo_validate() run into -EDEADLK and in such case returns -ENOMEM. + * Until ttm properly handles locking in such scenarios, best thing the + * driver can do is retry with a timeout. Check if that is necessary, and + * if so unlock the drm_exec's objects while keeping the ticket to prepare + * for a rerun. + * + * Return: true if a retry after drm_exec_init() is recommended; + * false otherwise. + */ +bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end) +{ + ktime_t cur; + + if (err != -ENOMEM) + return false; + + cur = ktime_get(); + *end = *end ? 
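The kerneldoc above spells out the intended calling pattern for xe_vm_validate_should_retry(): re-run the whole drm_exec transaction while the helper keeps returning true, with a single ktime cookie bounding the total retry time. A condensed sketch of such a lock-validate-retry loop (assumes a BO-backed vma; only xe_vm_validate_should_retry(), xe_vm_prepare_vma(), xe_bo_validate() and the drm_exec helpers are real, the wrapper itself is illustrative; the rebind worker below uses the same shape):

```c
static int lock_and_validate_vma(struct xe_vm *vm, struct xe_vma *vma)
{
	struct drm_exec exec;
	ktime_t end = 0;
	int err;

retry:
	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
	drm_exec_until_all_locked(&exec) {
		err = xe_vm_prepare_vma(&exec, vma, 1);
		drm_exec_retry_on_contention(&exec);
		if (err)
			break;

		err = xe_bo_validate(xe_vma_bo(vma), vm, true);
		drm_exec_retry_on_contention(&exec);
	}
	drm_exec_fini(&exec);

	/* Same ktime cookie across attempts bounds the total retry time. */
	if (err && xe_vm_validate_should_retry(&exec, err, &end))
		goto retry;

	return err;
}
```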
: ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS); + if (!ktime_before(cur, *end)) + return false; + + msleep(20); + return true; +} + +static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec) +{ + struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm); + struct drm_gpuva *gpuva; + int ret; + + lockdep_assert_held(&vm->lock); + drm_gpuvm_bo_for_each_va(gpuva, vm_bo) + list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind, + &vm->rebind_list); + + ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false); + if (ret) + return ret; + + vm_bo->evicted = false; + return 0; +} + +static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm, + bool *done) +{ + int err; + + /* + * 1 fence for each preempt fence plus a fence for each tile from a + * possible rebind + */ + err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, vm->preempt.num_exec_queues + + vm->xe->info.tile_count); + if (err) + return err; + + if (xe_vm_is_idle(vm)) { + vm->preempt.rebind_deactivated = true; + *done = true; + return 0; + } + + if (!preempt_fences_waiting(vm)) { + *done = true; + return 0; + } + + err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, vm->preempt.num_exec_queues); + if (err) + return err; + + err = wait_for_existing_preempt_fences(vm); + if (err) + return err; + + return drm_gpuvm_validate(&vm->gpuvm, exec); +} + +static void preempt_rebind_work_func(struct work_struct *w) +{ + struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work); + struct drm_exec exec; + struct dma_fence *rebind_fence; + unsigned int fence_count = 0; + LIST_HEAD(preempt_fences); + ktime_t end = 0; + int err = 0; + long wait; + int __maybe_unused tries = 0; + + xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm)); + trace_xe_vm_rebind_worker_enter(vm); + + down_write(&vm->lock); + + if (xe_vm_is_closed_or_banned(vm)) { + up_write(&vm->lock); + trace_xe_vm_rebind_worker_exit(vm); + return; + } + +retry: + if (xe_vm_userptr_check_repin(vm)) { + err = xe_vm_userptr_pin(vm); + if (err) + goto out_unlock_outer; + } + + drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0); + + drm_exec_until_all_locked(&exec) { + bool done = false; + + err = xe_preempt_work_begin(&exec, vm, &done); + drm_exec_retry_on_contention(&exec); + if (err || done) { + drm_exec_fini(&exec); + if (err && xe_vm_validate_should_retry(&exec, err, &end)) + err = -EAGAIN; + + goto out_unlock_outer; + } + } + + err = alloc_preempt_fences(vm, &preempt_fences, &fence_count); + if (err) + goto out_unlock; + + rebind_fence = xe_vm_rebind(vm, true); + if (IS_ERR(rebind_fence)) { + err = PTR_ERR(rebind_fence); + goto out_unlock; + } + + if (rebind_fence) { + dma_fence_wait(rebind_fence, false); + dma_fence_put(rebind_fence); + } + + /* Wait on munmap style VM unbinds */ + wait = dma_resv_wait_timeout(xe_vm_resv(vm), + DMA_RESV_USAGE_KERNEL, + false, MAX_SCHEDULE_TIMEOUT); + if (wait <= 0) { + err = -ETIME; + goto out_unlock; + } + +#define retry_required(__tries, __vm) \ + (IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \ + (!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \ + __xe_vm_userptr_needs_repin(__vm)) + + down_read(&vm->userptr.notifier_lock); + if (retry_required(tries, vm)) { + up_read(&vm->userptr.notifier_lock); + err = -EAGAIN; + goto out_unlock; + } + +#undef retry_required + + spin_lock(&vm->xe->ttm.lru_lock); + ttm_lru_bulk_move_tail(&vm->lru_bulk_move); + spin_unlock(&vm->xe->ttm.lru_lock); + + /* Point of no return. 
*/ + arm_preempt_fences(vm, &preempt_fences); + resume_and_reinstall_preempt_fences(vm, &exec); + up_read(&vm->userptr.notifier_lock); + +out_unlock: + drm_exec_fini(&exec); +out_unlock_outer: + if (err == -EAGAIN) { + trace_xe_vm_rebind_worker_retry(vm); + goto retry; + } + + if (err) { + drm_warn(&vm->xe->drm, "VM worker error: %d\n", err); + xe_vm_kill(vm); + } + up_write(&vm->lock); + + free_preempt_fences(&preempt_fences); + + trace_xe_vm_rebind_worker_exit(vm); +} + +static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) +{ + struct xe_vma *vma = container_of(mni, struct xe_vma, userptr.notifier); + struct xe_vm *vm = xe_vma_vm(vma); + struct dma_resv_iter cursor; + struct dma_fence *fence; + long err; + + xe_assert(vm->xe, xe_vma_is_userptr(vma)); + trace_xe_vma_userptr_invalidate(vma); + + if (!mmu_notifier_range_blockable(range)) + return false; + + down_write(&vm->userptr.notifier_lock); + mmu_interval_set_seq(mni, cur_seq); + + /* No need to stop gpu access if the userptr is not yet bound. */ + if (!vma->userptr.initial_bind) { + up_write(&vm->userptr.notifier_lock); + return true; + } + + /* + * Tell exec and rebind worker they need to repin and rebind this + * userptr. + */ + if (!xe_vm_in_fault_mode(vm) && + !(vma->gpuva.flags & XE_VMA_DESTROYED) && vma->tile_present) { + spin_lock(&vm->userptr.invalidated_lock); + list_move_tail(&vma->userptr.invalidate_link, + &vm->userptr.invalidated); + spin_unlock(&vm->userptr.invalidated_lock); + } + + up_write(&vm->userptr.notifier_lock); + + /* + * Preempt fences turn into schedule disables, pipeline these. + * Note that even in fault mode, we need to wait for binds and + * unbinds to complete, and those are attached as BOOKMARK fences + * to the vm. + */ + dma_resv_iter_begin(&cursor, xe_vm_resv(vm), + DMA_RESV_USAGE_BOOKKEEP); + dma_resv_for_each_fence_unlocked(&cursor, fence) + dma_fence_enable_sw_signaling(fence); + dma_resv_iter_end(&cursor); + + err = dma_resv_wait_timeout(xe_vm_resv(vm), + DMA_RESV_USAGE_BOOKKEEP, + false, MAX_SCHEDULE_TIMEOUT); + XE_WARN_ON(err <= 0); + + if (xe_vm_in_fault_mode(vm)) { + err = xe_vm_invalidate_vma(vma); + XE_WARN_ON(err); + } + + trace_xe_vma_userptr_invalidate_complete(vma); + + return true; +} + +static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = { + .invalidate = vma_userptr_invalidate, +}; + +int xe_vm_userptr_pin(struct xe_vm *vm) +{ + struct xe_vma *vma, *next; + int err = 0; + LIST_HEAD(tmp_evict); + + lockdep_assert_held_write(&vm->lock); + + /* Collect invalidated userptrs */ + spin_lock(&vm->userptr.invalidated_lock); + list_for_each_entry_safe(vma, next, &vm->userptr.invalidated, + userptr.invalidate_link) { + list_del_init(&vma->userptr.invalidate_link); + list_move_tail(&vma->combined_links.userptr, + &vm->userptr.repin_list); + } + spin_unlock(&vm->userptr.invalidated_lock); + + /* Pin and move to temporary list */ + list_for_each_entry_safe(vma, next, &vm->userptr.repin_list, + combined_links.userptr) { + err = xe_vma_userptr_pin_pages(vma); + if (err < 0) + return err; + + list_move_tail(&vma->combined_links.userptr, &vm->rebind_list); + } + + return 0; +} + +/** + * xe_vm_userptr_check_repin() - Check whether the VM might have userptrs + * that need repinning. + * @vm: The VM. + * + * This function does an advisory check for whether the VM has userptrs that + * need repinning. 
+ * + * Return: 0 if there are no indications of userptrs needing repinning, + * -EAGAIN if there are. + */ +int xe_vm_userptr_check_repin(struct xe_vm *vm) +{ + return (list_empty_careful(&vm->userptr.repin_list) && + list_empty_careful(&vm->userptr.invalidated)) ? 0 : -EAGAIN; +} + +static struct dma_fence * +xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q, + struct xe_sync_entry *syncs, u32 num_syncs, + bool first_op, bool last_op); + +struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker) +{ + struct dma_fence *fence = NULL; + struct xe_vma *vma, *next; + + lockdep_assert_held(&vm->lock); + if (xe_vm_in_lr_mode(vm) && !rebind_worker) + return NULL; + + xe_vm_assert_held(vm); + list_for_each_entry_safe(vma, next, &vm->rebind_list, + combined_links.rebind) { + xe_assert(vm->xe, vma->tile_present); + + list_del_init(&vma->combined_links.rebind); + dma_fence_put(fence); + if (rebind_worker) + trace_xe_vma_rebind_worker(vma); + else + trace_xe_vma_rebind_exec(vma); + fence = xe_vm_bind_vma(vma, NULL, NULL, 0, false, false); + if (IS_ERR(fence)) + return fence; + } + + return fence; +} + +#define VMA_CREATE_FLAG_READ_ONLY BIT(0) +#define VMA_CREATE_FLAG_IS_NULL BIT(1) + +static struct xe_vma *xe_vma_create(struct xe_vm *vm, + struct xe_bo *bo, + u64 bo_offset_or_userptr, + u64 start, u64 end, + u16 pat_index, unsigned int flags) +{ + struct xe_vma *vma; + struct xe_tile *tile; + u8 id; + bool read_only = (flags & VMA_CREATE_FLAG_READ_ONLY); + bool is_null = (flags & VMA_CREATE_FLAG_IS_NULL); + + xe_assert(vm->xe, start < end); + xe_assert(vm->xe, end < vm->size); + + if (!bo && !is_null) /* userptr */ + vma = kzalloc(sizeof(*vma), GFP_KERNEL); + else + vma = kzalloc(sizeof(*vma) - sizeof(struct xe_userptr), + GFP_KERNEL); + if (!vma) { + vma = ERR_PTR(-ENOMEM); + return vma; + } + + INIT_LIST_HEAD(&vma->combined_links.rebind); + + INIT_LIST_HEAD(&vma->gpuva.gem.entry); + vma->gpuva.vm = &vm->gpuvm; + vma->gpuva.va.addr = start; + vma->gpuva.va.range = end - start + 1; + if (read_only) + vma->gpuva.flags |= XE_VMA_READ_ONLY; + if (is_null) + vma->gpuva.flags |= DRM_GPUVA_SPARSE; + + for_each_tile(tile, vm->xe, id) + vma->tile_mask |= 0x1 << id; + + if (GRAPHICS_VER(vm->xe) >= 20 || vm->xe->info.platform == XE_PVC) + vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT; + + vma->pat_index = pat_index; + + if (bo) { + struct drm_gpuvm_bo *vm_bo; + + xe_bo_assert_held(bo); + + vm_bo = drm_gpuvm_bo_obtain(vma->gpuva.vm, &bo->ttm.base); + if (IS_ERR(vm_bo)) { + kfree(vma); + return ERR_CAST(vm_bo); + } + + drm_gpuvm_bo_extobj_add(vm_bo); + drm_gem_object_get(&bo->ttm.base); + vma->gpuva.gem.obj = &bo->ttm.base; + vma->gpuva.gem.offset = bo_offset_or_userptr; + drm_gpuva_link(&vma->gpuva, vm_bo); + drm_gpuvm_bo_put(vm_bo); + } else /* userptr or null */ { + if (!is_null) { + u64 size = end - start + 1; + int err; + + INIT_LIST_HEAD(&vma->userptr.invalidate_link); + vma->gpuva.gem.offset = bo_offset_or_userptr; + + err = mmu_interval_notifier_insert(&vma->userptr.notifier, + current->mm, + xe_vma_userptr(vma), size, + &vma_userptr_notifier_ops); + if (err) { + kfree(vma); + vma = ERR_PTR(err); + return vma; + } + + vma->userptr.notifier_seq = LONG_MAX; + } + + xe_vm_get(vm); + } + + return vma; +} + +static void xe_vma_destroy_late(struct xe_vma *vma) +{ + struct xe_vm *vm = xe_vma_vm(vma); + struct xe_device *xe = vm->xe; + bool read_only = xe_vma_read_only(vma); + + if (xe_vma_is_userptr(vma)) { + if (vma->userptr.sg) { + dma_unmap_sgtable(xe->drm.dev, + vma->userptr.sg, + read_only 
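xe_vma_create() above distinguishes three kinds of VMA: BO-backed, userptr (no BO and not sparse) and NULL/sparse mappings (DRM_GPUVA_SPARSE, no backing at all). The predicates used throughout this file to tell them apart are declared in xe_vm.h; stated purely as an assumption inferred from how the flags and @gpuva.gem.obj are set during creation, they boil down to something like:

```c
/* Assumed shape of the xe_vm.h helpers, inferred from xe_vma_create() above. */
static inline bool xe_vma_has_no_bo(struct xe_vma *vma)
{
	return !xe_vma_bo(vma);
}

static inline bool xe_vma_is_null(struct xe_vma *vma)
{
	return vma->gpuva.flags & DRM_GPUVA_SPARSE;
}

static inline bool xe_vma_is_userptr(struct xe_vma *vma)
{
	return xe_vma_has_no_bo(vma) && !xe_vma_is_null(vma);
}
```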
? DMA_TO_DEVICE : + DMA_BIDIRECTIONAL, 0); + sg_free_table(vma->userptr.sg); + vma->userptr.sg = NULL; + } + + /* + * Since userptr pages are not pinned, we can't remove + * the notifer until we're sure the GPU is not accessing + * them anymore + */ + mmu_interval_notifier_remove(&vma->userptr.notifier); + xe_vm_put(vm); + } else if (xe_vma_is_null(vma)) { + xe_vm_put(vm); + } else { + xe_bo_put(xe_vma_bo(vma)); + } + + kfree(vma); +} + +static void vma_destroy_work_func(struct work_struct *w) +{ + struct xe_vma *vma = + container_of(w, struct xe_vma, destroy_work); + + xe_vma_destroy_late(vma); +} + +static void vma_destroy_cb(struct dma_fence *fence, + struct dma_fence_cb *cb) +{ + struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb); + + INIT_WORK(&vma->destroy_work, vma_destroy_work_func); + queue_work(system_unbound_wq, &vma->destroy_work); +} + +static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence) +{ + struct xe_vm *vm = xe_vma_vm(vma); + + lockdep_assert_held_write(&vm->lock); + xe_assert(vm->xe, list_empty(&vma->combined_links.destroy)); + + if (xe_vma_is_userptr(vma)) { + xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED); + + spin_lock(&vm->userptr.invalidated_lock); + list_del(&vma->userptr.invalidate_link); + spin_unlock(&vm->userptr.invalidated_lock); + } else if (!xe_vma_is_null(vma)) { + xe_bo_assert_held(xe_vma_bo(vma)); + + drm_gpuva_unlink(&vma->gpuva); + } + + xe_vm_assert_held(vm); + if (fence) { + int ret = dma_fence_add_callback(fence, &vma->destroy_cb, + vma_destroy_cb); + + if (ret) { + XE_WARN_ON(ret != -ENOENT); + xe_vma_destroy_late(vma); + } + } else { + xe_vma_destroy_late(vma); + } +} + +/** + * xe_vm_prepare_vma() - drm_exec utility to lock a vma + * @exec: The drm_exec object we're currently locking for. + * @vma: The vma for witch we want to lock the vm resv and any attached + * object's resv. + * @num_shared: The number of dma-fence slots to pre-allocate in the + * objects' reservation objects. + * + * Return: 0 on success, negative error code on error. In particular + * may return -EDEADLK on WW transaction contention and -EINTR if + * an interruptible wait is terminated by a signal. + */ +int xe_vm_prepare_vma(struct drm_exec *exec, struct xe_vma *vma, + unsigned int num_shared) +{ + struct xe_vm *vm = xe_vma_vm(vma); + struct xe_bo *bo = xe_vma_bo(vma); + int err; + + XE_WARN_ON(!vm); + err = drm_exec_prepare_obj(exec, xe_vm_obj(vm), num_shared); + if (!err && bo && !bo->vm) + err = drm_exec_prepare_obj(exec, &bo->ttm.base, num_shared); + + return err; +} + +static void xe_vma_destroy_unlocked(struct xe_vma *vma) +{ + struct drm_exec exec; + int err; + + drm_exec_init(&exec, 0, 0); + drm_exec_until_all_locked(&exec) { + err = xe_vm_prepare_vma(&exec, vma, 0); + drm_exec_retry_on_contention(&exec); + if (XE_WARN_ON(err)) + break; + } + + xe_vma_destroy(vma, NULL); + + drm_exec_fini(&exec); +} + +struct xe_vma * +xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range) +{ + struct drm_gpuva *gpuva; + + lockdep_assert_held(&vm->lock); + + if (xe_vm_is_closed_or_banned(vm)) + return NULL; + + xe_assert(vm->xe, start + range <= vm->size); + + gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range); + + return gpuva ? 
gpuva_to_vma(gpuva) : NULL; +} + +static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma) +{ + int err; + + xe_assert(vm->xe, xe_vma_vm(vma) == vm); + lockdep_assert_held(&vm->lock); + + err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva); + XE_WARN_ON(err); /* Shouldn't be possible */ + + return err; +} + +static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma) +{ + xe_assert(vm->xe, xe_vma_vm(vma) == vm); + lockdep_assert_held(&vm->lock); + + drm_gpuva_remove(&vma->gpuva); + if (vm->usm.last_fault_vma == vma) + vm->usm.last_fault_vma = NULL; +} + +static struct drm_gpuva_op *xe_vm_op_alloc(void) +{ + struct xe_vma_op *op; + + op = kzalloc(sizeof(*op), GFP_KERNEL); + + if (unlikely(!op)) + return NULL; + + return &op->base; +} + +static void xe_vm_free(struct drm_gpuvm *gpuvm); + +static struct drm_gpuvm_ops gpuvm_ops = { + .op_alloc = xe_vm_op_alloc, + .vm_bo_validate = xe_gpuvm_validate, + .vm_free = xe_vm_free, +}; + +static u64 pde_encode_pat_index(struct xe_device *xe, u16 pat_index) +{ + u64 pte = 0; + + if (pat_index & BIT(0)) + pte |= XE_PPGTT_PTE_PAT0; + + if (pat_index & BIT(1)) + pte |= XE_PPGTT_PTE_PAT1; + + return pte; +} + +static u64 pte_encode_pat_index(struct xe_device *xe, u16 pat_index, + u32 pt_level) +{ + u64 pte = 0; + + if (pat_index & BIT(0)) + pte |= XE_PPGTT_PTE_PAT0; + + if (pat_index & BIT(1)) + pte |= XE_PPGTT_PTE_PAT1; + + if (pat_index & BIT(2)) { + if (pt_level) + pte |= XE_PPGTT_PDE_PDPE_PAT2; + else + pte |= XE_PPGTT_PTE_PAT2; + } + + if (pat_index & BIT(3)) + pte |= XELPG_PPGTT_PTE_PAT3; + + if (pat_index & (BIT(4))) + pte |= XE2_PPGTT_PTE_PAT4; + + return pte; +} + +static u64 pte_encode_ps(u32 pt_level) +{ + XE_WARN_ON(pt_level > MAX_HUGEPTE_LEVEL); + + if (pt_level == 1) + return XE_PDE_PS_2M; + else if (pt_level == 2) + return XE_PDPE_PS_1G; + + return 0; +} + +static u64 xelp_pde_encode_bo(struct xe_bo *bo, u64 bo_offset, + const u16 pat_index) +{ + struct xe_device *xe = xe_bo_device(bo); + u64 pde; + + pde = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE); + pde |= XE_PAGE_PRESENT | XE_PAGE_RW; + pde |= pde_encode_pat_index(xe, pat_index); + + return pde; +} + +static u64 xelp_pte_encode_bo(struct xe_bo *bo, u64 bo_offset, + u16 pat_index, u32 pt_level) +{ + struct xe_device *xe = xe_bo_device(bo); + u64 pte; + + pte = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE); + pte |= XE_PAGE_PRESENT | XE_PAGE_RW; + pte |= pte_encode_pat_index(xe, pat_index, pt_level); + pte |= pte_encode_ps(pt_level); + + if (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo)) + pte |= XE_PPGTT_PTE_DM; + + return pte; +} + +static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma, + u16 pat_index, u32 pt_level) +{ + struct xe_device *xe = xe_vma_vm(vma)->xe; + + pte |= XE_PAGE_PRESENT; + + if (likely(!xe_vma_read_only(vma))) + pte |= XE_PAGE_RW; + + pte |= pte_encode_pat_index(xe, pat_index, pt_level); + pte |= pte_encode_ps(pt_level); + + if (unlikely(xe_vma_is_null(vma))) + pte |= XE_PTE_NULL; + + return pte; +} + +static u64 xelp_pte_encode_addr(struct xe_device *xe, u64 addr, + u16 pat_index, + u32 pt_level, bool devmem, u64 flags) +{ + u64 pte; + + /* Avoid passing random bits directly as flags */ + xe_assert(xe, !(flags & ~XE_PTE_PS64)); + + pte = addr; + pte |= XE_PAGE_PRESENT | XE_PAGE_RW; + pte |= pte_encode_pat_index(xe, pat_index, pt_level); + pte |= pte_encode_ps(pt_level); + + if (devmem) + pte |= XE_PPGTT_PTE_DM; + + pte |= flags; + + return pte; +} + +static const struct xe_pt_ops xelp_pt_ops = { + .pte_encode_bo = xelp_pte_encode_bo, + .pte_encode_vma = 
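To make the bit-packing above concrete, here is an illustrative (not in-tree) helper that builds a leaf PTE for a 2 MiB device-memory page with pat_index 3: the result is the address ORed with XE_PAGE_PRESENT | XE_PAGE_RW, the PAT0/PAT1 bits selected by the index, XE_PDE_PS_2M for page-table level 1 and XE_PPGTT_PTE_DM for device memory. The address and pat_index are made-up values:

```c
static u64 example_2m_devmem_pte(struct xe_device *xe)
{
	const u64 addr = 0x200000;	/* arbitrary, 2 MiB aligned */
	const u16 pat_index = 3;	/* BIT(0) | BIT(1) -> PAT0 | PAT1 */

	/* pt_level 1 selects XE_PDE_PS_2M, devmem adds XE_PPGTT_PTE_DM */
	return xelp_pte_encode_addr(xe, addr, pat_index, 1, true, 0);
}
```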
xelp_pte_encode_vma, + .pte_encode_addr = xelp_pte_encode_addr, + .pde_encode_bo = xelp_pde_encode_bo, +}; + +static void vm_destroy_work_func(struct work_struct *w); + +/** + * xe_vm_create_scratch() - Setup a scratch memory pagetable tree for the + * given tile and vm. + * @xe: xe device. + * @tile: tile to set up for. + * @vm: vm to set up for. + * + * Sets up a pagetable tree with one page-table per level and a single + * leaf PTE. All pagetable entries point to the single page-table or, + * for MAX_HUGEPTE_LEVEL, a NULL huge PTE returning 0 on read and + * writes become NOPs. + * + * Return: 0 on success, negative error code on error. + */ +static int xe_vm_create_scratch(struct xe_device *xe, struct xe_tile *tile, + struct xe_vm *vm) +{ + u8 id = tile->id; + int i; + + for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; i++) { + vm->scratch_pt[id][i] = xe_pt_create(vm, tile, i); + if (IS_ERR(vm->scratch_pt[id][i])) + return PTR_ERR(vm->scratch_pt[id][i]); + + xe_pt_populate_empty(tile, vm, vm->scratch_pt[id][i]); + } + + return 0; +} + +static void xe_vm_free_scratch(struct xe_vm *vm) +{ + struct xe_tile *tile; + u8 id; + + if (!xe_vm_has_scratch(vm)) + return; + + for_each_tile(tile, vm->xe, id) { + u32 i; + + if (!vm->pt_root[id]) + continue; + + for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; ++i) + if (vm->scratch_pt[id][i]) + xe_pt_destroy(vm->scratch_pt[id][i], vm->flags, NULL); + } +} + +struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags) +{ + struct drm_gem_object *vm_resv_obj; + struct xe_vm *vm; + int err, number_tiles = 0; + struct xe_tile *tile; + u8 id; + + vm = kzalloc(sizeof(*vm), GFP_KERNEL); + if (!vm) + return ERR_PTR(-ENOMEM); + + vm->xe = xe; + + vm->size = 1ull << xe->info.va_bits; + + vm->flags = flags; + + init_rwsem(&vm->lock); + + INIT_LIST_HEAD(&vm->rebind_list); + + INIT_LIST_HEAD(&vm->userptr.repin_list); + INIT_LIST_HEAD(&vm->userptr.invalidated); + init_rwsem(&vm->userptr.notifier_lock); + spin_lock_init(&vm->userptr.invalidated_lock); + + INIT_WORK(&vm->destroy_work, vm_destroy_work_func); + + INIT_LIST_HEAD(&vm->preempt.exec_queues); + vm->preempt.min_run_period_ms = 10; /* FIXME: Wire up to uAPI */ + + for_each_tile(tile, xe, id) + xe_range_fence_tree_init(&vm->rftree[id]); + + vm->pt_ops = &xelp_pt_ops; + + if (!(flags & XE_VM_FLAG_MIGRATION)) + xe_device_mem_access_get(xe); + + vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm); + if (!vm_resv_obj) { + err = -ENOMEM; + goto err_no_resv; + } + + drm_gpuvm_init(&vm->gpuvm, "Xe VM", DRM_GPUVM_RESV_PROTECTED, &xe->drm, + vm_resv_obj, 0, vm->size, 0, 0, &gpuvm_ops); + + drm_gem_object_put(vm_resv_obj); + + err = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL); + if (err) + goto err_close; + + if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K) + vm->flags |= XE_VM_FLAG_64K; + + for_each_tile(tile, xe, id) { + if (flags & XE_VM_FLAG_MIGRATION && + tile->id != XE_VM_FLAG_TILE_ID(flags)) + continue; + + vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level); + if (IS_ERR(vm->pt_root[id])) { + err = PTR_ERR(vm->pt_root[id]); + vm->pt_root[id] = NULL; + goto err_unlock_close; + } + } + + if (xe_vm_has_scratch(vm)) { + for_each_tile(tile, xe, id) { + if (!vm->pt_root[id]) + continue; + + err = xe_vm_create_scratch(xe, tile, vm); + if (err) + goto err_unlock_close; + } + vm->batch_invalidate_tlb = true; + } + + if (flags & XE_VM_FLAG_LR_MODE) { + INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func); + vm->flags |= XE_VM_FLAG_LR_MODE; + 
vm->batch_invalidate_tlb = false; + } + + /* Fill pt_root after allocating scratch tables */ + for_each_tile(tile, xe, id) { + if (!vm->pt_root[id]) + continue; + + xe_pt_populate_empty(tile, vm, vm->pt_root[id]); + } + dma_resv_unlock(xe_vm_resv(vm)); + + /* Kernel migration VM shouldn't have a circular loop.. */ + if (!(flags & XE_VM_FLAG_MIGRATION)) { + for_each_tile(tile, xe, id) { + struct xe_gt *gt = tile->primary_gt; + struct xe_vm *migrate_vm; + struct xe_exec_queue *q; + u32 create_flags = EXEC_QUEUE_FLAG_VM; + + if (!vm->pt_root[id]) + continue; + + migrate_vm = xe_migrate_get_vm(tile->migrate); + q = xe_exec_queue_create_class(xe, gt, migrate_vm, + XE_ENGINE_CLASS_COPY, + create_flags); + xe_vm_put(migrate_vm); + if (IS_ERR(q)) { + err = PTR_ERR(q); + goto err_close; + } + vm->q[id] = q; + number_tiles++; + } + } + + if (number_tiles > 1) + vm->composite_fence_ctx = dma_fence_context_alloc(1); + + mutex_lock(&xe->usm.lock); + if (flags & XE_VM_FLAG_FAULT_MODE) + xe->usm.num_vm_in_fault_mode++; + else if (!(flags & XE_VM_FLAG_MIGRATION)) + xe->usm.num_vm_in_non_fault_mode++; + mutex_unlock(&xe->usm.lock); + + trace_xe_vm_create(vm); + + return vm; + +err_unlock_close: + dma_resv_unlock(xe_vm_resv(vm)); +err_close: + xe_vm_close_and_put(vm); + return ERR_PTR(err); + +err_no_resv: + for_each_tile(tile, xe, id) + xe_range_fence_tree_fini(&vm->rftree[id]); + kfree(vm); + if (!(flags & XE_VM_FLAG_MIGRATION)) + xe_device_mem_access_put(xe); + return ERR_PTR(err); +} + +static void xe_vm_close(struct xe_vm *vm) +{ + down_write(&vm->lock); + vm->size = 0; + up_write(&vm->lock); +} + +void xe_vm_close_and_put(struct xe_vm *vm) +{ + LIST_HEAD(contested); + struct xe_device *xe = vm->xe; + struct xe_tile *tile; + struct xe_vma *vma, *next_vma; + struct drm_gpuva *gpuva, *next; + u8 id; + + xe_assert(xe, !vm->preempt.num_exec_queues); + + xe_vm_close(vm); + if (xe_vm_in_preempt_fence_mode(vm)) + flush_work(&vm->preempt.rebind_work); + + down_write(&vm->lock); + for_each_tile(tile, xe, id) { + if (vm->q[id]) + xe_exec_queue_last_fence_put(vm->q[id], vm); + } + up_write(&vm->lock); + + for_each_tile(tile, xe, id) { + if (vm->q[id]) { + xe_exec_queue_kill(vm->q[id]); + xe_exec_queue_put(vm->q[id]); + vm->q[id] = NULL; + } + } + + down_write(&vm->lock); + xe_vm_lock(vm, false); + drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) { + vma = gpuva_to_vma(gpuva); + + if (xe_vma_has_no_bo(vma)) { + down_read(&vm->userptr.notifier_lock); + vma->gpuva.flags |= XE_VMA_DESTROYED; + up_read(&vm->userptr.notifier_lock); + } + + xe_vm_remove_vma(vm, vma); + + /* easy case, remove from VMA? */ + if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) { + list_del_init(&vma->combined_links.rebind); + xe_vma_destroy(vma, NULL); + continue; + } + + list_move_tail(&vma->combined_links.destroy, &contested); + vma->gpuva.flags |= XE_VMA_DESTROYED; + } + + /* + * All vm operations will add shared fences to resv. + * The only exception is eviction for a shared object, + * but even so, the unbind when evicted would still + * install a fence to resv. Hence it's safe to + * destroy the pagetables immediately. + */ + xe_vm_free_scratch(vm); + + for_each_tile(tile, xe, id) { + if (vm->pt_root[id]) { + xe_pt_destroy(vm->pt_root[id], vm->flags, NULL); + vm->pt_root[id] = NULL; + } + } + xe_vm_unlock(vm); + + /* + * VM is now dead, cannot re-add nodes to vm->vmas if it's NULL + * Since we hold a refcount to the bo, we can remove and free + * the members safely without locking. 
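For kernel-internal users the contract of xe_vm_create() and xe_vm_close_and_put() is simple: creation returns an ERR_PTR() on failure, and every successfully created VM must eventually be torn down with xe_vm_close_and_put(). A hedged lifecycle sketch (the flag choice here is arbitrary; real callers pick flags to match their use case):

```c
static int example_vm_lifecycle(struct xe_device *xe)
{
	struct xe_vm *vm;

	vm = xe_vm_create(xe, XE_VM_FLAG_SCRATCH_PAGE | XE_VM_FLAG_LR_MODE);
	if (IS_ERR(vm))
		return PTR_ERR(vm);

	/* ... bind VMAs and run work against vm ... */

	xe_vm_close_and_put(vm);
	return 0;
}
```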
+ */ + list_for_each_entry_safe(vma, next_vma, &contested, + combined_links.destroy) { + list_del_init(&vma->combined_links.destroy); + xe_vma_destroy_unlocked(vma); + } + + up_write(&vm->lock); + + mutex_lock(&xe->usm.lock); + if (vm->flags & XE_VM_FLAG_FAULT_MODE) + xe->usm.num_vm_in_fault_mode--; + else if (!(vm->flags & XE_VM_FLAG_MIGRATION)) + xe->usm.num_vm_in_non_fault_mode--; + mutex_unlock(&xe->usm.lock); + + for_each_tile(tile, xe, id) + xe_range_fence_tree_fini(&vm->rftree[id]); + + xe_vm_put(vm); +} + +static void vm_destroy_work_func(struct work_struct *w) +{ + struct xe_vm *vm = + container_of(w, struct xe_vm, destroy_work); + struct xe_device *xe = vm->xe; + struct xe_tile *tile; + u8 id; + void *lookup; + + /* xe_vm_close_and_put was not called? */ + xe_assert(xe, !vm->size); + + if (!(vm->flags & XE_VM_FLAG_MIGRATION)) { + xe_device_mem_access_put(xe); + + if (xe->info.has_asid && vm->usm.asid) { + mutex_lock(&xe->usm.lock); + lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid); + xe_assert(xe, lookup == vm); + mutex_unlock(&xe->usm.lock); + } + } + + for_each_tile(tile, xe, id) + XE_WARN_ON(vm->pt_root[id]); + + trace_xe_vm_free(vm); + dma_fence_put(vm->rebind_fence); + kfree(vm); +} + +static void xe_vm_free(struct drm_gpuvm *gpuvm) +{ + struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm); + + /* To destroy the VM we need to be able to sleep */ + queue_work(system_unbound_wq, &vm->destroy_work); +} + +struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id) +{ + struct xe_vm *vm; + + mutex_lock(&xef->vm.lock); + vm = xa_load(&xef->vm.xa, id); + if (vm) + xe_vm_get(vm); + mutex_unlock(&xef->vm.lock); + + return vm; +} + +u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile) +{ + return vm->pt_ops->pde_encode_bo(vm->pt_root[tile->id]->bo, 0, + tile_to_xe(tile)->pat.idx[XE_CACHE_WB]); +} + +static struct xe_exec_queue * +to_wait_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q) +{ + return q ? q : vm->q[0]; +} + +static struct dma_fence * +xe_vm_unbind_vma(struct xe_vma *vma, struct xe_exec_queue *q, + struct xe_sync_entry *syncs, u32 num_syncs, + bool first_op, bool last_op) +{ + struct xe_vm *vm = xe_vma_vm(vma); + struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q); + struct xe_tile *tile; + struct dma_fence *fence = NULL; + struct dma_fence **fences = NULL; + struct dma_fence_array *cf = NULL; + int cur_fence = 0, i; + int number_tiles = hweight8(vma->tile_present); + int err; + u8 id; + + trace_xe_vma_unbind(vma); + + if (number_tiles > 1) { + fences = kmalloc_array(number_tiles, sizeof(*fences), + GFP_KERNEL); + if (!fences) + return ERR_PTR(-ENOMEM); + } + + for_each_tile(tile, vm->xe, id) { + if (!(vma->tile_present & BIT(id))) + goto next; + + fence = __xe_pt_unbind_vma(tile, vma, q ? q : vm->q[id], + first_op ? syncs : NULL, + first_op ? num_syncs : 0); + if (IS_ERR(fence)) { + err = PTR_ERR(fence); + goto err_fences; + } + + if (fences) + fences[cur_fence++] = fence; + +next: + if (q && vm->pt_root[id] && !list_empty(&q->multi_gt_list)) + q = list_next_entry(q, multi_gt_list); + } + + if (fences) { + cf = dma_fence_array_create(number_tiles, fences, + vm->composite_fence_ctx, + vm->composite_fence_seqno++, + false); + if (!cf) { + --vm->composite_fence_seqno; + err = -ENOMEM; + goto err_fences; + } + } + + fence = cf ? &cf->base : !fence ? 
+ xe_exec_queue_last_fence_get(wait_exec_queue, vm) : fence; + if (last_op) { + for (i = 0; i < num_syncs; i++) + xe_sync_entry_signal(&syncs[i], NULL, fence); + } + + return fence; + +err_fences: + if (fences) { + while (cur_fence) + dma_fence_put(fences[--cur_fence]); + kfree(fences); + } + + return ERR_PTR(err); +} + +static struct dma_fence * +xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q, + struct xe_sync_entry *syncs, u32 num_syncs, + bool first_op, bool last_op) +{ + struct xe_tile *tile; + struct dma_fence *fence; + struct dma_fence **fences = NULL; + struct dma_fence_array *cf = NULL; + struct xe_vm *vm = xe_vma_vm(vma); + int cur_fence = 0, i; + int number_tiles = hweight8(vma->tile_mask); + int err; + u8 id; + + trace_xe_vma_bind(vma); + + if (number_tiles > 1) { + fences = kmalloc_array(number_tiles, sizeof(*fences), + GFP_KERNEL); + if (!fences) + return ERR_PTR(-ENOMEM); + } + + for_each_tile(tile, vm->xe, id) { + if (!(vma->tile_mask & BIT(id))) + goto next; + + fence = __xe_pt_bind_vma(tile, vma, q ? q : vm->q[id], + first_op ? syncs : NULL, + first_op ? num_syncs : 0, + vma->tile_present & BIT(id)); + if (IS_ERR(fence)) { + err = PTR_ERR(fence); + goto err_fences; + } + + if (fences) + fences[cur_fence++] = fence; + +next: + if (q && vm->pt_root[id] && !list_empty(&q->multi_gt_list)) + q = list_next_entry(q, multi_gt_list); + } + + if (fences) { + cf = dma_fence_array_create(number_tiles, fences, + vm->composite_fence_ctx, + vm->composite_fence_seqno++, + false); + if (!cf) { + --vm->composite_fence_seqno; + err = -ENOMEM; + goto err_fences; + } + } + + if (last_op) { + for (i = 0; i < num_syncs; i++) + xe_sync_entry_signal(&syncs[i], NULL, + cf ? &cf->base : fence); + } + + return cf ? &cf->base : fence; + +err_fences: + if (fences) { + while (cur_fence) + dma_fence_put(fences[--cur_fence]); + kfree(fences); + } + + return ERR_PTR(err); +} + +static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, + struct xe_exec_queue *q, struct xe_sync_entry *syncs, + u32 num_syncs, bool immediate, bool first_op, + bool last_op) +{ + struct dma_fence *fence; + struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q); + + xe_vm_assert_held(vm); + + if (immediate) { + fence = xe_vm_bind_vma(vma, q, syncs, num_syncs, first_op, + last_op); + if (IS_ERR(fence)) + return PTR_ERR(fence); + } else { + int i; + + xe_assert(vm->xe, xe_vm_in_fault_mode(vm)); + + fence = xe_exec_queue_last_fence_get(wait_exec_queue, vm); + if (last_op) { + for (i = 0; i < num_syncs; i++) + xe_sync_entry_signal(&syncs[i], NULL, fence); + } + } + + if (last_op) + xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence); + dma_fence_put(fence); + + return 0; +} + +static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q, + struct xe_bo *bo, struct xe_sync_entry *syncs, + u32 num_syncs, bool immediate, bool first_op, + bool last_op) +{ + int err; + + xe_vm_assert_held(vm); + xe_bo_assert_held(bo); + + if (bo && immediate) { + err = xe_bo_validate(bo, vm, true); + if (err) + return err; + } + + return __xe_vm_bind(vm, vma, q, syncs, num_syncs, immediate, first_op, + last_op); +} + +static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma, + struct xe_exec_queue *q, struct xe_sync_entry *syncs, + u32 num_syncs, bool first_op, bool last_op) +{ + struct dma_fence *fence; + struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q); + + xe_vm_assert_held(vm); + xe_bo_assert_held(xe_vma_bo(vma)); + + fence = xe_vm_unbind_vma(vma, q, syncs, num_syncs, 
first_op, last_op); + if (IS_ERR(fence)) + return PTR_ERR(fence); + + xe_vma_destroy(vma, fence); + if (last_op) + xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence); + dma_fence_put(fence); + + return 0; +} + +#define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \ + DRM_XE_VM_CREATE_FLAG_LR_MODE | \ + DRM_XE_VM_CREATE_FLAG_FAULT_MODE) + +int xe_vm_create_ioctl(struct drm_device *dev, void *data, + struct drm_file *file) +{ + struct xe_device *xe = to_xe_device(dev); + struct xe_file *xef = to_xe_file(file); + struct drm_xe_vm_create *args = data; + struct xe_tile *tile; + struct xe_vm *vm; + u32 id, asid; + int err; + u32 flags = 0; + + if (XE_IOCTL_DBG(xe, args->extensions)) + return -EINVAL; + + if (XE_WA(xe_root_mmio_gt(xe), 14016763929)) + args->flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE; + + if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE && + !xe->info.has_usm)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1])) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE && + args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE) && + args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE && + xe_device_in_non_fault_mode(xe))) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE) && + xe_device_in_fault_mode(xe))) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, args->extensions)) + return -EINVAL; + + if (args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE) + flags |= XE_VM_FLAG_SCRATCH_PAGE; + if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE) + flags |= XE_VM_FLAG_LR_MODE; + if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE) + flags |= XE_VM_FLAG_FAULT_MODE; + + vm = xe_vm_create(xe, flags); + if (IS_ERR(vm)) + return PTR_ERR(vm); + + mutex_lock(&xef->vm.lock); + err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL); + mutex_unlock(&xef->vm.lock); + if (err) { + xe_vm_close_and_put(vm); + return err; + } + + if (xe->info.has_asid) { + mutex_lock(&xe->usm.lock); + err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm, + XA_LIMIT(1, XE_MAX_ASID - 1), + &xe->usm.next_asid, GFP_KERNEL); + mutex_unlock(&xe->usm.lock); + if (err < 0) { + xe_vm_close_and_put(vm); + return err; + } + err = 0; + vm->usm.asid = asid; + } + + args->vm_id = id; + vm->xef = xef; + + /* Record BO memory for VM pagetable created against client */ + for_each_tile(tile, xe, id) + if (vm->pt_root[id]) + xe_drm_client_add_bo(vm->xef->client, vm->pt_root[id]->bo); + +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM) + /* Warning: Security issue - never enable by default */ + args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE); +#endif + + return 0; +} + +int xe_vm_destroy_ioctl(struct drm_device *dev, void *data, + struct drm_file *file) +{ + struct xe_device *xe = to_xe_device(dev); + struct xe_file *xef = to_xe_file(file); + struct drm_xe_vm_destroy *args = data; + struct xe_vm *vm; + int err = 0; + + if (XE_IOCTL_DBG(xe, args->pad) || + XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1])) + return -EINVAL; + + mutex_lock(&xef->vm.lock); + vm = xa_load(&xef->vm.xa, args->vm_id); + if (XE_IOCTL_DBG(xe, !vm)) + err = -ENOENT; + else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues)) + err = -EBUSY; + else + 
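From userspace, the ioctl pair handled above is driven through the drm_xe_vm_create/drm_xe_vm_destroy structs from the xe_drm.h uAPI header. The sketch below assumes the standard DRM_IOCTL_XE_VM_CREATE/DRM_IOCTL_XE_VM_DESTROY wrapper macros from that header and an already-open xe device fd; error handling is trimmed to the essentials:

```c
#include <errno.h>
#include <stdint.h>
#include <sys/ioctl.h>

#include <drm/xe_drm.h>

static int create_vm_with_scratch(int fd, uint32_t *vm_id)
{
	struct drm_xe_vm_create create = {
		.flags = DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE,
	};

	if (ioctl(fd, DRM_IOCTL_XE_VM_CREATE, &create))
		return -errno;

	*vm_id = create.vm_id;
	return 0;
}

static int destroy_vm(int fd, uint32_t vm_id)
{
	struct drm_xe_vm_destroy destroy = { .vm_id = vm_id };

	return ioctl(fd, DRM_IOCTL_XE_VM_DESTROY, &destroy) ? -errno : 0;
}
```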
xa_erase(&xef->vm.xa, args->vm_id); + mutex_unlock(&xef->vm.lock); + + if (!err) + xe_vm_close_and_put(vm); + + return err; +} + +static const u32 region_to_mem_type[] = { + XE_PL_TT, + XE_PL_VRAM0, + XE_PL_VRAM1, +}; + +static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma, + struct xe_exec_queue *q, u32 region, + struct xe_sync_entry *syncs, u32 num_syncs, + bool first_op, bool last_op) +{ + struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q); + int err; + + xe_assert(vm->xe, region <= ARRAY_SIZE(region_to_mem_type)); + + if (!xe_vma_has_no_bo(vma)) { + err = xe_bo_migrate(xe_vma_bo(vma), region_to_mem_type[region]); + if (err) + return err; + } + + if (vma->tile_mask != (vma->tile_present & ~vma->usm.tile_invalidated)) { + return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs, + true, first_op, last_op); + } else { + int i; + + /* Nothing to do, signal fences now */ + if (last_op) { + for (i = 0; i < num_syncs; i++) { + struct dma_fence *fence = + xe_exec_queue_last_fence_get(wait_exec_queue, vm); + + xe_sync_entry_signal(&syncs[i], NULL, fence); + } + } + + return 0; + } +} + +static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma, + bool post_commit) +{ + down_read(&vm->userptr.notifier_lock); + vma->gpuva.flags |= XE_VMA_DESTROYED; + up_read(&vm->userptr.notifier_lock); + if (post_commit) + xe_vm_remove_vma(vm, vma); +} + +#undef ULL +#define ULL unsigned long long + +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM) +static void print_op(struct xe_device *xe, struct drm_gpuva_op *op) +{ + struct xe_vma *vma; + + switch (op->op) { + case DRM_GPUVA_OP_MAP: + vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx", + (ULL)op->map.va.addr, (ULL)op->map.va.range); + break; + case DRM_GPUVA_OP_REMAP: + vma = gpuva_to_vma(op->remap.unmap->va); + vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d", + (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma), + op->remap.unmap->keep ? 1 : 0); + if (op->remap.prev) + vm_dbg(&xe->drm, + "REMAP:PREV: addr=0x%016llx, range=0x%016llx", + (ULL)op->remap.prev->va.addr, + (ULL)op->remap.prev->va.range); + if (op->remap.next) + vm_dbg(&xe->drm, + "REMAP:NEXT: addr=0x%016llx, range=0x%016llx", + (ULL)op->remap.next->va.addr, + (ULL)op->remap.next->va.range); + break; + case DRM_GPUVA_OP_UNMAP: + vma = gpuva_to_vma(op->unmap.va); + vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d", + (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma), + op->unmap.keep ? 1 : 0); + break; + case DRM_GPUVA_OP_PREFETCH: + vma = gpuva_to_vma(op->prefetch.va); + vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx", + (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma)); + break; + default: + drm_warn(&xe->drm, "NOT POSSIBLE"); + } +} +#else +static void print_op(struct xe_device *xe, struct drm_gpuva_op *op) +{ +} +#endif + +/* + * Create operations list from IOCTL arguments, setup operations fields so parse + * and commit steps are decoupled from IOCTL arguments. This step can fail. + */ +static struct drm_gpuva_ops * +vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo, + u64 bo_offset_or_userptr, u64 addr, u64 range, + u32 operation, u32 flags, + u32 prefetch_region, u16 pat_index) +{ + struct drm_gem_object *obj = bo ? 
&bo->ttm.base : NULL; + struct drm_gpuva_ops *ops; + struct drm_gpuva_op *__op; + struct xe_vma_op *op; + struct drm_gpuvm_bo *vm_bo; + int err; + + lockdep_assert_held_write(&vm->lock); + + vm_dbg(&vm->xe->drm, + "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx", + operation, (ULL)addr, (ULL)range, + (ULL)bo_offset_or_userptr); + + switch (operation) { + case DRM_XE_VM_BIND_OP_MAP: + case DRM_XE_VM_BIND_OP_MAP_USERPTR: + ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, addr, range, + obj, bo_offset_or_userptr); + break; + case DRM_XE_VM_BIND_OP_UNMAP: + ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range); + break; + case DRM_XE_VM_BIND_OP_PREFETCH: + ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range); + break; + case DRM_XE_VM_BIND_OP_UNMAP_ALL: + xe_assert(vm->xe, bo); + + err = xe_bo_lock(bo, true); + if (err) + return ERR_PTR(err); + + vm_bo = drm_gpuvm_bo_find(&vm->gpuvm, obj); + if (!vm_bo) + break; + + ops = drm_gpuvm_bo_unmap_ops_create(vm_bo); + drm_gpuvm_bo_put(vm_bo); + xe_bo_unlock(bo); + break; + default: + drm_warn(&vm->xe->drm, "NOT POSSIBLE"); + ops = ERR_PTR(-EINVAL); + } + if (IS_ERR(ops)) + return ops; + +#ifdef TEST_VM_ASYNC_OPS_ERROR + if (operation & FORCE_ASYNC_OP_ERROR) { + op = list_first_entry_or_null(&ops->list, struct xe_vma_op, + base.entry); + if (op) + op->inject_error = true; + } +#endif + + drm_gpuva_for_each_op(__op, ops) { + struct xe_vma_op *op = gpuva_op_to_vma_op(__op); + + if (__op->op == DRM_GPUVA_OP_MAP) { + op->map.immediate = + flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE; + op->map.read_only = + flags & DRM_XE_VM_BIND_FLAG_READONLY; + op->map.is_null = flags & DRM_XE_VM_BIND_FLAG_NULL; + op->map.pat_index = pat_index; + } else if (__op->op == DRM_GPUVA_OP_PREFETCH) { + op->prefetch.region = prefetch_region; + } + + print_op(vm->xe, __op); + } + + return ops; +} + +static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op, + u16 pat_index, unsigned int flags) +{ + struct xe_bo *bo = op->gem.obj ? 
gem_to_xe_bo(op->gem.obj) : NULL; + struct drm_exec exec; + struct xe_vma *vma; + int err; + + lockdep_assert_held_write(&vm->lock); + + if (bo) { + drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0); + drm_exec_until_all_locked(&exec) { + err = 0; + if (!bo->vm) { + err = drm_exec_lock_obj(&exec, xe_vm_obj(vm)); + drm_exec_retry_on_contention(&exec); + } + if (!err) { + err = drm_exec_lock_obj(&exec, &bo->ttm.base); + drm_exec_retry_on_contention(&exec); + } + if (err) { + drm_exec_fini(&exec); + return ERR_PTR(err); + } + } + } + vma = xe_vma_create(vm, bo, op->gem.offset, + op->va.addr, op->va.addr + + op->va.range - 1, pat_index, flags); + if (bo) + drm_exec_fini(&exec); + + if (xe_vma_is_userptr(vma)) { + err = xe_vma_userptr_pin_pages(vma); + if (err) { + prep_vma_destroy(vm, vma, false); + xe_vma_destroy_unlocked(vma); + return ERR_PTR(err); + } + } else if (!xe_vma_has_no_bo(vma) && !bo->vm) { + err = add_preempt_fences(vm, bo); + if (err) { + prep_vma_destroy(vm, vma, false); + xe_vma_destroy_unlocked(vma); + return ERR_PTR(err); + } + } + + return vma; +} + +static u64 xe_vma_max_pte_size(struct xe_vma *vma) +{ + if (vma->gpuva.flags & XE_VMA_PTE_1G) + return SZ_1G; + else if (vma->gpuva.flags & XE_VMA_PTE_2M) + return SZ_2M; + + return SZ_4K; +} + +static u64 xe_vma_set_pte_size(struct xe_vma *vma, u64 size) +{ + switch (size) { + case SZ_1G: + vma->gpuva.flags |= XE_VMA_PTE_1G; + break; + case SZ_2M: + vma->gpuva.flags |= XE_VMA_PTE_2M; + break; + } + + return SZ_4K; +} + +static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op) +{ + int err = 0; + + lockdep_assert_held_write(&vm->lock); + + switch (op->base.op) { + case DRM_GPUVA_OP_MAP: + err |= xe_vm_insert_vma(vm, op->map.vma); + if (!err) + op->flags |= XE_VMA_OP_COMMITTED; + break; + case DRM_GPUVA_OP_REMAP: + { + u8 tile_present = + gpuva_to_vma(op->base.remap.unmap->va)->tile_present; + + prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va), + true); + op->flags |= XE_VMA_OP_COMMITTED; + + if (op->remap.prev) { + err |= xe_vm_insert_vma(vm, op->remap.prev); + if (!err) + op->flags |= XE_VMA_OP_PREV_COMMITTED; + if (!err && op->remap.skip_prev) { + op->remap.prev->tile_present = + tile_present; + op->remap.prev = NULL; + } + } + if (op->remap.next) { + err |= xe_vm_insert_vma(vm, op->remap.next); + if (!err) + op->flags |= XE_VMA_OP_NEXT_COMMITTED; + if (!err && op->remap.skip_next) { + op->remap.next->tile_present = + tile_present; + op->remap.next = NULL; + } + } + + /* Adjust for partial unbind after removin VMA from VM */ + if (!err) { + op->base.remap.unmap->va->va.addr = op->remap.start; + op->base.remap.unmap->va->va.range = op->remap.range; + } + break; + } + case DRM_GPUVA_OP_UNMAP: + prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true); + op->flags |= XE_VMA_OP_COMMITTED; + break; + case DRM_GPUVA_OP_PREFETCH: + op->flags |= XE_VMA_OP_COMMITTED; + break; + default: + drm_warn(&vm->xe->drm, "NOT POSSIBLE"); + } + + return err; +} + + +static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q, + struct drm_gpuva_ops *ops, + struct xe_sync_entry *syncs, u32 num_syncs, + struct list_head *ops_list, bool last) +{ + struct xe_vma_op *last_op = NULL; + struct drm_gpuva_op *__op; + int err = 0; + + lockdep_assert_held_write(&vm->lock); + + drm_gpuva_for_each_op(__op, ops) { + struct xe_vma_op *op = gpuva_op_to_vma_op(__op); + struct xe_vma *vma; + bool first = list_empty(ops_list); + unsigned int flags = 0; + + INIT_LIST_HEAD(&op->link); + list_add_tail(&op->link, 
ops_list); + + if (first) { + op->flags |= XE_VMA_OP_FIRST; + op->num_syncs = num_syncs; + op->syncs = syncs; + } + + op->q = q; + + switch (op->base.op) { + case DRM_GPUVA_OP_MAP: + { + flags |= op->map.read_only ? + VMA_CREATE_FLAG_READ_ONLY : 0; + flags |= op->map.is_null ? + VMA_CREATE_FLAG_IS_NULL : 0; + + vma = new_vma(vm, &op->base.map, op->map.pat_index, + flags); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + op->map.vma = vma; + break; + } + case DRM_GPUVA_OP_REMAP: + { + struct xe_vma *old = + gpuva_to_vma(op->base.remap.unmap->va); + + op->remap.start = xe_vma_start(old); + op->remap.range = xe_vma_size(old); + + if (op->base.remap.prev) { + flags |= op->base.remap.unmap->va->flags & + XE_VMA_READ_ONLY ? + VMA_CREATE_FLAG_READ_ONLY : 0; + flags |= op->base.remap.unmap->va->flags & + DRM_GPUVA_SPARSE ? + VMA_CREATE_FLAG_IS_NULL : 0; + + vma = new_vma(vm, op->base.remap.prev, + old->pat_index, flags); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + op->remap.prev = vma; + + /* + * Userptr creates a new SG mapping so + * we must also rebind. + */ + op->remap.skip_prev = !xe_vma_is_userptr(old) && + IS_ALIGNED(xe_vma_end(vma), + xe_vma_max_pte_size(old)); + if (op->remap.skip_prev) { + xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old)); + op->remap.range -= + xe_vma_end(vma) - + xe_vma_start(old); + op->remap.start = xe_vma_end(vma); + } + } + + if (op->base.remap.next) { + flags |= op->base.remap.unmap->va->flags & + XE_VMA_READ_ONLY ? + VMA_CREATE_FLAG_READ_ONLY : 0; + flags |= op->base.remap.unmap->va->flags & + DRM_GPUVA_SPARSE ? + VMA_CREATE_FLAG_IS_NULL : 0; + + vma = new_vma(vm, op->base.remap.next, + old->pat_index, flags); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + op->remap.next = vma; + + /* + * Userptr creates a new SG mapping so + * we must also rebind. 
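				 * The IS_ALIGNED() check below only lets us
				 * skip the rebind when the remaining piece
				 * still starts on the old VMA's largest
				 * page-table granule, so its existing PTEs
				 * can be left in place.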
+ */ + op->remap.skip_next = !xe_vma_is_userptr(old) && + IS_ALIGNED(xe_vma_start(vma), + xe_vma_max_pte_size(old)); + if (op->remap.skip_next) { + xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old)); + op->remap.range -= + xe_vma_end(old) - + xe_vma_start(vma); + } + } + break; + } + case DRM_GPUVA_OP_UNMAP: + case DRM_GPUVA_OP_PREFETCH: + /* Nothing to do */ + break; + default: + drm_warn(&vm->xe->drm, "NOT POSSIBLE"); + } + + last_op = op; + + err = xe_vma_op_commit(vm, op); + if (err) + return err; + } + + /* FIXME: Unhandled corner case */ + XE_WARN_ON(!last_op && last && !list_empty(ops_list)); + + if (!last_op) + return 0; + + last_op->ops = ops; + if (last) { + last_op->flags |= XE_VMA_OP_LAST; + last_op->num_syncs = num_syncs; + last_op->syncs = syncs; + } + + return 0; +} + +static int op_execute(struct drm_exec *exec, struct xe_vm *vm, + struct xe_vma *vma, struct xe_vma_op *op) +{ + int err; + + lockdep_assert_held_write(&vm->lock); + + err = xe_vm_prepare_vma(exec, vma, 1); + if (err) + return err; + + xe_vm_assert_held(vm); + xe_bo_assert_held(xe_vma_bo(vma)); + + switch (op->base.op) { + case DRM_GPUVA_OP_MAP: + err = xe_vm_bind(vm, vma, op->q, xe_vma_bo(vma), + op->syncs, op->num_syncs, + op->map.immediate || !xe_vm_in_fault_mode(vm), + op->flags & XE_VMA_OP_FIRST, + op->flags & XE_VMA_OP_LAST); + break; + case DRM_GPUVA_OP_REMAP: + { + bool prev = !!op->remap.prev; + bool next = !!op->remap.next; + + if (!op->remap.unmap_done) { + if (prev || next) + vma->gpuva.flags |= XE_VMA_FIRST_REBIND; + err = xe_vm_unbind(vm, vma, op->q, op->syncs, + op->num_syncs, + op->flags & XE_VMA_OP_FIRST, + op->flags & XE_VMA_OP_LAST && + !prev && !next); + if (err) + break; + op->remap.unmap_done = true; + } + + if (prev) { + op->remap.prev->gpuva.flags |= XE_VMA_LAST_REBIND; + err = xe_vm_bind(vm, op->remap.prev, op->q, + xe_vma_bo(op->remap.prev), op->syncs, + op->num_syncs, true, false, + op->flags & XE_VMA_OP_LAST && !next); + op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND; + if (err) + break; + op->remap.prev = NULL; + } + + if (next) { + op->remap.next->gpuva.flags |= XE_VMA_LAST_REBIND; + err = xe_vm_bind(vm, op->remap.next, op->q, + xe_vma_bo(op->remap.next), + op->syncs, op->num_syncs, + true, false, + op->flags & XE_VMA_OP_LAST); + op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND; + if (err) + break; + op->remap.next = NULL; + } + + break; + } + case DRM_GPUVA_OP_UNMAP: + err = xe_vm_unbind(vm, vma, op->q, op->syncs, + op->num_syncs, op->flags & XE_VMA_OP_FIRST, + op->flags & XE_VMA_OP_LAST); + break; + case DRM_GPUVA_OP_PREFETCH: + err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region, + op->syncs, op->num_syncs, + op->flags & XE_VMA_OP_FIRST, + op->flags & XE_VMA_OP_LAST); + break; + default: + drm_warn(&vm->xe->drm, "NOT POSSIBLE"); + } + + if (err) + trace_xe_vma_fail(vma); + + return err; +} + +static int __xe_vma_op_execute(struct xe_vm *vm, struct xe_vma *vma, + struct xe_vma_op *op) +{ + struct drm_exec exec; + int err; + +retry_userptr: + drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0); + drm_exec_until_all_locked(&exec) { + err = op_execute(&exec, vm, vma, op); + drm_exec_retry_on_contention(&exec); + if (err) + break; + } + drm_exec_fini(&exec); + + if (err == -EAGAIN && xe_vma_is_userptr(vma)) { + lockdep_assert_held_write(&vm->lock); + err = xe_vma_userptr_pin_pages(vma); + if (!err) + goto retry_userptr; + + trace_xe_vma_fail(vma); + } + + return err; +} + +static int xe_vma_op_execute(struct xe_vm *vm, struct xe_vma_op *op) +{ + int ret = 0; + + 
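	/*
	 * Resolve which VMA this GPUVA op operates on and funnel it through
	 * __xe_vma_op_execute(), which owns the drm_exec locking loop and the
	 * userptr repin-and-retry handling on -EAGAIN.
	 */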
lockdep_assert_held_write(&vm->lock); + +#ifdef TEST_VM_ASYNC_OPS_ERROR + if (op->inject_error) { + op->inject_error = false; + return -ENOMEM; + } +#endif + + switch (op->base.op) { + case DRM_GPUVA_OP_MAP: + ret = __xe_vma_op_execute(vm, op->map.vma, op); + break; + case DRM_GPUVA_OP_REMAP: + { + struct xe_vma *vma; + + if (!op->remap.unmap_done) + vma = gpuva_to_vma(op->base.remap.unmap->va); + else if (op->remap.prev) + vma = op->remap.prev; + else + vma = op->remap.next; + + ret = __xe_vma_op_execute(vm, vma, op); + break; + } + case DRM_GPUVA_OP_UNMAP: + ret = __xe_vma_op_execute(vm, gpuva_to_vma(op->base.unmap.va), + op); + break; + case DRM_GPUVA_OP_PREFETCH: + ret = __xe_vma_op_execute(vm, + gpuva_to_vma(op->base.prefetch.va), + op); + break; + default: + drm_warn(&vm->xe->drm, "NOT POSSIBLE"); + } + + return ret; +} + +static void xe_vma_op_cleanup(struct xe_vm *vm, struct xe_vma_op *op) +{ + bool last = op->flags & XE_VMA_OP_LAST; + + if (last) { + while (op->num_syncs--) + xe_sync_entry_cleanup(&op->syncs[op->num_syncs]); + kfree(op->syncs); + if (op->q) + xe_exec_queue_put(op->q); + } + if (!list_empty(&op->link)) + list_del(&op->link); + if (op->ops) + drm_gpuva_ops_free(&vm->gpuvm, op->ops); + if (last) + xe_vm_put(vm); +} + +static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op, + bool post_commit, bool prev_post_commit, + bool next_post_commit) +{ + lockdep_assert_held_write(&vm->lock); + + switch (op->base.op) { + case DRM_GPUVA_OP_MAP: + if (op->map.vma) { + prep_vma_destroy(vm, op->map.vma, post_commit); + xe_vma_destroy_unlocked(op->map.vma); + } + break; + case DRM_GPUVA_OP_UNMAP: + { + struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va); + + if (vma) { + down_read(&vm->userptr.notifier_lock); + vma->gpuva.flags &= ~XE_VMA_DESTROYED; + up_read(&vm->userptr.notifier_lock); + if (post_commit) + xe_vm_insert_vma(vm, vma); + } + break; + } + case DRM_GPUVA_OP_REMAP: + { + struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va); + + if (op->remap.prev) { + prep_vma_destroy(vm, op->remap.prev, prev_post_commit); + xe_vma_destroy_unlocked(op->remap.prev); + } + if (op->remap.next) { + prep_vma_destroy(vm, op->remap.next, next_post_commit); + xe_vma_destroy_unlocked(op->remap.next); + } + if (vma) { + down_read(&vm->userptr.notifier_lock); + vma->gpuva.flags &= ~XE_VMA_DESTROYED; + up_read(&vm->userptr.notifier_lock); + if (post_commit) + xe_vm_insert_vma(vm, vma); + } + break; + } + case DRM_GPUVA_OP_PREFETCH: + /* Nothing to do */ + break; + default: + drm_warn(&vm->xe->drm, "NOT POSSIBLE"); + } +} + +static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm, + struct drm_gpuva_ops **ops, + int num_ops_list) +{ + int i; + + for (i = num_ops_list - 1; i; ++i) { + struct drm_gpuva_ops *__ops = ops[i]; + struct drm_gpuva_op *__op; + + if (!__ops) + continue; + + drm_gpuva_for_each_op_reverse(__op, __ops) { + struct xe_vma_op *op = gpuva_op_to_vma_op(__op); + + xe_vma_op_unwind(vm, op, + op->flags & XE_VMA_OP_COMMITTED, + op->flags & XE_VMA_OP_PREV_COMMITTED, + op->flags & XE_VMA_OP_NEXT_COMMITTED); + } + + drm_gpuva_ops_free(&vm->gpuvm, __ops); + } +} + +static int vm_bind_ioctl_ops_execute(struct xe_vm *vm, + struct list_head *ops_list) +{ + struct xe_vma_op *op, *next; + int err; + + lockdep_assert_held_write(&vm->lock); + + list_for_each_entry_safe(op, next, ops_list, link) { + err = xe_vma_op_execute(vm, op); + if (err) { + drm_warn(&vm->xe->drm, "VM op(%d) failed with %d", + op->base.op, err); + /* + * FIXME: Killing VM rather than proper error handling + 
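			 * here: the op list cannot be unwound once execution
			 * has started, so the VM is banned and -ENOSPC is
			 * reported back to user space.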
*/ + xe_vm_kill(vm); + return -ENOSPC; + } + xe_vma_op_cleanup(vm, op); + } + + return 0; +} + +#ifdef TEST_VM_ASYNC_OPS_ERROR +#define SUPPORTED_FLAGS \ + (FORCE_ASYNC_OP_ERROR | DRM_XE_VM_BIND_FLAG_READONLY | \ + DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | 0xffff) +#else +#define SUPPORTED_FLAGS \ + (DRM_XE_VM_BIND_FLAG_READONLY | \ + DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | \ + 0xffff) +#endif +#define XE_64K_PAGE_MASK 0xffffull +#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP) + +#define MAX_BINDS 512 /* FIXME: Picking random upper limit */ + +static int vm_bind_ioctl_check_args(struct xe_device *xe, + struct drm_xe_vm_bind *args, + struct drm_xe_vm_bind_op **bind_ops) +{ + int err; + int i; + + if (XE_IOCTL_DBG(xe, args->pad || args->pad2) || + XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1])) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, args->extensions) || + XE_IOCTL_DBG(xe, args->num_binds > MAX_BINDS)) + return -EINVAL; + + if (args->num_binds > 1) { + u64 __user *bind_user = + u64_to_user_ptr(args->vector_of_binds); + + *bind_ops = kmalloc(sizeof(struct drm_xe_vm_bind_op) * + args->num_binds, GFP_KERNEL); + if (!*bind_ops) + return -ENOMEM; + + err = __copy_from_user(*bind_ops, bind_user, + sizeof(struct drm_xe_vm_bind_op) * + args->num_binds); + if (XE_IOCTL_DBG(xe, err)) { + err = -EFAULT; + goto free_bind_ops; + } + } else { + *bind_ops = &args->bind; + } + + for (i = 0; i < args->num_binds; ++i) { + u64 range = (*bind_ops)[i].range; + u64 addr = (*bind_ops)[i].addr; + u32 op = (*bind_ops)[i].op; + u32 flags = (*bind_ops)[i].flags; + u32 obj = (*bind_ops)[i].obj; + u64 obj_offset = (*bind_ops)[i].obj_offset; + u32 prefetch_region = (*bind_ops)[i].prefetch_mem_region_instance; + bool is_null = flags & DRM_XE_VM_BIND_FLAG_NULL; + u16 pat_index = (*bind_ops)[i].pat_index; + u16 coh_mode; + + if (XE_IOCTL_DBG(xe, pat_index >= xe->pat.n_entries)) { + err = -EINVAL; + goto free_bind_ops; + } + + pat_index = array_index_nospec(pat_index, xe->pat.n_entries); + (*bind_ops)[i].pat_index = pat_index; + coh_mode = xe_pat_index_get_coh_mode(xe, pat_index); + if (XE_IOCTL_DBG(xe, !coh_mode)) { /* hw reserved */ + err = -EINVAL; + goto free_bind_ops; + } + + if (XE_WARN_ON(coh_mode > XE_COH_AT_LEAST_1WAY)) { + err = -EINVAL; + goto free_bind_ops; + } + + if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) || + XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) || + XE_IOCTL_DBG(xe, obj && is_null) || + XE_IOCTL_DBG(xe, obj_offset && is_null) || + XE_IOCTL_DBG(xe, op != DRM_XE_VM_BIND_OP_MAP && + is_null) || + XE_IOCTL_DBG(xe, !obj && + op == DRM_XE_VM_BIND_OP_MAP && + !is_null) || + XE_IOCTL_DBG(xe, !obj && + op == DRM_XE_VM_BIND_OP_UNMAP_ALL) || + XE_IOCTL_DBG(xe, addr && + op == DRM_XE_VM_BIND_OP_UNMAP_ALL) || + XE_IOCTL_DBG(xe, range && + op == DRM_XE_VM_BIND_OP_UNMAP_ALL) || + XE_IOCTL_DBG(xe, obj && + op == DRM_XE_VM_BIND_OP_MAP_USERPTR) || + XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE && + op == DRM_XE_VM_BIND_OP_MAP_USERPTR) || + XE_IOCTL_DBG(xe, obj && + op == DRM_XE_VM_BIND_OP_PREFETCH) || + XE_IOCTL_DBG(xe, prefetch_region && + op != DRM_XE_VM_BIND_OP_PREFETCH) || + XE_IOCTL_DBG(xe, !(BIT(prefetch_region) & + xe->info.mem_region_mask)) || + XE_IOCTL_DBG(xe, obj && + op == DRM_XE_VM_BIND_OP_UNMAP)) { + err = -EINVAL; + goto free_bind_ops; + } + + if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) || + XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) || + XE_IOCTL_DBG(xe, range & ~PAGE_MASK) || + XE_IOCTL_DBG(xe, !range && + op != 
DRM_XE_VM_BIND_OP_UNMAP_ALL)) { + err = -EINVAL; + goto free_bind_ops; + } + } + + return 0; + +free_bind_ops: + if (args->num_binds > 1) + kfree(*bind_ops); + return err; +} + +static int vm_bind_ioctl_signal_fences(struct xe_vm *vm, + struct xe_exec_queue *q, + struct xe_sync_entry *syncs, + int num_syncs) +{ + struct dma_fence *fence; + int i, err = 0; + + fence = xe_sync_in_fence_get(syncs, num_syncs, + to_wait_exec_queue(vm, q), vm); + if (IS_ERR(fence)) + return PTR_ERR(fence); + + for (i = 0; i < num_syncs; i++) + xe_sync_entry_signal(&syncs[i], NULL, fence); + + xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm, + fence); + dma_fence_put(fence); + + return err; +} + +int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file) +{ + struct xe_device *xe = to_xe_device(dev); + struct xe_file *xef = to_xe_file(file); + struct drm_xe_vm_bind *args = data; + struct drm_xe_sync __user *syncs_user; + struct xe_bo **bos = NULL; + struct drm_gpuva_ops **ops = NULL; + struct xe_vm *vm; + struct xe_exec_queue *q = NULL; + u32 num_syncs; + struct xe_sync_entry *syncs = NULL; + struct drm_xe_vm_bind_op *bind_ops; + LIST_HEAD(ops_list); + int err; + int i; + + err = vm_bind_ioctl_check_args(xe, args, &bind_ops); + if (err) + return err; + + if (args->exec_queue_id) { + q = xe_exec_queue_lookup(xef, args->exec_queue_id); + if (XE_IOCTL_DBG(xe, !q)) { + err = -ENOENT; + goto free_objs; + } + + if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) { + err = -EINVAL; + goto put_exec_queue; + } + } + + vm = xe_vm_lookup(xef, args->vm_id); + if (XE_IOCTL_DBG(xe, !vm)) { + err = -EINVAL; + goto put_exec_queue; + } + + err = down_write_killable(&vm->lock); + if (err) + goto put_vm; + + if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) { + err = -ENOENT; + goto release_vm_lock; + } + + for (i = 0; i < args->num_binds; ++i) { + u64 range = bind_ops[i].range; + u64 addr = bind_ops[i].addr; + + if (XE_IOCTL_DBG(xe, range > vm->size) || + XE_IOCTL_DBG(xe, addr > vm->size - range)) { + err = -EINVAL; + goto release_vm_lock; + } + } + + if (args->num_binds) { + bos = kcalloc(args->num_binds, sizeof(*bos), GFP_KERNEL); + if (!bos) { + err = -ENOMEM; + goto release_vm_lock; + } + + ops = kcalloc(args->num_binds, sizeof(*ops), GFP_KERNEL); + if (!ops) { + err = -ENOMEM; + goto release_vm_lock; + } + } + + for (i = 0; i < args->num_binds; ++i) { + struct drm_gem_object *gem_obj; + u64 range = bind_ops[i].range; + u64 addr = bind_ops[i].addr; + u32 obj = bind_ops[i].obj; + u64 obj_offset = bind_ops[i].obj_offset; + u16 pat_index = bind_ops[i].pat_index; + u16 coh_mode; + + if (!obj) + continue; + + gem_obj = drm_gem_object_lookup(file, obj); + if (XE_IOCTL_DBG(xe, !gem_obj)) { + err = -ENOENT; + goto put_obj; + } + bos[i] = gem_to_xe_bo(gem_obj); + + if (XE_IOCTL_DBG(xe, range > bos[i]->size) || + XE_IOCTL_DBG(xe, obj_offset > + bos[i]->size - range)) { + err = -EINVAL; + goto put_obj; + } + + if (bos[i]->flags & XE_BO_INTERNAL_64K) { + if (XE_IOCTL_DBG(xe, obj_offset & + XE_64K_PAGE_MASK) || + XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) || + XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) { + err = -EINVAL; + goto put_obj; + } + } + + coh_mode = xe_pat_index_get_coh_mode(xe, pat_index); + if (bos[i]->cpu_caching) { + if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE && + bos[i]->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)) { + err = -EINVAL; + goto put_obj; + } + } else if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE)) { + /* + * Imported dma-buf from a different device should + * require 1way or 
2way coherency since we don't know + * how it was mapped on the CPU. Just assume is it + * potentially cached on CPU side. + */ + err = -EINVAL; + goto put_obj; + } + } + + if (args->num_syncs) { + syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL); + if (!syncs) { + err = -ENOMEM; + goto put_obj; + } + } + + syncs_user = u64_to_user_ptr(args->syncs); + for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) { + err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs], + &syncs_user[num_syncs], + (xe_vm_in_lr_mode(vm) ? + SYNC_PARSE_FLAG_LR_MODE : 0) | + (!args->num_binds ? + SYNC_PARSE_FLAG_DISALLOW_USER_FENCE : 0)); + if (err) + goto free_syncs; + } + + if (!args->num_binds) { + err = -ENODATA; + goto free_syncs; + } + + for (i = 0; i < args->num_binds; ++i) { + u64 range = bind_ops[i].range; + u64 addr = bind_ops[i].addr; + u32 op = bind_ops[i].op; + u32 flags = bind_ops[i].flags; + u64 obj_offset = bind_ops[i].obj_offset; + u32 prefetch_region = bind_ops[i].prefetch_mem_region_instance; + u16 pat_index = bind_ops[i].pat_index; + + ops[i] = vm_bind_ioctl_ops_create(vm, bos[i], obj_offset, + addr, range, op, flags, + prefetch_region, pat_index); + if (IS_ERR(ops[i])) { + err = PTR_ERR(ops[i]); + ops[i] = NULL; + goto unwind_ops; + } + + err = vm_bind_ioctl_ops_parse(vm, q, ops[i], syncs, num_syncs, + &ops_list, + i == args->num_binds - 1); + if (err) + goto unwind_ops; + } + + /* Nothing to do */ + if (list_empty(&ops_list)) { + err = -ENODATA; + goto unwind_ops; + } + + xe_vm_get(vm); + if (q) + xe_exec_queue_get(q); + + err = vm_bind_ioctl_ops_execute(vm, &ops_list); + + up_write(&vm->lock); + + if (q) + xe_exec_queue_put(q); + xe_vm_put(vm); + + for (i = 0; bos && i < args->num_binds; ++i) + xe_bo_put(bos[i]); + + kfree(bos); + kfree(ops); + if (args->num_binds > 1) + kfree(bind_ops); + + return err; + +unwind_ops: + vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds); +free_syncs: + if (err == -ENODATA) + err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs); + while (num_syncs--) + xe_sync_entry_cleanup(&syncs[num_syncs]); + + kfree(syncs); +put_obj: + for (i = 0; i < args->num_binds; ++i) + xe_bo_put(bos[i]); +release_vm_lock: + up_write(&vm->lock); +put_vm: + xe_vm_put(vm); +put_exec_queue: + if (q) + xe_exec_queue_put(q); +free_objs: + kfree(bos); + kfree(ops); + if (args->num_binds > 1) + kfree(bind_ops); + return err; +} + +/** + * xe_vm_lock() - Lock the vm's dma_resv object + * @vm: The struct xe_vm whose lock is to be locked + * @intr: Whether to perform any wait interruptible + * + * Return: 0 on success, -EINTR if @intr is true and the wait for a + * contended lock was interrupted. If @intr is false, the function + * always returns 0. + */ +int xe_vm_lock(struct xe_vm *vm, bool intr) +{ + if (intr) + return dma_resv_lock_interruptible(xe_vm_resv(vm), NULL); + + return dma_resv_lock(xe_vm_resv(vm), NULL); +} + +/** + * xe_vm_unlock() - Unlock the vm's dma_resv object + * @vm: The struct xe_vm whose lock is to be released. + * + * Unlock a buffer object lock that was locked by xe_vm_lock(). + */ +void xe_vm_unlock(struct xe_vm *vm) +{ + dma_resv_unlock(xe_vm_resv(vm)); +} + +/** + * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock + * @vma: VMA to invalidate + * + * Walks a list of page tables leaves which it memset the entries owned by this + * VMA to zero, invalidates the TLBs, and block until TLBs invalidation is + * complete. + * + * Returns 0 for success, negative error code otherwise. 
+ */ +int xe_vm_invalidate_vma(struct xe_vma *vma) +{ + struct xe_device *xe = xe_vma_vm(vma)->xe; + struct xe_tile *tile; + u32 tile_needs_invalidate = 0; + int seqno[XE_MAX_TILES_PER_DEVICE]; + u8 id; + int ret; + + xe_assert(xe, xe_vm_in_fault_mode(xe_vma_vm(vma))); + xe_assert(xe, !xe_vma_is_null(vma)); + trace_xe_vma_usm_invalidate(vma); + + /* Check that we don't race with page-table updates */ + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { + if (xe_vma_is_userptr(vma)) { + WARN_ON_ONCE(!mmu_interval_check_retry + (&vma->userptr.notifier, + vma->userptr.notifier_seq)); + WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(xe_vma_vm(vma)), + DMA_RESV_USAGE_BOOKKEEP)); + + } else { + xe_bo_assert_held(xe_vma_bo(vma)); + } + } + + for_each_tile(tile, xe, id) { + if (xe_pt_zap_ptes(tile, vma)) { + tile_needs_invalidate |= BIT(id); + xe_device_wmb(xe); + /* + * FIXME: We potentially need to invalidate multiple + * GTs within the tile + */ + seqno[id] = xe_gt_tlb_invalidation_vma(tile->primary_gt, NULL, vma); + if (seqno[id] < 0) + return seqno[id]; + } + } + + for_each_tile(tile, xe, id) { + if (tile_needs_invalidate & BIT(id)) { + ret = xe_gt_tlb_invalidation_wait(tile->primary_gt, seqno[id]); + if (ret < 0) + return ret; + } + } + + vma->usm.tile_invalidated = vma->tile_mask; + + return 0; +} + +int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id) +{ + struct drm_gpuva *gpuva; + bool is_vram; + uint64_t addr; + + if (!down_read_trylock(&vm->lock)) { + drm_printf(p, " Failed to acquire VM lock to dump capture"); + return 0; + } + if (vm->pt_root[gt_id]) { + addr = xe_bo_addr(vm->pt_root[gt_id]->bo, 0, XE_PAGE_SIZE); + is_vram = xe_bo_is_vram(vm->pt_root[gt_id]->bo); + drm_printf(p, " VM root: A:0x%llx %s\n", addr, + is_vram ? "VRAM" : "SYS"); + } + + drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) { + struct xe_vma *vma = gpuva_to_vma(gpuva); + bool is_userptr = xe_vma_is_userptr(vma); + bool is_null = xe_vma_is_null(vma); + + if (is_null) { + addr = 0; + } else if (is_userptr) { + struct xe_res_cursor cur; + + if (vma->userptr.sg) { + xe_res_first_sg(vma->userptr.sg, 0, XE_PAGE_SIZE, + &cur); + addr = xe_res_dma(&cur); + } else { + addr = 0; + } + } else { + addr = __xe_bo_addr(xe_vma_bo(vma), 0, XE_PAGE_SIZE); + is_vram = xe_bo_is_vram(xe_vma_bo(vma)); + } + drm_printf(p, " [%016llx-%016llx] S:0x%016llx A:%016llx %s\n", + xe_vma_start(vma), xe_vma_end(vma) - 1, + xe_vma_size(vma), + addr, is_null ? "NULL" : is_userptr ? "USR" : + is_vram ? 
"VRAM" : "SYS"); + } + up_read(&vm->lock); + + return 0; +} diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h new file mode 100644 index 000000000000..cf2f96e8c1ab --- /dev/null +++ b/drivers/gpu/drm/xe/xe_vm.h @@ -0,0 +1,263 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef _XE_VM_H_ +#define _XE_VM_H_ + +#include "xe_bo_types.h" +#include "xe_macros.h" +#include "xe_map.h" +#include "xe_vm_types.h" + +struct drm_device; +struct drm_printer; +struct drm_file; + +struct ttm_buffer_object; +struct ttm_validate_buffer; + +struct xe_exec_queue; +struct xe_file; +struct xe_sync_entry; +struct drm_exec; + +struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags); + +struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id); +int xe_vma_cmp_vma_cb(const void *key, const struct rb_node *node); + +static inline struct xe_vm *xe_vm_get(struct xe_vm *vm) +{ + drm_gpuvm_get(&vm->gpuvm); + return vm; +} + +static inline void xe_vm_put(struct xe_vm *vm) +{ + drm_gpuvm_put(&vm->gpuvm); +} + +int xe_vm_lock(struct xe_vm *vm, bool intr); + +void xe_vm_unlock(struct xe_vm *vm); + +static inline bool xe_vm_is_closed(struct xe_vm *vm) +{ + /* Only guaranteed not to change when vm->lock is held */ + return !vm->size; +} + +static inline bool xe_vm_is_banned(struct xe_vm *vm) +{ + return vm->flags & XE_VM_FLAG_BANNED; +} + +static inline bool xe_vm_is_closed_or_banned(struct xe_vm *vm) +{ + lockdep_assert_held(&vm->lock); + return xe_vm_is_closed(vm) || xe_vm_is_banned(vm); +} + +struct xe_vma * +xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range); + +/** + * xe_vm_has_scratch() - Whether the vm is configured for scratch PTEs + * @vm: The vm + * + * Return: whether the vm populates unmapped areas with scratch PTEs + */ +static inline bool xe_vm_has_scratch(const struct xe_vm *vm) +{ + return vm->flags & XE_VM_FLAG_SCRATCH_PAGE; +} + +/** + * gpuvm_to_vm() - Return the embedding xe_vm from a struct drm_gpuvm pointer + * @gpuvm: The struct drm_gpuvm pointer + * + * Return: Pointer to the embedding struct xe_vm. + */ +static inline struct xe_vm *gpuvm_to_vm(struct drm_gpuvm *gpuvm) +{ + return container_of(gpuvm, struct xe_vm, gpuvm); +} + +static inline struct xe_vm *gpuva_to_vm(struct drm_gpuva *gpuva) +{ + return gpuvm_to_vm(gpuva->vm); +} + +static inline struct xe_vma *gpuva_to_vma(struct drm_gpuva *gpuva) +{ + return container_of(gpuva, struct xe_vma, gpuva); +} + +static inline struct xe_vma_op *gpuva_op_to_vma_op(struct drm_gpuva_op *op) +{ + return container_of(op, struct xe_vma_op, base); +} + +/** + * DOC: Provide accessors for vma members to facilitate easy change of + * implementation. + */ +static inline u64 xe_vma_start(struct xe_vma *vma) +{ + return vma->gpuva.va.addr; +} + +static inline u64 xe_vma_size(struct xe_vma *vma) +{ + return vma->gpuva.va.range; +} + +static inline u64 xe_vma_end(struct xe_vma *vma) +{ + return xe_vma_start(vma) + xe_vma_size(vma); +} + +static inline u64 xe_vma_bo_offset(struct xe_vma *vma) +{ + return vma->gpuva.gem.offset; +} + +static inline struct xe_bo *xe_vma_bo(struct xe_vma *vma) +{ + return !vma->gpuva.gem.obj ? 
NULL : + container_of(vma->gpuva.gem.obj, struct xe_bo, ttm.base); +} + +static inline struct xe_vm *xe_vma_vm(struct xe_vma *vma) +{ + return container_of(vma->gpuva.vm, struct xe_vm, gpuvm); +} + +static inline bool xe_vma_read_only(struct xe_vma *vma) +{ + return vma->gpuva.flags & XE_VMA_READ_ONLY; +} + +static inline u64 xe_vma_userptr(struct xe_vma *vma) +{ + return vma->gpuva.gem.offset; +} + +static inline bool xe_vma_is_null(struct xe_vma *vma) +{ + return vma->gpuva.flags & DRM_GPUVA_SPARSE; +} + +static inline bool xe_vma_has_no_bo(struct xe_vma *vma) +{ + return !xe_vma_bo(vma); +} + +static inline bool xe_vma_is_userptr(struct xe_vma *vma) +{ + return xe_vma_has_no_bo(vma) && !xe_vma_is_null(vma); +} + +u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile); + +int xe_vm_create_ioctl(struct drm_device *dev, void *data, + struct drm_file *file); +int xe_vm_destroy_ioctl(struct drm_device *dev, void *data, + struct drm_file *file); +int xe_vm_bind_ioctl(struct drm_device *dev, void *data, + struct drm_file *file); + +void xe_vm_close_and_put(struct xe_vm *vm); + +static inline bool xe_vm_in_fault_mode(struct xe_vm *vm) +{ + return vm->flags & XE_VM_FLAG_FAULT_MODE; +} + +static inline bool xe_vm_in_lr_mode(struct xe_vm *vm) +{ + return vm->flags & XE_VM_FLAG_LR_MODE; +} + +static inline bool xe_vm_in_preempt_fence_mode(struct xe_vm *vm) +{ + return xe_vm_in_lr_mode(vm) && !xe_vm_in_fault_mode(vm); +} + +int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q); +void xe_vm_remove_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q); + +int xe_vm_userptr_pin(struct xe_vm *vm); + +int __xe_vm_userptr_needs_repin(struct xe_vm *vm); + +int xe_vm_userptr_check_repin(struct xe_vm *vm); + +struct dma_fence *xe_vm_rebind(struct xe_vm *vm, bool rebind_worker); + +int xe_vm_invalidate_vma(struct xe_vma *vma); + +extern struct ttm_device_funcs xe_ttm_funcs; + +static inline void xe_vm_queue_rebind_worker(struct xe_vm *vm) +{ + xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm)); + queue_work(vm->xe->ordered_wq, &vm->preempt.rebind_work); +} + +/** + * xe_vm_reactivate_rebind() - Reactivate the rebind functionality on compute + * vms. + * @vm: The vm. + * + * If the rebind functionality on a compute vm was disabled due + * to nothing to execute. Reactivate it and run the rebind worker. + * This function should be called after submitting a batch to a compute vm. + */ +static inline void xe_vm_reactivate_rebind(struct xe_vm *vm) +{ + if (xe_vm_in_preempt_fence_mode(vm) && vm->preempt.rebind_deactivated) { + vm->preempt.rebind_deactivated = false; + xe_vm_queue_rebind_worker(vm); + } +} + +int xe_vma_userptr_pin_pages(struct xe_vma *vma); + +int xe_vma_userptr_check_repin(struct xe_vma *vma); + +bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end); + +int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id); + +int xe_vm_prepare_vma(struct drm_exec *exec, struct xe_vma *vma, + unsigned int num_shared); + +/** + * xe_vm_resv() - Return's the vm's reservation object + * @vm: The vm + * + * Return: Pointer to the vm's reservation object. + */ +static inline struct dma_resv *xe_vm_resv(struct xe_vm *vm) +{ + return drm_gpuvm_resv(&vm->gpuvm); +} + +/** + * xe_vm_assert_held(vm) - Assert that the vm's reservation object is held. 
+ * @vm: The vm + */ +#define xe_vm_assert_held(vm) dma_resv_assert_held(xe_vm_resv(vm)) + +#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM) +#define vm_dbg drm_dbg +#else +__printf(2, 3) +static inline void vm_dbg(const struct drm_device *dev, + const char *format, ...) +{ /* noop */ } +#endif +#endif diff --git a/drivers/gpu/drm/xe/xe_vm_doc.h b/drivers/gpu/drm/xe/xe_vm_doc.h new file mode 100644 index 000000000000..bdc6659891a5 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_vm_doc.h @@ -0,0 +1,555 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_VM_DOC_H_ +#define _XE_VM_DOC_H_ + +/** + * DOC: XE VM (user address space) + * + * VM creation + * =========== + * + * Allocate a physical page for root of the page table structure, create default + * bind engine, and return a handle to the user. + * + * Scratch page + * ------------ + * + * If the VM is created with the flag, DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE, set the + * entire page table structure defaults pointing to blank page allocated by the + * VM. Invalid memory access rather than fault just read / write to this page. + * + * VM bind (create GPU mapping for a BO or userptr) + * ================================================ + * + * Creates GPU mapings for a BO or userptr within a VM. VM binds uses the same + * in / out fence interface (struct drm_xe_sync) as execs which allows users to + * think of binds and execs as more or less the same operation. + * + * Operations + * ---------- + * + * DRM_XE_VM_BIND_OP_MAP - Create mapping for a BO + * DRM_XE_VM_BIND_OP_UNMAP - Destroy mapping for a BO / userptr + * DRM_XE_VM_BIND_OP_MAP_USERPTR - Create mapping for userptr + * + * Implementation details + * ~~~~~~~~~~~~~~~~~~~~~~ + * + * All bind operations are implemented via a hybrid approach of using the CPU + * and GPU to modify page tables. If a new physical page is allocated in the + * page table structure we populate that page via the CPU and insert that new + * page into the existing page table structure via a GPU job. Also any existing + * pages in the page table structure that need to be modified also are updated + * via the GPU job. As the root physical page is prealloced on VM creation our + * GPU job will always have at least 1 update. The in / out fences are passed to + * this job so again this is conceptually the same as an exec. + * + * Very simple example of few binds on an empty VM with 48 bits of address space + * and the resulting operations: + * + * .. code-block:: + * + * bind BO0 0x0-0x1000 + * alloc page level 3a, program PTE[0] to BO0 phys address (CPU) + * alloc page level 2, program PDE[0] page level 3a phys address (CPU) + * alloc page level 1, program PDE[0] page level 2 phys address (CPU) + * update root PDE[0] to page level 1 phys address (GPU) + * + * bind BO1 0x201000-0x202000 + * alloc page level 3b, program PTE[1] to BO1 phys address (CPU) + * update page level 2 PDE[1] to page level 3b phys address (GPU) + * + * bind BO2 0x1ff000-0x201000 + * update page level 3a PTE[511] to BO2 phys addres (GPU) + * update page level 3b PTE[0] to BO2 phys addres + 0x1000 (GPU) + * + * GPU bypass + * ~~~~~~~~~~ + * + * In the above example the steps using the GPU can be converted to CPU if the + * bind can be done immediately (all in-fences satisfied, VM dma-resv kernel + * slot is idle). + * + * Address space + * ------------- + * + * Depending on platform either 48 or 57 bits of address space is supported. 
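 *
 * As an illustrative sketch only (not lifted from the driver), the check this
 * implies for any bind on a 48-bit VM is that the requested VA range fits
 * entirely below the VM size:
 *
 * .. code-block:: c
 *
 *	// hypothetical helper; vm_size is 1ull << 48 on a 48-bit VM
 *	static bool va_in_range(u64 addr, u64 range, u64 vm_size)
 *	{
 *		return range && range <= vm_size && addr <= vm_size - range;
 *	}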
+ *
+ * Page sizes
+ * ----------
+ *
+ * The minimum page size is either 4k or 64k depending on platform and memory
+ * placement (sysmem vs. VRAM). We enforce that binds must be aligned to the
+ * minimum page size.
+ *
+ * Larger pages (2M or 1GB) can be used for BOs in VRAM, the BO physical address
+ * is aligned to the larger page size, and the VA is aligned to the larger page
+ * size. Larger pages for userptrs / BOs in sysmem should be possible but are not
+ * yet implemented.
+ *
+ * Sync error handling mode
+ * ------------------------
+ *
+ * In both modes during the bind IOCTL the user input is validated. In sync
+ * error handling mode the newly bound BO is validated (potentially moved back
+ * to a region of memory where it can be used), page tables are updated by the
+ * CPU and the job to do the GPU binds is created in the IOCTL itself. This step
+ * can fail due to memory pressure. The user can recover by freeing memory and
+ * trying this operation again.
+ *
+ * Async error handling mode
+ * -------------------------
+ *
+ * In async error handling the steps of validating the BO, updating page tables,
+ * and generating a job are deferred to an async worker. As this step can now
+ * fail after the IOCTL has reported success we need an error handling flow from
+ * which the user can recover.
+ *
+ * The solution is for a user to register a user address with the VM which the
+ * VM uses to report errors to. The ufence wait interface can be used to wait on
+ * a VM going into an error state. Once an error is reported the VM's async
+ * worker is paused. While the VM's async worker is paused, sync
+ * DRM_XE_VM_BIND_OP_UNMAP operations are allowed (this can free memory). Once the
+ * user believes the error state is fixed, the async worker can be resumed via
+ * the XE_VM_BIND_OP_RESTART operation. When VM async bind work is restarted, the
+ * first operation processed is the operation that caused the original error.
+ *
+ * Bind queues / engines
+ * ---------------------
+ *
+ * Think of the case where we have two bind operations A + B which are submitted
+ * in that order. A has in fences while B has none. If using a single bind
+ * queue, B is now blocked on A's in fences even though it is ready to run. This
+ * example is a real use case for VK sparse binding. We work around this
+ * limitation by implementing bind engines.
+ *
+ * In the bind IOCTL the user can optionally pass in an engine ID which must map
+ * to an engine which is of the special class DRM_XE_ENGINE_CLASS_VM_BIND.
+ * Underneath this is really a virtual engine that can run on any of the copy
+ * hardware engines. The job(s) created by each IOCTL are inserted into this
+ * engine's ring. In the example above if A and B have different bind engines B
+ * is free to pass A. If the engine ID field is omitted, the default bind queue
+ * for the VM is used.
+ *
+ * TODO: Explain race in issue 41 and how we solve it
+ *
+ * Array of bind operations
+ * ------------------------
+ *
+ * The uAPI allows multiple bind operations to be passed in via a user array
+ * of struct drm_xe_vm_bind_op in a single VM bind IOCTL. This interface
+ * matches the VK sparse binding API. The implementation is rather simple: parse
+ * the array into a list of operations, pass the in fences to the first operation,
+ * and pass the out fences to the last operation. The ordered nature of a bind
+ * engine makes this possible.
+ *
+ * Munmap semantics for unbinds
+ * ----------------------------
+ *
+ * Munmap allows things like:
+ *
+ * .. code-block::
+ *
+ *	0x0000-0x2000 and 0x3000-0x5000 have mappings
+ *	Munmap 0x1000-0x4000, results in mappings 0x0000-0x1000 and 0x4000-0x5000
+ *
+ * To support this semantic in the above example we decompose the above example
+ * into 4 operations:
+ *
+ * .. code-block::
+ *
+ *	unbind 0x0000-0x2000
+ *	unbind 0x3000-0x5000
+ *	rebind 0x0000-0x1000
+ *	rebind 0x4000-0x5000
+ *
+ * Why not just do a partial unbind of 0x1000-0x2000 and 0x3000-0x4000? This
+ * falls apart when using large pages at the edges and the unbind forces us to
+ * use a smaller page size. For simplicity we always issue a set of unbinds
+ * unmapping anything in the range and at most 2 rebinds on the edges.
+ *
+ * Similar to an array of binds, in fences are passed to the first operation and
+ * out fences are signaled on the last operation.
+ *
+ * In this example there is a window of time where 0x0000-0x1000 and
+ * 0x4000-0x5000 are invalid but the user didn't ask for these addresses to be
+ * removed from the mapping. To work around this we treat any munmap style
+ * unbinds which require a rebind as kernel operations (BO eviction or userptr
+ * invalidation). The first operation waits on the VM's
+ * DMA_RESV_USAGE_PREEMPT_FENCE slots (waits for all pending jobs on the VM to
+ * complete / triggers preempt fences) and the last operation is installed in
+ * the VM's DMA_RESV_USAGE_KERNEL slot (blocks future jobs / compute mode VM
+ * resumes). The caveat is that all dma-resv slots must be updated atomically with
+ * respect to execs and the compute mode rebind worker. To accomplish this, hold
+ * the vm->lock in write mode from the first operation until the last.
+ *
+ * Deferred binds in fault mode
+ * ----------------------------
+ *
+ * If a VM is in fault mode (TODO: link to fault mode), new bind operations that
+ * create mappings are by default deferred to the page fault handler (first
+ * use). This behavior can be overridden by setting the flag
+ * DRM_XE_VM_BIND_FLAG_IMMEDIATE which indicates the mapping should be created
+ * immediately.
+ *
+ * User pointer
+ * ============
+ *
+ * User pointers are user allocated memory (malloc'd, mmap'd, etc.) for which the
+ * user wants to create a GPU mapping. Typically in other DRM drivers a dummy BO
+ * was created and then a binding was created. We bypass creating a dummy BO in
+ * XE and simply create a binding directly from the userptr.
+ *
+ * Invalidation
+ * ------------
+ *
+ * Since this is core kernel managed memory the kernel can move this memory
+ * whenever it wants. We register an invalidation MMU notifier to alert XE when
+ * a user pointer is about to move. The invalidation notifier needs to block
+ * until all pending users (jobs or compute mode engines) of the userptr are
+ * idle to ensure no faults. This is done by waiting on all of the VM's dma-resv
+ * slots.
+ *
+ * Rebinds
+ * -------
+ *
+ * Either the next exec (non-compute) or rebind worker (compute mode) will
+ * rebind the userptr. The invalidation MMU notifier kicks the rebind worker
+ * after the VM dma-resv wait if the VM is in compute mode.
+ *
+ * Compute mode
+ * ============
+ *
+ * A VM in compute mode enables long running workloads and ultra low latency
+ * submission (ULLS). ULLS is implemented via a continuously running batch +
+ * semaphores. This enables the user to insert jump to new batch commands
+ * into the continuously running batch.
In both cases these batches exceed the + * time a dma fence is allowed to exist for before signaling, as such dma fences + * are not used when a VM is in compute mode. User fences (TODO: link user fence + * doc) are used instead to signal operation's completion. + * + * Preempt fences + * -------------- + * + * If the kernel decides to move memory around (either userptr invalidate, BO + * eviction, or mumap style unbind which results in a rebind) and a batch is + * running on an engine, that batch can fault or cause a memory corruption as + * page tables for the moved memory are no longer valid. To work around this we + * introduce the concept of preempt fences. When sw signaling is enabled on a + * preempt fence it tells the submission backend to kick that engine off the + * hardware and the preempt fence signals when the engine is off the hardware. + * Once all preempt fences are signaled for a VM the kernel can safely move the + * memory and kick the rebind worker which resumes all the engines execution. + * + * A preempt fence, for every engine using the VM, is installed the VM's + * dma-resv DMA_RESV_USAGE_PREEMPT_FENCE slot. The same preempt fence, for every + * engine using the VM, is also installed into the same dma-resv slot of every + * external BO mapped in the VM. + * + * Rebind worker + * ------------- + * + * The rebind worker is very similar to an exec. It is resposible for rebinding + * evicted BOs or userptrs, waiting on those operations, installing new preempt + * fences, and finally resuming executing of engines in the VM. + * + * Flow + * ~~~~ + * + * .. code-block:: + * + * <----------------------------------------------------------------------| + * Check if VM is closed, if so bail out | + * Lock VM global lock in read mode | + * Pin userptrs (also finds userptr invalidated since last rebind worker) | + * Lock VM dma-resv and external BOs dma-resv | + * Validate BOs that have been evicted | + * Wait on and allocate new preempt fences for every engine using the VM | + * Rebind invalidated userptrs + evicted BOs | + * Wait on last rebind fence | + * Wait VM's DMA_RESV_USAGE_KERNEL dma-resv slot | + * Install preeempt fences and issue resume for every engine using the VM | + * Check if any userptrs invalidated since pin | + * Squash resume for all engines | + * Unlock all | + * Wait all VM's dma-resv slots | + * Retry ---------------------------------------------------------- + * Release all engines waiting to resume + * Unlock all + * + * Timeslicing + * ----------- + * + * In order to prevent an engine from continuously being kicked off the hardware + * and making no forward progress an engine has a period of time it allowed to + * run after resume before it can be kicked off again. This effectively gives + * each engine a timeslice. + * + * Handling multiple GTs + * ===================== + * + * If a GT has slower access to some regions and the page table structure are in + * the slow region, the performance on that GT could adversely be affected. To + * work around this we allow a VM page tables to be shadowed in multiple GTs. + * When VM is created, a default bind engine and PT table structure are created + * on each GT. + * + * Binds can optionally pass in a mask of GTs where a mapping should be created, + * if this mask is zero then default to all the GTs where the VM has page + * tables. + * + * The implementation for this breaks down into a bunch for_each_gt loops in + * various places plus exporting a composite fence for multi-GT binds to the + * user. 
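 *
 * A minimal sketch of that fan-out, assuming a hypothetical bind_on_tile()
 * helper (the real driver drives this through its PT-update machinery and
 * exports the per-tile fences as the composite fence mentioned above):
 *
 * .. code-block:: c
 *
 *	struct dma_fence *fences[XE_MAX_TILES_PER_DEVICE];
 *	struct xe_tile *tile;
 *	int n = 0;
 *	u8 id;
 *
 *	for_each_tile(tile, xe, id) {
 *		if (!(vma->tile_mask & BIT(id)))
 *			continue;	// mapping not requested on this tile
 *		fences[n++] = bind_on_tile(tile, vma);
 *	}
 *	// the composite fence returned to user space aggregates fences[0..n)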
+ * + * Fault mode (unified shared memory) + * ================================== + * + * A VM in fault mode can be enabled on devices that support page faults. If + * page faults are enabled, using dma fences can potentially induce a deadlock: + * A pending page fault can hold up the GPU work which holds up the dma fence + * signaling, and memory allocation is usually required to resolve a page + * fault, but memory allocation is not allowed to gate dma fence signaling. As + * such, dma fences are not allowed when VM is in fault mode. Because dma-fences + * are not allowed, long running workloads and ULLS are enabled on a faulting + * VM. + * + * Defered VM binds + * ---------------- + * + * By default, on a faulting VM binds just allocate the VMA and the actual + * updating of the page tables is defered to the page fault handler. This + * behavior can be overridden by setting the flag DRM_XE_VM_BIND_FLAG_IMMEDIATE in + * the VM bind which will then do the bind immediately. + * + * Page fault handler + * ------------------ + * + * Page faults are received in the G2H worker under the CT lock which is in the + * path of dma fences (no memory allocations are allowed, faults require memory + * allocations) thus we cannot process faults under the CT lock. Another issue + * is faults issue TLB invalidations which require G2H credits and we cannot + * allocate G2H credits in the G2H handlers without deadlocking. Lastly, we do + * not want the CT lock to be an outer lock of the VM global lock (VM global + * lock required to fault processing). + * + * To work around the above issue with processing faults in the G2H worker, we + * sink faults to a buffer which is large enough to sink all possible faults on + * the GT (1 per hardware engine) and kick a worker to process the faults. Since + * the page faults G2H are already received in a worker, kicking another worker + * adds more latency to a critical performance path. We add a fast path in the + * G2H irq handler which looks at first G2H and if it is a page fault we sink + * the fault to the buffer and kick the worker to process the fault. TLB + * invalidation responses are also in the critical path so these can also be + * processed in this fast path. + * + * Multiple buffers and workers are used and hashed over based on the ASID so + * faults from different VMs can be processed in parallel. + * + * The page fault handler itself is rather simple, flow is below. + * + * .. code-block:: + * + * Lookup VM from ASID in page fault G2H + * Lock VM global lock in read mode + * Lookup VMA from address in page fault G2H + * Check if VMA is valid, if not bail + * Check if VMA's BO has backing store, if not allocate + * <----------------------------------------------------------------------| + * If userptr, pin pages | + * Lock VM & BO dma-resv locks | + * If atomic fault, migrate to VRAM, else validate BO location | + * Issue rebind | + * Wait on rebind to complete | + * Check if userptr invalidated since pin | + * Drop VM & BO dma-resv locks | + * Retry ---------------------------------------------------------- + * Unlock all + * Issue blocking TLB invalidation | + * Send page fault response to GuC + * + * Access counters + * --------------- + * + * Access counters can be configured to trigger a G2H indicating the device is + * accessing VMAs in system memory frequently as hint to migrate those VMAs to + * VRAM. + * + * Same as the page fault handler, access counters G2H cannot be processed the + * G2H worker under the CT lock. 
Again we use a buffer to sink access counter + * G2H. Unlike page faults there is no upper bound so if the buffer is full we + * simply drop the G2H. Access counters are a best case optimization and it is + * safe to drop these unlike page faults. + * + * The access counter handler itself is rather simple flow is below. + * + * .. code-block:: + * + * Lookup VM from ASID in access counter G2H + * Lock VM global lock in read mode + * Lookup VMA from address in access counter G2H + * If userptr, bail nothing to do + * Lock VM & BO dma-resv locks + * Issue migration to VRAM + * Unlock all + * + * Notice no rebind is issued in the access counter handler as the rebind will + * be issued on next page fault. + * + * Cavets with eviction / user pointer invalidation + * ------------------------------------------------ + * + * In the case of eviction and user pointer invalidation on a faulting VM, there + * is no need to issue a rebind rather we just need to blow away the page tables + * for the VMAs and the page fault handler will rebind the VMAs when they fault. + * The cavet is to update / read the page table structure the VM global lock is + * neeeed. In both the case of eviction and user pointer invalidation locks are + * held which make acquiring the VM global lock impossible. To work around this + * every VMA maintains a list of leaf page table entries which should be written + * to zero to blow away the VMA's page tables. After writing zero to these + * entries a blocking TLB invalidate is issued. At this point it is safe for the + * kernel to move the VMA's memory around. This is a necessary lockless + * algorithm and is safe as leafs cannot be changed while either an eviction or + * userptr invalidation is occurring. + * + * Locking + * ======= + * + * VM locking protects all of the core data paths (bind operations, execs, + * evictions, and compute mode rebind worker) in XE. + * + * Locks + * ----- + * + * VM global lock (vm->lock) - rw semaphore lock. Outer most lock which protects + * the list of userptrs mapped in the VM, the list of engines using this VM, and + * the array of external BOs mapped in the VM. When adding or removing any of the + * aforemented state from the VM should acquire this lock in write mode. The VM + * bind path also acquires this lock in write while the exec / compute mode + * rebind worker acquire this lock in read mode. + * + * VM dma-resv lock (vm->ttm.base.resv->lock) - WW lock. Protects VM dma-resv + * slots which is shared with any private BO in the VM. Expected to be acquired + * during VM binds, execs, and compute mode rebind worker. This lock is also + * held when private BOs are being evicted. + * + * external BO dma-resv lock (bo->ttm.base.resv->lock) - WW lock. Protects + * external BO dma-resv slots. Expected to be acquired during VM binds (in + * addition to the VM dma-resv lock). All external BO dma-locks within a VM are + * expected to be acquired (in addition to the VM dma-resv lock) during execs + * and the compute mode rebind worker. This lock is also held when an external + * BO is being evicted. + * + * Putting it all together + * ----------------------- + * + * 1. An exec and bind operation with the same VM can't be executing at the same + * time (vm->lock). + * + * 2. A compute mode rebind worker and bind operation with the same VM can't be + * executing at the same time (vm->lock). + * + * 3. We can't add / remove userptrs or external BOs to a VM while an exec with + * the same VM is executing (vm->lock). + * + * 4. 
+ * Putting it all together
+ * -----------------------
+ *
+ * 1. An exec and bind operation with the same VM can't be executing at the same
+ * time (vm->lock).
+ *
+ * 2. A compute mode rebind worker and bind operation with the same VM can't be
+ * executing at the same time (vm->lock).
+ *
+ * 3. We can't add / remove userptrs or external BOs to a VM while an exec with
+ * the same VM is executing (vm->lock).
+ *
+ * 4. We can't add / remove userptrs, external BOs, or engines to a VM while a
+ * compute mode rebind worker with the same VM is executing (vm->lock).
+ *
+ * 5. Evictions within a VM can't happen while an exec with the same VM is
+ * executing (dma-resv locks).
+ *
+ * 6. Evictions within a VM can't happen while a compute mode rebind worker
+ * with the same VM is executing (dma-resv locks).
+ *
+ * dma-resv usage
+ * ==============
+ *
+ * As previously stated, to enforce the ordering of kernel ops (eviction, userptr
+ * invalidation, munmap style unbinds which result in a rebind), rebinds during
+ * execs, execs, and resumes in the rebind worker, we use both the VM's and the
+ * external BOs' dma-resv slots. Let's try to make this as clear as possible.
+ *
+ * Slot installation
+ * -----------------
+ *
+ * 1. Jobs from kernel ops install themselves into the DMA_RESV_USAGE_KERNEL
+ * slot of either an external BO or VM (depends on if the kernel op is operating
+ * on an external or private BO)
+ *
+ * 2. In non-compute mode, jobs from execs install themselves into the
+ * DMA_RESV_USAGE_BOOKKEEP slot of the VM
+ *
+ * 3. In non-compute mode, jobs from execs install themselves into the
+ * DMA_RESV_USAGE_WRITE slot of all external BOs in the VM
+ *
+ * 4. Jobs from binds install themselves into the DMA_RESV_USAGE_BOOKKEEP slot
+ * of the VM
+ *
+ * 5. Jobs from binds install themselves into the DMA_RESV_USAGE_BOOKKEEP slot
+ * of the external BO (if the bind is to an external BO, this is in addition to #4)
+ *
+ * 6. Every engine using a compute mode VM has a preempt fence installed into
+ * the DMA_RESV_USAGE_PREEMPT_FENCE slot of the VM
+ *
+ * 7. Every engine using a compute mode VM has a preempt fence installed into
+ * the DMA_RESV_USAGE_PREEMPT_FENCE slot of all the external BOs in the VM
+ *
+ * Slot waiting
+ * ------------
+ *
+ * 1. The execution of all jobs from kernel ops shall wait on all slots
+ * (DMA_RESV_USAGE_PREEMPT_FENCE) of either an external BO or VM (depends on if
+ * the kernel op is operating on an external or private BO)
+ *
+ * 2. In non-compute mode, the execution of all jobs from rebinds in execs shall
+ * wait on the DMA_RESV_USAGE_KERNEL slot of either an external BO or VM
+ * (depends on if the rebind is operating on an external or private BO)
+ *
+ * 3. In non-compute mode, the execution of all jobs from execs shall wait on
+ * the last rebind job
+ *
+ * 4. In compute mode, the execution of all jobs from rebinds in the rebind
+ * worker shall wait on the DMA_RESV_USAGE_KERNEL slot of either an external BO
+ * or VM (depends on if the rebind is operating on an external or private BO)
+ *
+ * 5. In compute mode, resumes in the rebind worker shall wait on the last
+ * rebind fence
+ *
+ * 6. In compute mode, resumes in the rebind worker shall wait on the
+ * DMA_RESV_USAGE_KERNEL slot of the VM
+ *
+ * Putting it all together
+ * -----------------------
+ *
+ * 1. New jobs from kernel ops are blocked behind any existing jobs from
+ * non-compute mode execs
+ *
+ * 2. New jobs from non-compute mode execs are blocked behind any existing jobs
+ * from kernel ops and rebinds
+ *
+ * 3. New jobs from kernel ops are blocked behind all preempt fences signaling in
+ * compute mode
+ *
+ * 4. Compute mode engine resumes are blocked behind any existing jobs from
+ * kernel ops and rebinds
+ *
+ * Future work
+ * ===========
+ *
+ * Support large pages for sysmem and userptr.
+ *
+ * Update page faults to handle BOs at page-level granularity (e.g.
part of BO + * could be in system memory while another part could be in VRAM). + * + * Page fault handler likely we be optimized a bit more (e.g. Rebinds always + * wait on the dma-resv kernel slots of VM or BO, technically we only have to + * wait the BO moving. If using a job to do the rebind, we could not block in + * the page fault handler rather attach a callback to fence of the rebind job to + * signal page fault complete. Our handling of short circuting for atomic faults + * for bound VMAs could be better. etc...). We can tune all of this once we have + * benchmarks / performance number from workloads up and running. + */ + +#endif diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h new file mode 100644 index 000000000000..63e8a50b88e9 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -0,0 +1,373 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_VM_TYPES_H_ +#define _XE_VM_TYPES_H_ + +#include <drm/drm_gpuvm.h> + +#include <linux/dma-resv.h> +#include <linux/kref.h> +#include <linux/mmu_notifier.h> +#include <linux/scatterlist.h> + +#include "xe_device_types.h" +#include "xe_pt_types.h" +#include "xe_range_fence.h" + +struct xe_bo; +struct xe_sync_entry; +struct xe_vm; + +#define TEST_VM_ASYNC_OPS_ERROR +#define FORCE_ASYNC_OP_ERROR BIT(31) + +#define XE_VMA_READ_ONLY DRM_GPUVA_USERBITS +#define XE_VMA_DESTROYED (DRM_GPUVA_USERBITS << 1) +#define XE_VMA_ATOMIC_PTE_BIT (DRM_GPUVA_USERBITS << 2) +#define XE_VMA_FIRST_REBIND (DRM_GPUVA_USERBITS << 3) +#define XE_VMA_LAST_REBIND (DRM_GPUVA_USERBITS << 4) +#define XE_VMA_PTE_4K (DRM_GPUVA_USERBITS << 5) +#define XE_VMA_PTE_2M (DRM_GPUVA_USERBITS << 6) +#define XE_VMA_PTE_1G (DRM_GPUVA_USERBITS << 7) + +/** struct xe_userptr - User pointer */ +struct xe_userptr { + /** @invalidate_link: Link for the vm::userptr.invalidated list */ + struct list_head invalidate_link; + /** + * @notifier: MMU notifier for user pointer (invalidation call back) + */ + struct mmu_interval_notifier notifier; + /** @sgt: storage for a scatter gather table */ + struct sg_table sgt; + /** @sg: allocated scatter gather table */ + struct sg_table *sg; + /** @notifier_seq: notifier sequence number */ + unsigned long notifier_seq; + /** + * @initial_bind: user pointer has been bound at least once. + * write: vm->userptr.notifier_lock in read mode and vm->resv held. + * read: vm->userptr.notifier_lock in write mode or vm->resv held. + */ + bool initial_bind; +#if IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) + u32 divisor; +#endif +}; + +struct xe_vma { + /** @gpuva: Base GPUVA object */ + struct drm_gpuva gpuva; + + /** + * @combined_links: links into lists which are mutually exclusive. + * Locking: vm lock in write mode OR vm lock in read mode and the vm's + * resv. + */ + union { + /** @userptr: link into VM repin list if userptr. */ + struct list_head userptr; + /** @rebind: link into VM if this VMA needs rebinding. */ + struct list_head rebind; + /** @destroy: link to contested list when VM is being closed. 
*/ + struct list_head destroy; + } combined_links; + + union { + /** @destroy_cb: callback to destroy VMA when unbind job is done */ + struct dma_fence_cb destroy_cb; + /** @destroy_work: worker to destroy this BO */ + struct work_struct destroy_work; + }; + + /** @usm: unified shared memory state */ + struct { + /** @tile_invalidated: VMA has been invalidated */ + u8 tile_invalidated; + } usm; + + /** @tile_mask: Tile mask of where to create binding for this VMA */ + u8 tile_mask; + + /** + * @tile_present: GT mask of binding are present for this VMA. + * protected by vm->lock, vm->resv and for userptrs, + * vm->userptr.notifier_lock for writing. Needs either for reading, + * but if reading is done under the vm->lock only, it needs to be held + * in write mode. + */ + u8 tile_present; + + /** + * @pat_index: The pat index to use when encoding the PTEs for this vma. + */ + u16 pat_index; + + /** + * @userptr: user pointer state, only allocated for VMAs that are + * user pointers + */ + struct xe_userptr userptr; +}; + +struct xe_device; + +struct xe_vm { + /** @gpuvm: base GPUVM used to track VMAs */ + struct drm_gpuvm gpuvm; + + struct xe_device *xe; + + /* exec queue used for (un)binding vma's */ + struct xe_exec_queue *q[XE_MAX_TILES_PER_DEVICE]; + + /** @lru_bulk_move: Bulk LRU move list for this VM's BOs */ + struct ttm_lru_bulk_move lru_bulk_move; + + u64 size; + + struct xe_pt *pt_root[XE_MAX_TILES_PER_DEVICE]; + struct xe_pt *scratch_pt[XE_MAX_TILES_PER_DEVICE][XE_VM_MAX_LEVEL]; + + /** + * @flags: flags for this VM, statically setup a creation time aside + * from XE_VM_FLAG_BANNED which requires vm->lock to set / read safely + */ +#define XE_VM_FLAG_64K BIT(0) +#define XE_VM_FLAG_LR_MODE BIT(1) +#define XE_VM_FLAG_MIGRATION BIT(2) +#define XE_VM_FLAG_SCRATCH_PAGE BIT(3) +#define XE_VM_FLAG_FAULT_MODE BIT(4) +#define XE_VM_FLAG_BANNED BIT(5) +#define XE_VM_FLAG_TILE_ID(flags) FIELD_GET(GENMASK(7, 6), flags) +#define XE_VM_FLAG_SET_TILE_ID(tile) FIELD_PREP(GENMASK(7, 6), (tile)->id) + unsigned long flags; + + /** @composite_fence_ctx: context composite fence */ + u64 composite_fence_ctx; + /** @composite_fence_seqno: seqno for composite fence */ + u32 composite_fence_seqno; + + /** + * @lock: outer most lock, protects objects of anything attached to this + * VM + */ + struct rw_semaphore lock; + + /** + * @rebind_list: list of VMAs that need rebinding. Protected by the + * vm->lock in write mode, OR (the vm->lock in read mode and the + * vm resv). + */ + struct list_head rebind_list; + + /** @rebind_fence: rebind fence from execbuf */ + struct dma_fence *rebind_fence; + + /** + * @destroy_work: worker to destroy VM, needed as a dma_fence signaling + * from an irq context can be last put and the destroy needs to be able + * to sleep. + */ + struct work_struct destroy_work; + + /** + * @rftree: range fence tree to track updates to page table structure. + * Used to implement conflict tracking between independent bind engines. 
+ */ + struct xe_range_fence_tree rftree[XE_MAX_TILES_PER_DEVICE]; + + /** @async_ops: async VM operations (bind / unbinds) */ + struct { + /** @list: list of pending async VM ops */ + struct list_head pending; + /** @work: worker to execute async VM ops */ + struct work_struct work; + /** @lock: protects list of pending async VM ops and fences */ + spinlock_t lock; + /** @fence: fence state */ + struct { + /** @context: context of async fence */ + u64 context; + /** @seqno: seqno of async fence */ + u32 seqno; + } fence; + /** @error: error state for async VM ops */ + int error; + /** + * @munmap_rebind_inflight: an munmap style VM bind is in the + * middle of a set of ops which requires a rebind at the end. + */ + bool munmap_rebind_inflight; + } async_ops; + + const struct xe_pt_ops *pt_ops; + + /** @userptr: user pointer state */ + struct { + /** + * @userptr.repin_list: list of VMAs which are user pointers, + * and needs repinning. Protected by @lock. + */ + struct list_head repin_list; + /** + * @notifier_lock: protects notifier in write mode and + * submission in read mode. + */ + struct rw_semaphore notifier_lock; + /** + * @userptr.invalidated_lock: Protects the + * @userptr.invalidated list. + */ + spinlock_t invalidated_lock; + /** + * @userptr.invalidated: List of invalidated userptrs, not yet + * picked + * up for revalidation. Protected from access with the + * @invalidated_lock. Removing items from the list + * additionally requires @lock in write mode, and adding + * items to the list requires the @userptr.notifer_lock in + * write mode. + */ + struct list_head invalidated; + } userptr; + + /** @preempt: preempt state */ + struct { + /** + * @min_run_period_ms: The minimum run period before preempting + * an engine again + */ + s64 min_run_period_ms; + /** @exec_queues: list of exec queues attached to this VM */ + struct list_head exec_queues; + /** @num_exec_queues: number exec queues attached to this VM */ + int num_exec_queues; + /** + * @rebind_deactivated: Whether rebind has been temporarily deactivated + * due to no work available. Protected by the vm resv. + */ + bool rebind_deactivated; + /** + * @rebind_work: worker to rebind invalidated userptrs / evicted + * BOs + */ + struct work_struct rebind_work; + } preempt; + + /** @um: unified memory state */ + struct { + /** @asid: address space ID, unique to each VM */ + u32 asid; + /** + * @last_fault_vma: Last fault VMA, used for fast lookup when we + * get a flood of faults to the same VMA + */ + struct xe_vma *last_fault_vma; + } usm; + + /** @error_capture: allow to track errors */ + struct { + /** @capture_once: capture only one error per VM */ + bool capture_once; + } error_capture; + + /** @batch_invalidate_tlb: Always invalidate TLB before batch start */ + bool batch_invalidate_tlb; + /** @xef: XE file handle for tracking this VM's drm client */ + struct xe_file *xef; +}; + +/** struct xe_vma_op_map - VMA map operation */ +struct xe_vma_op_map { + /** @vma: VMA to map */ + struct xe_vma *vma; + /** @immediate: Immediate bind */ + bool immediate; + /** @read_only: Read only */ + bool read_only; + /** @is_null: is NULL binding */ + bool is_null; + /** @pat_index: The pat index to use for this operation. 
*/ + u16 pat_index; +}; + +/** struct xe_vma_op_remap - VMA remap operation */ +struct xe_vma_op_remap { + /** @prev: VMA preceding part of a split mapping */ + struct xe_vma *prev; + /** @next: VMA subsequent part of a split mapping */ + struct xe_vma *next; + /** @start: start of the VMA unmap */ + u64 start; + /** @range: range of the VMA unmap */ + u64 range; + /** @skip_prev: skip prev rebind */ + bool skip_prev; + /** @skip_next: skip next rebind */ + bool skip_next; + /** @unmap_done: unmap operation in done */ + bool unmap_done; +}; + +/** struct xe_vma_op_prefetch - VMA prefetch operation */ +struct xe_vma_op_prefetch { + /** @region: memory region to prefetch to */ + u32 region; +}; + +/** enum xe_vma_op_flags - flags for VMA operation */ +enum xe_vma_op_flags { + /** @XE_VMA_OP_FIRST: first VMA operation for a set of syncs */ + XE_VMA_OP_FIRST = BIT(0), + /** @XE_VMA_OP_LAST: last VMA operation for a set of syncs */ + XE_VMA_OP_LAST = BIT(1), + /** @XE_VMA_OP_COMMITTED: VMA operation committed */ + XE_VMA_OP_COMMITTED = BIT(2), + /** @XE_VMA_OP_PREV_COMMITTED: Previous VMA operation committed */ + XE_VMA_OP_PREV_COMMITTED = BIT(3), + /** @XE_VMA_OP_NEXT_COMMITTED: Next VMA operation committed */ + XE_VMA_OP_NEXT_COMMITTED = BIT(4), +}; + +/** struct xe_vma_op - VMA operation */ +struct xe_vma_op { + /** @base: GPUVA base operation */ + struct drm_gpuva_op base; + /** + * @ops: GPUVA ops, when set call drm_gpuva_ops_free after this + * operations is processed + */ + struct drm_gpuva_ops *ops; + /** @q: exec queue for this operation */ + struct xe_exec_queue *q; + /** + * @syncs: syncs for this operation, only used on first and last + * operation + */ + struct xe_sync_entry *syncs; + /** @num_syncs: number of syncs */ + u32 num_syncs; + /** @link: async operation link */ + struct list_head link; + /** @flags: operation flags */ + enum xe_vma_op_flags flags; + +#ifdef TEST_VM_ASYNC_OPS_ERROR + /** @inject_error: inject error to test async op error handling */ + bool inject_error; +#endif + + union { + /** @map: VMA map operation specific data */ + struct xe_vma_op_map map; + /** @remap: VMA remap operation specific data */ + struct xe_vma_op_remap remap; + /** @prefetch: VMA prefetch operation specific data */ + struct xe_vma_op_prefetch prefetch; + }; +}; +#endif diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c new file mode 100644 index 000000000000..5f61dd87c586 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_wa.c @@ -0,0 +1,895 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_wa.h" + +#include <drm/drm_managed.h> +#include <kunit/visibility.h> +#include <linux/compiler_types.h> + +#include "generated/xe_wa_oob.h" +#include "regs/xe_engine_regs.h" +#include "regs/xe_gt_regs.h" +#include "regs/xe_regs.h" +#include "xe_device_types.h" +#include "xe_force_wake.h" +#include "xe_gt.h" +#include "xe_hw_engine_types.h" +#include "xe_mmio.h" +#include "xe_platform_types.h" +#include "xe_rtp.h" +#include "xe_step.h" + +/** + * DOC: Hardware workarounds + * + * Hardware workarounds are register programming documented to be executed in + * the driver that fall outside of the normal programming sequences for a + * platform. There are some basic categories of workarounds, depending on + * how/when they are applied: + * + * - LRC workarounds: workarounds that touch registers that are + * saved/restored to/from the HW context image. 
The list is emitted (via Load + * Register Immediate commands) once when initializing the device and saved in + * the default context. That default context is then used on every context + * creation to have a "primed golden context", i.e. a context image that + * already contains the changes needed to all the registers. + * + * - Engine workarounds: the list of these WAs is applied whenever the specific + * engine is reset. It's also possible that a set of engine classes share a + * common power domain and they are reset together. This happens on some + * platforms with render and compute engines. In this case (at least) one of + * them need to keeep the workaround programming: the approach taken in the + * driver is to tie those workarounds to the first compute/render engine that + * is registered. When executing with GuC submission, engine resets are + * outside of kernel driver control, hence the list of registers involved in + * written once, on engine initialization, and then passed to GuC, that + * saves/restores their values before/after the reset takes place. See + * ``drivers/gpu/drm/xe/xe_guc_ads.c`` for reference. + * + * - GT workarounds: the list of these WAs is applied whenever these registers + * revert to their default values: on GPU reset, suspend/resume [1]_, etc. + * + * - Register whitelist: some workarounds need to be implemented in userspace, + * but need to touch privileged registers. The whitelist in the kernel + * instructs the hardware to allow the access to happen. From the kernel side, + * this is just a special case of a MMIO workaround (as we write the list of + * these to/be-whitelisted registers to some special HW registers). + * + * - Workaround batchbuffers: buffers that get executed automatically by the + * hardware on every HW context restore. These buffers are created and + * programmed in the default context so the hardware always go through those + * programming sequences when switching contexts. The support for workaround + * batchbuffers is enabled these hardware mechanisms: + * + * #. INDIRECT_CTX: A batchbuffer and an offset are provided in the default + * context, pointing the hardware to jump to that location when that offset + * is reached in the context restore. Workaround batchbuffer in the driver + * currently uses this mechanism for all platforms. + * + * #. BB_PER_CTX_PTR: A batchbuffer is provided in the default context, + * pointing the hardware to a buffer to continue executing after the + * engine registers are restored in a context restore sequence. This is + * currently not used in the driver. + * + * - Other/OOB: There are WAs that, due to their nature, cannot be applied from + * a central place. Those are peppered around the rest of the code, as needed. + * Workarounds related to the display IP are the main example. + * + * .. [1] Technically, some registers are powercontext saved & restored, so they + * survive a suspend/resume. In practice, writing them again is not too + * costly and simplifies things, so it's the approach taken in the driver. + * + * .. note:: + * Hardware workarounds in xe work the same way as in i915, with the + * difference of how they are maintained in the code. In xe it uses the + * xe_rtp infrastructure so the workarounds can be kept in tables, following + * a more declarative approach rather than procedural. + */ + +#undef XE_REG_MCR +#define XE_REG_MCR(...) 
XE_REG(__VA_ARGS__, .mcr = 1) + +__diag_push(); +__diag_ignore_all("-Woverride-init", "Allow field overrides in table"); + +static const struct xe_rtp_entry_sr gt_was[] = { + { XE_RTP_NAME("14011060649"), + XE_RTP_RULES(MEDIA_VERSION_RANGE(1200, 1255), + ENGINE_CLASS(VIDEO_DECODE), + FUNC(xe_rtp_match_even_instance)), + XE_RTP_ACTIONS(SET(VDBOX_CGCTL3F10(0), IECPUNIT_CLKGATE_DIS)), + XE_RTP_ENTRY_FLAG(FOREACH_ENGINE), + }, + { XE_RTP_NAME("14011059788"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1210)), + XE_RTP_ACTIONS(SET(DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE)) + }, + { XE_RTP_NAME("14015795083"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1260)), + XE_RTP_ACTIONS(CLR(MISCCPCTL, DOP_CLOCK_GATE_RENDER_ENABLE)) + }, + + /* DG1 */ + + { XE_RTP_NAME("1409420604"), + XE_RTP_RULES(PLATFORM(DG1)), + XE_RTP_ACTIONS(SET(SUBSLICE_UNIT_LEVEL_CLKGATE2, CPSSUNIT_CLKGATE_DIS)) + }, + { XE_RTP_NAME("1408615072"), + XE_RTP_RULES(PLATFORM(DG1)), + XE_RTP_ACTIONS(SET(UNSLICE_UNIT_LEVEL_CLKGATE2, VSUNIT_CLKGATE2_DIS)) + }, + + /* DG2 */ + + { XE_RTP_NAME("16010515920"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), + GRAPHICS_STEP(A0, B0), + ENGINE_CLASS(VIDEO_DECODE)), + XE_RTP_ACTIONS(SET(VDBOX_CGCTL3F18(0), ALNUNIT_CLKGATE_DIS)), + XE_RTP_ENTRY_FLAG(FOREACH_ENGINE), + }, + { XE_RTP_NAME("22010523718"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10)), + XE_RTP_ACTIONS(SET(UNSLICE_UNIT_LEVEL_CLKGATE, CG3DDISCFEG_CLKGATE_DIS)) + }, + { XE_RTP_NAME("14011006942"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10)), + XE_RTP_ACTIONS(SET(SUBSLICE_UNIT_LEVEL_CLKGATE, DSS_ROUTER_CLKGATE_DIS)) + }, + { XE_RTP_NAME("14012362059"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, B0)), + XE_RTP_ACTIONS(SET(XEHP_MERT_MOD_CTRL, FORCE_MISS_FTLB)) + }, + { XE_RTP_NAME("14012362059"), + XE_RTP_RULES(SUBPLATFORM(DG2, G11), GRAPHICS_STEP(A0, B0)), + XE_RTP_ACTIONS(SET(XEHP_MERT_MOD_CTRL, FORCE_MISS_FTLB)) + }, + { XE_RTP_NAME("14010948348"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, B0)), + XE_RTP_ACTIONS(SET(UNSLCGCTL9430, MSQDUNIT_CLKGATE_DIS)) + }, + { XE_RTP_NAME("14011037102"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, B0)), + XE_RTP_ACTIONS(SET(UNSLCGCTL9444, LTCDD_CLKGATE_DIS)) + }, + { XE_RTP_NAME("14011371254"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, B0)), + XE_RTP_ACTIONS(SET(XEHP_SLICE_UNIT_LEVEL_CLKGATE, NODEDSS_CLKGATE_DIS)) + }, + { XE_RTP_NAME("14011431319"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, B0)), + XE_RTP_ACTIONS(SET(UNSLCGCTL9440, + GAMTLBOACS_CLKGATE_DIS | + GAMTLBVDBOX7_CLKGATE_DIS | GAMTLBVDBOX6_CLKGATE_DIS | + GAMTLBVDBOX5_CLKGATE_DIS | GAMTLBVDBOX4_CLKGATE_DIS | + GAMTLBVDBOX3_CLKGATE_DIS | GAMTLBVDBOX2_CLKGATE_DIS | + GAMTLBVDBOX1_CLKGATE_DIS | GAMTLBVDBOX0_CLKGATE_DIS | + GAMTLBKCR_CLKGATE_DIS | GAMTLBGUC_CLKGATE_DIS | + GAMTLBBLT_CLKGATE_DIS), + SET(UNSLCGCTL9444, + GAMTLBGFXA0_CLKGATE_DIS | GAMTLBGFXA1_CLKGATE_DIS | + GAMTLBCOMPA0_CLKGATE_DIS | GAMTLBCOMPA1_CLKGATE_DIS | + GAMTLBCOMPB0_CLKGATE_DIS | GAMTLBCOMPB1_CLKGATE_DIS | + GAMTLBCOMPC0_CLKGATE_DIS | GAMTLBCOMPC1_CLKGATE_DIS | + GAMTLBCOMPD0_CLKGATE_DIS | GAMTLBCOMPD1_CLKGATE_DIS | + GAMTLBMERT_CLKGATE_DIS | + GAMTLBVEBOX3_CLKGATE_DIS | GAMTLBVEBOX2_CLKGATE_DIS | + GAMTLBVEBOX1_CLKGATE_DIS | GAMTLBVEBOX0_CLKGATE_DIS)) + }, + { XE_RTP_NAME("14010569222"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, B0)), + XE_RTP_ACTIONS(SET(UNSLICE_UNIT_LEVEL_CLKGATE, GAMEDIA_CLKGATE_DIS)) + }, + { XE_RTP_NAME("14011028019"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, B0)), + 
XE_RTP_ACTIONS(SET(SSMCGCTL9530, RTFUNIT_CLKGATE_DIS)) + }, + { XE_RTP_NAME("14010680813"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, B0)), + XE_RTP_ACTIONS(SET(XEHP_GAMSTLB_CTRL, + CONTROL_BLOCK_CLKGATE_DIS | + EGRESS_BLOCK_CLKGATE_DIS | + TAG_BLOCK_CLKGATE_DIS)) + }, + { XE_RTP_NAME("14014830051"), + XE_RTP_RULES(PLATFORM(DG2)), + XE_RTP_ACTIONS(CLR(SARB_CHICKEN1, COMP_CKN_IN)) + }, + { XE_RTP_NAME("18018781329"), + XE_RTP_RULES(PLATFORM(DG2)), + XE_RTP_ACTIONS(SET(RENDER_MOD_CTRL, FORCE_MISS_FTLB), + SET(COMP_MOD_CTRL, FORCE_MISS_FTLB), + SET(XEHP_VDBX_MOD_CTRL, FORCE_MISS_FTLB), + SET(XEHP_VEBX_MOD_CTRL, FORCE_MISS_FTLB)) + }, + { XE_RTP_NAME("1509235366"), + XE_RTP_RULES(PLATFORM(DG2)), + XE_RTP_ACTIONS(SET(XEHP_GAMCNTRL_CTRL, + INVALIDATION_BROADCAST_MODE_DIS | + GLOBAL_INVALIDATION_MODE)) + }, + { XE_RTP_NAME("14010648519"), + XE_RTP_RULES(PLATFORM(DG2)), + XE_RTP_ACTIONS(SET(XEHP_L3NODEARBCFG, XEHP_LNESPARE)) + }, + + /* PVC */ + + { XE_RTP_NAME("18018781329"), + XE_RTP_RULES(PLATFORM(PVC)), + XE_RTP_ACTIONS(SET(RENDER_MOD_CTRL, FORCE_MISS_FTLB), + SET(COMP_MOD_CTRL, FORCE_MISS_FTLB), + SET(XEHP_VDBX_MOD_CTRL, FORCE_MISS_FTLB), + SET(XEHP_VEBX_MOD_CTRL, FORCE_MISS_FTLB)) + }, + { XE_RTP_NAME("16016694945"), + XE_RTP_RULES(PLATFORM(PVC)), + XE_RTP_ACTIONS(SET(XEHPC_LNCFMISCCFGREG0, XEHPC_OVRLSCCC)) + }, + + /* Xe_LPG */ + + { XE_RTP_NAME("14015795083"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1270, 1271), GRAPHICS_STEP(A0, B0)), + XE_RTP_ACTIONS(CLR(MISCCPCTL, DOP_CLOCK_GATE_RENDER_ENABLE)) + }, + { XE_RTP_NAME("14018575942"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1270, 1271)), + XE_RTP_ACTIONS(SET(COMP_MOD_CTRL, FORCE_MISS_FTLB)) + }, + { XE_RTP_NAME("22016670082"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1270, 1271)), + XE_RTP_ACTIONS(SET(SQCNT1, ENFORCE_RAR)) + }, + + /* Xe_LPM+ */ + + { XE_RTP_NAME("16021867713"), + XE_RTP_RULES(MEDIA_VERSION(1300), + ENGINE_CLASS(VIDEO_DECODE)), + XE_RTP_ACTIONS(SET(VDBOX_CGCTL3F1C(0), MFXPIPE_CLKGATE_DIS)), + XE_RTP_ENTRY_FLAG(FOREACH_ENGINE), + }, + { XE_RTP_NAME("22016670082"), + XE_RTP_RULES(MEDIA_VERSION(1300)), + XE_RTP_ACTIONS(SET(XELPMP_SQCNT1, ENFORCE_RAR)) + }, + + /* Xe2_LPG */ + + { XE_RTP_NAME("16020975621"), + XE_RTP_RULES(GRAPHICS_VERSION(2004), GRAPHICS_STEP(A0, B0)), + XE_RTP_ACTIONS(SET(XEHP_SLICE_UNIT_LEVEL_CLKGATE, SBEUNIT_CLKGATE_DIS)) + }, + { XE_RTP_NAME("14018157293"), + XE_RTP_RULES(GRAPHICS_VERSION(2004), GRAPHICS_STEP(A0, B0)), + XE_RTP_ACTIONS(SET(XEHPC_L3CLOS_MASK(0), ~0), + SET(XEHPC_L3CLOS_MASK(1), ~0), + SET(XEHPC_L3CLOS_MASK(2), ~0), + SET(XEHPC_L3CLOS_MASK(3), ~0)) + }, + + /* Xe2_LPM */ + + { XE_RTP_NAME("14017421178"), + XE_RTP_RULES(MEDIA_VERSION(2000), + ENGINE_CLASS(VIDEO_DECODE)), + XE_RTP_ACTIONS(SET(VDBOX_CGCTL3F10(0), IECPUNIT_CLKGATE_DIS)), + XE_RTP_ENTRY_FLAG(FOREACH_ENGINE), + }, + { XE_RTP_NAME("16021867713"), + XE_RTP_RULES(MEDIA_VERSION(2000), + ENGINE_CLASS(VIDEO_DECODE)), + XE_RTP_ACTIONS(SET(VDBOX_CGCTL3F1C(0), MFXPIPE_CLKGATE_DIS)), + XE_RTP_ENTRY_FLAG(FOREACH_ENGINE), + }, + { XE_RTP_NAME("14019449301"), + XE_RTP_RULES(MEDIA_VERSION(2000), ENGINE_CLASS(VIDEO_DECODE)), + XE_RTP_ACTIONS(SET(VDBOX_CGCTL3F08(0), CG3DDISHRS_CLKGATE_DIS)), + XE_RTP_ENTRY_FLAG(FOREACH_ENGINE), + }, + + {} +}; + +static const struct xe_rtp_entry_sr engine_was[] = { + { XE_RTP_NAME("22010931296, 18011464164, 14010919138"), + XE_RTP_RULES(GRAPHICS_VERSION(1200), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(FF_THREAD_MODE(RENDER_RING_BASE), + FF_TESSELATION_DOP_GATE_DISABLE)) + }, + { 
XE_RTP_NAME("1409804808"), + XE_RTP_RULES(GRAPHICS_VERSION(1200), + ENGINE_CLASS(RENDER), + IS_INTEGRATED), + XE_RTP_ACTIONS(SET(ROW_CHICKEN2, PUSH_CONST_DEREF_HOLD_DIS)) + }, + { XE_RTP_NAME("14010229206, 1409085225"), + XE_RTP_RULES(GRAPHICS_VERSION(1200), + ENGINE_CLASS(RENDER), + IS_INTEGRATED), + XE_RTP_ACTIONS(SET(ROW_CHICKEN4, DISABLE_TDL_PUSH)) + }, + { XE_RTP_NAME("1606931601"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1210), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(ROW_CHICKEN2, DISABLE_EARLY_READ)) + }, + { XE_RTP_NAME("14010826681, 1606700617, 22010271021, 18019627453"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1255), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(CS_DEBUG_MODE1(RENDER_RING_BASE), + FF_DOP_CLOCK_GATE_DISABLE)) + }, + { XE_RTP_NAME("1406941453"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1210), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(SAMPLER_MODE, ENABLE_SMALLPL)) + }, + { XE_RTP_NAME("FtrPerCtxtPreemptionGranularityControl"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1250), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(FF_SLICE_CS_CHICKEN1(RENDER_RING_BASE), + FFSC_PERCTX_PREEMPT_CTRL)) + }, + + /* TGL */ + + { XE_RTP_NAME("1607297627, 1607030317, 1607186500"), + XE_RTP_RULES(PLATFORM(TIGERLAKE), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(RING_PSMI_CTL(RENDER_RING_BASE), + WAIT_FOR_EVENT_POWER_DOWN_DISABLE | + RC_SEMA_IDLE_MSG_DISABLE)) + }, + + /* RKL */ + + { XE_RTP_NAME("1607297627, 1607030317, 1607186500"), + XE_RTP_RULES(PLATFORM(ROCKETLAKE), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(RING_PSMI_CTL(RENDER_RING_BASE), + WAIT_FOR_EVENT_POWER_DOWN_DISABLE | + RC_SEMA_IDLE_MSG_DISABLE)) + }, + + /* ADL-P */ + + { XE_RTP_NAME("1607297627, 1607030317, 1607186500"), + XE_RTP_RULES(PLATFORM(ALDERLAKE_P), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(RING_PSMI_CTL(RENDER_RING_BASE), + WAIT_FOR_EVENT_POWER_DOWN_DISABLE | + RC_SEMA_IDLE_MSG_DISABLE)) + }, + + /* DG2 */ + + { XE_RTP_NAME("22013037850"), + XE_RTP_RULES(PLATFORM(DG2), FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(LSC_CHICKEN_BIT_0_UDW, + DISABLE_128B_EVICTION_COMMAND_UDW)) + }, + { XE_RTP_NAME("22014226127"), + XE_RTP_RULES(PLATFORM(DG2), FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(LSC_CHICKEN_BIT_0, DISABLE_D8_D16_COASLESCE)) + }, + { XE_RTP_NAME("18017747507"), + XE_RTP_RULES(PLATFORM(DG2), FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(VFG_PREEMPTION_CHICKEN, + POLYGON_TRIFAN_LINELOOP_DISABLE)) + }, + { XE_RTP_NAME("22012826095, 22013059131"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(B0, C0), + FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(FIELD_SET(LSC_CHICKEN_BIT_0_UDW, + MAXREQS_PER_BANK, + REG_FIELD_PREP(MAXREQS_PER_BANK, 2))) + }, + { XE_RTP_NAME("22012826095, 22013059131"), + XE_RTP_RULES(SUBPLATFORM(DG2, G11), + FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(FIELD_SET(LSC_CHICKEN_BIT_0_UDW, + MAXREQS_PER_BANK, + REG_FIELD_PREP(MAXREQS_PER_BANK, 2))) + }, + { XE_RTP_NAME("22013059131"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(B0, C0), + FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(LSC_CHICKEN_BIT_0, FORCE_1_SUB_MESSAGE_PER_FRAGMENT)) + }, + { XE_RTP_NAME("22013059131"), + XE_RTP_RULES(SUBPLATFORM(DG2, G11), + FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(LSC_CHICKEN_BIT_0, FORCE_1_SUB_MESSAGE_PER_FRAGMENT)) + }, + { XE_RTP_NAME("14010918519"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, B0), + 
FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(LSC_CHICKEN_BIT_0_UDW, + FORCE_SLM_FENCE_SCOPE_TO_TILE | + FORCE_UGM_FENCE_SCOPE_TO_TILE, + /* + * Ignore read back as it always returns 0 in these + * steps + */ + .read_mask = 0)) + }, + { XE_RTP_NAME("14015227452"), + XE_RTP_RULES(PLATFORM(DG2), + FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(ROW_CHICKEN4, XEHP_DIS_BBL_SYSPIPE)) + }, + { XE_RTP_NAME("16015675438"), + XE_RTP_RULES(PLATFORM(DG2), + FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(FF_SLICE_CS_CHICKEN2(RENDER_RING_BASE), + PERF_FIX_BALANCING_CFE_DISABLE)) + }, + { XE_RTP_NAME("18028616096"), + XE_RTP_RULES(PLATFORM(DG2), + FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(LSC_CHICKEN_BIT_0_UDW, UGM_FRAGMENT_THRESHOLD_TO_3)) + }, + { XE_RTP_NAME("16011620976, 22015475538"), + XE_RTP_RULES(PLATFORM(DG2), + FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(LSC_CHICKEN_BIT_0_UDW, DIS_CHAIN_2XSIMD8)) + }, + { XE_RTP_NAME("22012654132"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, C0), + FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(CACHE_MODE_SS, ENABLE_PREFETCH_INTO_IC, + /* + * Register can't be read back for verification on + * DG2 due to Wa_14012342262 + */ + .read_mask = 0)) + }, + { XE_RTP_NAME("22012654132"), + XE_RTP_RULES(SUBPLATFORM(DG2, G11), + FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(CACHE_MODE_SS, ENABLE_PREFETCH_INTO_IC, + /* + * Register can't be read back for verification on + * DG2 due to Wa_14012342262 + */ + .read_mask = 0)) + }, + { XE_RTP_NAME("1509727124"), + XE_RTP_RULES(PLATFORM(DG2), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(SAMPLER_MODE, SC_DISABLE_POWER_OPTIMIZATION_EBB)) + }, + { XE_RTP_NAME("22012856258"), + XE_RTP_RULES(PLATFORM(DG2), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(ROW_CHICKEN2, DISABLE_READ_SUPPRESSION)) + }, + { XE_RTP_NAME("14013392000"), + XE_RTP_RULES(SUBPLATFORM(DG2, G11), GRAPHICS_STEP(A0, B0), + ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(ROW_CHICKEN2, ENABLE_LARGE_GRF_MODE)) + }, + { XE_RTP_NAME("14012419201"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, B0), + ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(ROW_CHICKEN4, + DISABLE_HDR_PAST_PAYLOAD_HOLD_FIX)) + }, + { XE_RTP_NAME("14012419201"), + XE_RTP_RULES(SUBPLATFORM(DG2, G11), GRAPHICS_STEP(A0, B0), + ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(ROW_CHICKEN4, + DISABLE_HDR_PAST_PAYLOAD_HOLD_FIX)) + }, + { XE_RTP_NAME("1308578152"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(B0, C0), + ENGINE_CLASS(RENDER), + FUNC(xe_rtp_match_first_gslice_fused_off)), + XE_RTP_ACTIONS(CLR(CS_DEBUG_MODE1(RENDER_RING_BASE), + REPLAY_MODE_GRANULARITY)) + }, + { XE_RTP_NAME("22010960976, 14013347512"), + XE_RTP_RULES(PLATFORM(DG2), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(CLR(XEHP_HDC_CHICKEN0, + LSC_L1_FLUSH_CTL_3D_DATAPORT_FLUSH_EVENTS_MASK)) + }, + { XE_RTP_NAME("1608949956, 14010198302"), + XE_RTP_RULES(PLATFORM(DG2), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(ROW_CHICKEN, + MDQ_ARBITRATION_MODE | UGM_BACKUP_MODE)) + }, + { XE_RTP_NAME("22010430635"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, B0), + ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(ROW_CHICKEN4, + DISABLE_GRF_CLEAR)) + }, + { XE_RTP_NAME("14013202645"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(B0, C0), + ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(RT_CTRL, DIS_NULL_QUERY)) + }, + { XE_RTP_NAME("14013202645"), + XE_RTP_RULES(SUBPLATFORM(DG2, G11), 
GRAPHICS_STEP(A0, B0), + ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(RT_CTRL, DIS_NULL_QUERY)) + }, + { XE_RTP_NAME("22012532006"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, C0), + ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(HALF_SLICE_CHICKEN7, + DG2_DISABLE_ROUND_ENABLE_ALLOW_FOR_SSLA)) + }, + { XE_RTP_NAME("22012532006"), + XE_RTP_RULES(SUBPLATFORM(DG2, G11), GRAPHICS_STEP(A0, B0), + ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(HALF_SLICE_CHICKEN7, + DG2_DISABLE_ROUND_ENABLE_ALLOW_FOR_SSLA)) + }, + { XE_RTP_NAME("14015150844"), + XE_RTP_RULES(PLATFORM(DG2), FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(XEHP_HDC_CHICKEN0, DIS_ATOMIC_CHAINING_TYPED_WRITES, + XE_RTP_NOCHECK)) + }, + + /* PVC */ + + { XE_RTP_NAME("22014226127"), + XE_RTP_RULES(PLATFORM(PVC), FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(LSC_CHICKEN_BIT_0, DISABLE_D8_D16_COASLESCE)) + }, + { XE_RTP_NAME("14015227452"), + XE_RTP_RULES(PLATFORM(PVC), FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(ROW_CHICKEN4, XEHP_DIS_BBL_SYSPIPE)) + }, + { XE_RTP_NAME("16015675438"), + XE_RTP_RULES(PLATFORM(PVC), FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(FF_SLICE_CS_CHICKEN2(RENDER_RING_BASE), + PERF_FIX_BALANCING_CFE_DISABLE)) + }, + { XE_RTP_NAME("14014999345"), + XE_RTP_RULES(PLATFORM(PVC), ENGINE_CLASS(COMPUTE), + GRAPHICS_STEP(B0, C0)), + XE_RTP_ACTIONS(SET(CACHE_MODE_SS, DISABLE_ECC)) + }, + + /* Xe_LPG */ + + { XE_RTP_NAME("14017856879"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1270, 1271), + FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(ROW_CHICKEN3, DIS_FIX_EOT1_FLUSH)) + }, + { XE_RTP_NAME("14015150844"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1270, 1271), + FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(XEHP_HDC_CHICKEN0, DIS_ATOMIC_CHAINING_TYPED_WRITES, + XE_RTP_NOCHECK)) + }, + + /* Xe2_LPG */ + + { XE_RTP_NAME("18032247524"), + XE_RTP_RULES(GRAPHICS_VERSION(2004), + FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(LSC_CHICKEN_BIT_0, SEQUENTIAL_ACCESS_UPGRADE_DISABLE)) + }, + { XE_RTP_NAME("16018712365"), + XE_RTP_RULES(GRAPHICS_VERSION(2004), FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(LSC_CHICKEN_BIT_0_UDW, XE2_ALLOC_DPA_STARVE_FIX_DIS)) + }, + { XE_RTP_NAME("14018957109"), + XE_RTP_RULES(GRAPHICS_VERSION(2004), GRAPHICS_STEP(A0, B0), + FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(HALF_SLICE_CHICKEN5, DISABLE_SAMPLE_G_PERFORMANCE)) + }, + { XE_RTP_NAME("16021540221"), + XE_RTP_RULES(GRAPHICS_VERSION(2004), GRAPHICS_STEP(A0, B0), + FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(ROW_CHICKEN4, DISABLE_TDL_PUSH)) + }, + { XE_RTP_NAME("14019322943"), + XE_RTP_RULES(GRAPHICS_VERSION(2004), GRAPHICS_STEP(A0, B0), + FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(LSC_CHICKEN_BIT_0, TGM_WRITE_EOM_FORCE)) + }, + { XE_RTP_NAME("14018471104"), + XE_RTP_RULES(GRAPHICS_VERSION(2004), FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(LSC_CHICKEN_BIT_0_UDW, ENABLE_SMP_LD_RENDER_SURFACE_CONTROL)) + }, + { XE_RTP_NAME("16018737384"), + XE_RTP_RULES(GRAPHICS_VERSION(2004), FUNC(xe_rtp_match_first_render_or_compute)), + XE_RTP_ACTIONS(SET(ROW_CHICKEN, EARLY_EOT_DIS)) + }, + /* + * These two workarounds are the same, just applying to different + * engines. 
Although Wa_18032095049 (for the RCS) isn't required on + * all steppings, disabling these reports has no impact for our + * driver or the GuC, so we go ahead and treat it the same as + * Wa_16021639441 which does apply to all steppings. + */ + { XE_RTP_NAME("18032095049, 16021639441"), + XE_RTP_RULES(GRAPHICS_VERSION(2004)), + XE_RTP_ACTIONS(SET(CSFE_CHICKEN1(0), + GHWSP_CSB_REPORT_DIS | + PPHWSP_CSB_AND_TIMESTAMP_REPORT_DIS, + XE_RTP_ACTION_FLAG(ENGINE_BASE))) + }, + + {} +}; + +static const struct xe_rtp_entry_sr lrc_was[] = { + { XE_RTP_NAME("1409342910, 14010698770, 14010443199, 1408979724, 1409178076, 1409207793, 1409217633, 1409252684, 1409347922, 1409142259"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1210)), + XE_RTP_ACTIONS(SET(COMMON_SLICE_CHICKEN3, + DISABLE_CPS_AWARE_COLOR_PIPE)) + }, + { XE_RTP_NAME("WaDisableGPGPUMidThreadPreemption"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1200, 1210)), + XE_RTP_ACTIONS(FIELD_SET(CS_CHICKEN1(RENDER_RING_BASE), + PREEMPT_GPGPU_LEVEL_MASK, + PREEMPT_GPGPU_THREAD_GROUP_LEVEL)) + }, + { XE_RTP_NAME("1806527549"), + XE_RTP_RULES(GRAPHICS_VERSION(1200)), + XE_RTP_ACTIONS(SET(HIZ_CHICKEN, HZ_DEPTH_TEST_LE_GE_OPT_DISABLE)) + }, + { XE_RTP_NAME("1606376872"), + XE_RTP_RULES(GRAPHICS_VERSION(1200)), + XE_RTP_ACTIONS(SET(COMMON_SLICE_CHICKEN4, DISABLE_TDC_LOAD_BALANCING_CALC)) + }, + + /* DG1 */ + + { XE_RTP_NAME("1409044764"), + XE_RTP_RULES(PLATFORM(DG1)), + XE_RTP_ACTIONS(CLR(COMMON_SLICE_CHICKEN3, + DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN)) + }, + { XE_RTP_NAME("22010493298"), + XE_RTP_RULES(PLATFORM(DG1)), + XE_RTP_ACTIONS(SET(HIZ_CHICKEN, + DG1_HZ_READ_SUPPRESSION_OPTIMIZATION_DISABLE)) + }, + + /* DG2 */ + + { XE_RTP_NAME("16011186671"), + XE_RTP_RULES(SUBPLATFORM(DG2, G11), GRAPHICS_STEP(A0, B0)), + XE_RTP_ACTIONS(CLR(VFLSKPD, DIS_MULT_MISS_RD_SQUASH), + SET(VFLSKPD, DIS_OVER_FETCH_CACHE)) + }, + { XE_RTP_NAME("14010469329"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, B0)), + XE_RTP_ACTIONS(SET(XEHP_COMMON_SLICE_CHICKEN3, + XEHP_DUAL_SIMD8_SEQ_MERGE_DISABLE)) + }, + { XE_RTP_NAME("14010698770, 22010613112, 22010465075"), + XE_RTP_RULES(SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, B0)), + XE_RTP_ACTIONS(SET(XEHP_COMMON_SLICE_CHICKEN3, + DISABLE_CPS_AWARE_COLOR_PIPE)) + }, + { XE_RTP_NAME("16013271637"), + XE_RTP_RULES(PLATFORM(DG2)), + XE_RTP_ACTIONS(SET(XEHP_SLICE_COMMON_ECO_CHICKEN1, + MSC_MSAA_REODER_BUF_BYPASS_DISABLE)) + }, + { XE_RTP_NAME("14014947963"), + XE_RTP_RULES(PLATFORM(DG2)), + XE_RTP_ACTIONS(FIELD_SET(VF_PREEMPTION, + PREEMPTION_VERTEX_COUNT, + 0x4000)) + }, + { XE_RTP_NAME("18018764978"), + XE_RTP_RULES(PLATFORM(DG2)), + XE_RTP_ACTIONS(SET(XEHP_PSS_MODE2, + SCOREBOARD_STALL_FLUSH_CONTROL)) + }, + { XE_RTP_NAME("18019271663"), + XE_RTP_RULES(PLATFORM(DG2)), + XE_RTP_ACTIONS(SET(CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE)) + }, + { XE_RTP_NAME("14019877138"), + XE_RTP_RULES(PLATFORM(DG2)), + XE_RTP_ACTIONS(SET(XEHP_PSS_CHICKEN, FD_END_COLLECT)) + }, + + /* PVC */ + + { XE_RTP_NAME("16017236439"), + XE_RTP_RULES(PLATFORM(PVC), ENGINE_CLASS(COPY), + FUNC(xe_rtp_match_even_instance)), + XE_RTP_ACTIONS(SET(BCS_SWCTRL(0), + BCS_SWCTRL_DISABLE_256B, + XE_RTP_ACTION_FLAG(ENGINE_BASE))), + }, + + /* Xe_LPG */ + + { XE_RTP_NAME("18019271663"), + XE_RTP_RULES(GRAPHICS_VERSION_RANGE(1270, 1271)), + XE_RTP_ACTIONS(SET(CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE)) + }, + + /* Xe2_LPG */ + + { XE_RTP_NAME("16020518922"), + XE_RTP_RULES(GRAPHICS_VERSION(2004), GRAPHICS_STEP(A0, B0), + ENGINE_CLASS(RENDER)), + 
XE_RTP_ACTIONS(SET(FF_MODE, + DIS_TE_AUTOSTRIP | + DIS_MESH_PARTIAL_AUTOSTRIP | + DIS_MESH_AUTOSTRIP), + SET(VFLSKPD, + DIS_PARTIAL_AUTOSTRIP | + DIS_AUTOSTRIP)) + }, + { XE_RTP_NAME("14019386621"), + XE_RTP_RULES(GRAPHICS_VERSION(2004), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(VF_SCRATCHPAD, XE2_VFG_TED_CREDIT_INTERFACE_DISABLE)) + }, + { XE_RTP_NAME("14019877138"), + XE_RTP_RULES(GRAPHICS_VERSION(2004), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(XEHP_PSS_CHICKEN, FD_END_COLLECT)) + }, + { XE_RTP_NAME("14020013138"), + XE_RTP_RULES(GRAPHICS_VERSION(2004), GRAPHICS_STEP(A0, B0), + ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(WM_CHICKEN3, HIZ_PLANE_COMPRESSION_DIS)) + }, + { XE_RTP_NAME("14019988906"), + XE_RTP_RULES(GRAPHICS_VERSION(2004), ENGINE_CLASS(RENDER)), + XE_RTP_ACTIONS(SET(XEHP_PSS_CHICKEN, FLSH_IGNORES_PSD)) + }, + + {} +}; + +static __maybe_unused const struct xe_rtp_entry oob_was[] = { +#include <generated/xe_wa_oob.c> + {} +}; + +static_assert(ARRAY_SIZE(oob_was) - 1 == _XE_WA_OOB_COUNT); + +__diag_pop(); + +/** + * xe_wa_process_oob - process OOB workaround table + * @gt: GT instance to process workarounds for + * + * Process OOB workaround table for this platform, marking in @gt the + * workarounds that are active. + */ +void xe_wa_process_oob(struct xe_gt *gt) +{ + struct xe_rtp_process_ctx ctx = XE_RTP_PROCESS_CTX_INITIALIZER(gt); + + xe_rtp_process_ctx_enable_active_tracking(&ctx, gt->wa_active.oob, + ARRAY_SIZE(oob_was)); + xe_rtp_process(&ctx, oob_was); +} + +/** + * xe_wa_process_gt - process GT workaround table + * @gt: GT instance to process workarounds for + * + * Process GT workaround table for this platform, saving in @gt all the + * workarounds that need to be applied at the GT level. + */ +void xe_wa_process_gt(struct xe_gt *gt) +{ + struct xe_rtp_process_ctx ctx = XE_RTP_PROCESS_CTX_INITIALIZER(gt); + + xe_rtp_process_ctx_enable_active_tracking(&ctx, gt->wa_active.gt, + ARRAY_SIZE(gt_was)); + xe_rtp_process_to_sr(&ctx, gt_was, >->reg_sr); +} +EXPORT_SYMBOL_IF_KUNIT(xe_wa_process_gt); + +/** + * xe_wa_process_engine - process engine workaround table + * @hwe: engine instance to process workarounds for + * + * Process engine workaround table for this platform, saving in @hwe all the + * workarounds that need to be applied at the engine level that match this + * engine. + */ +void xe_wa_process_engine(struct xe_hw_engine *hwe) +{ + struct xe_rtp_process_ctx ctx = XE_RTP_PROCESS_CTX_INITIALIZER(hwe); + + xe_rtp_process_ctx_enable_active_tracking(&ctx, hwe->gt->wa_active.engine, + ARRAY_SIZE(engine_was)); + xe_rtp_process_to_sr(&ctx, engine_was, &hwe->reg_sr); +} + +/** + * xe_wa_process_lrc - process context workaround table + * @hwe: engine instance to process workarounds for + * + * Process context workaround table for this platform, saving in @hwe all the + * workarounds that need to be applied on context restore. These are workarounds + * touching registers that are part of the HW context image. + */ +void xe_wa_process_lrc(struct xe_hw_engine *hwe) +{ + struct xe_rtp_process_ctx ctx = XE_RTP_PROCESS_CTX_INITIALIZER(hwe); + + xe_rtp_process_ctx_enable_active_tracking(&ctx, hwe->gt->wa_active.lrc, + ARRAY_SIZE(lrc_was)); + xe_rtp_process_to_sr(&ctx, lrc_was, &hwe->reg_lrc); +} + +/** + * xe_wa_init - initialize gt with workaround bookkeeping + * @gt: GT instance to initialize + * + * Returns 0 for success, negative error code otherwise. 
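+ *
+ * As an illustration only (this ordering is inferred from the fact that the
+ * xe_wa_process_*() helpers consume the bitmaps allocated here; it is not a
+ * sequence spelled out in this file), GT initialization would typically do::
+ *
+ *	err = xe_wa_init(gt);
+ *	if (err)
+ *		return err;
+ *
+ *	xe_wa_process_oob(gt);
+ *	xe_wa_process_gt(gt);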
+ */ +int xe_wa_init(struct xe_gt *gt) +{ + struct xe_device *xe = gt_to_xe(gt); + size_t n_oob, n_lrc, n_engine, n_gt, total; + unsigned long *p; + + n_gt = BITS_TO_LONGS(ARRAY_SIZE(gt_was)); + n_engine = BITS_TO_LONGS(ARRAY_SIZE(engine_was)); + n_lrc = BITS_TO_LONGS(ARRAY_SIZE(lrc_was)); + n_oob = BITS_TO_LONGS(ARRAY_SIZE(oob_was)); + total = n_gt + n_engine + n_lrc + n_oob; + + p = drmm_kzalloc(&xe->drm, sizeof(*p) * total, GFP_KERNEL); + if (!p) + return -ENOMEM; + + gt->wa_active.gt = p; + p += n_gt; + gt->wa_active.engine = p; + p += n_engine; + gt->wa_active.lrc = p; + p += n_lrc; + gt->wa_active.oob = p; + + return 0; +} + +void xe_wa_dump(struct xe_gt *gt, struct drm_printer *p) +{ + size_t idx; + + drm_printf(p, "GT Workarounds\n"); + for_each_set_bit(idx, gt->wa_active.gt, ARRAY_SIZE(gt_was)) + drm_printf_indent(p, 1, "%s\n", gt_was[idx].name); + + drm_printf(p, "\nEngine Workarounds\n"); + for_each_set_bit(idx, gt->wa_active.engine, ARRAY_SIZE(engine_was)) + drm_printf_indent(p, 1, "%s\n", engine_was[idx].name); + + drm_printf(p, "\nLRC Workarounds\n"); + for_each_set_bit(idx, gt->wa_active.lrc, ARRAY_SIZE(lrc_was)) + drm_printf_indent(p, 1, "%s\n", lrc_was[idx].name); + + drm_printf(p, "\nOOB Workarounds\n"); + for_each_set_bit(idx, gt->wa_active.oob, ARRAY_SIZE(oob_was)) + if (oob_was[idx].name) + drm_printf_indent(p, 1, "%s\n", oob_was[idx].name); +} + +/* + * Apply tile (non-GT, non-display) workarounds. Think very carefully before + * adding anything to this function; most workarounds should be implemented + * elsewhere. The programming here is primarily for sgunit/soc workarounds, + * which are relatively rare. Since the registers these workarounds target are + * outside the GT, they should only need to be applied once at device + * probe/resume; they will not lose their values on any kind of GT or engine + * reset. + * + * TODO: We may want to move this over to xe_rtp in the future once we have + * enough workarounds to justify the work. 
+ */ +void xe_wa_apply_tile_workarounds(struct xe_tile *tile) +{ + struct xe_gt *mmio = tile->primary_gt; + + if (XE_WA(mmio, 22010954014)) + xe_mmio_rmw32(mmio, XEHP_CLOCK_GATE_DIS, 0, SGSI_SIDECLK_DIS); +} diff --git a/drivers/gpu/drm/xe/xe_wa.h b/drivers/gpu/drm/xe/xe_wa.h new file mode 100644 index 000000000000..1b24d66f9d80 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_wa.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_WA_ +#define _XE_WA_ + +struct drm_printer; +struct xe_gt; +struct xe_hw_engine; +struct xe_tile; + +int xe_wa_init(struct xe_gt *gt); +void xe_wa_process_oob(struct xe_gt *gt); +void xe_wa_process_gt(struct xe_gt *gt); +void xe_wa_process_engine(struct xe_hw_engine *hwe); +void xe_wa_process_lrc(struct xe_hw_engine *hwe); +void xe_wa_apply_tile_workarounds(struct xe_tile *tile); + +void xe_reg_whitelist_process_engine(struct xe_hw_engine *hwe); +void xe_wa_dump(struct xe_gt *gt, struct drm_printer *p); + +/** + * XE_WA - Out-of-band workarounds, that don't fit the lifecycle any + * other more specific type + * @gt__: gt instance + * @id__: XE_OOB_<id__>, as generated by build system in generated/xe_wa_oob.h + */ +#define XE_WA(gt__, id__) test_bit(XE_WA_OOB_ ## id__, (gt__)->wa_active.oob) + +#endif diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules b/drivers/gpu/drm/xe/xe_wa_oob.rules new file mode 100644 index 000000000000..727bdc429212 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_wa_oob.rules @@ -0,0 +1,24 @@ +22012773006 GRAPHICS_VERSION_RANGE(1200, 1250) +16011759253 SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, B0) +14014475959 GRAPHICS_VERSION_RANGE(1270, 1271), GRAPHICS_STEP(A0, B0) + PLATFORM(DG2) +22011391025 PLATFORM(DG2) +14012197797 PLATFORM(DG2), GRAPHICS_STEP(A0, B0) +16011777198 SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, C0) + SUBPLATFORM(DG2, G11), GRAPHICS_STEP(A0, B0) +22012727170 SUBPLATFORM(DG2, G10), GRAPHICS_STEP(A0, C0) + SUBPLATFORM(DG2, G11) +22012727685 SUBPLATFORM(DG2, G11) +16015675438 PLATFORM(PVC) + SUBPLATFORM(DG2, G10) + SUBPLATFORM(DG2, G12) +18020744125 PLATFORM(PVC) +1509372804 PLATFORM(PVC), GRAPHICS_STEP(A0, C0) +1409600907 GRAPHICS_VERSION_RANGE(1200, 1250) +14016763929 SUBPLATFORM(DG2, G10) + SUBPLATFORM(DG2, G12) +16017236439 PLATFORM(PVC) +22010954014 PLATFORM(DG2) +14019821291 MEDIA_VERSION_RANGE(1300, 2000) +14015076503 MEDIA_VERSION(1300) +16020292621 GRAPHICS_VERSION(2004), GRAPHICS_STEP(A0, B0) diff --git a/drivers/gpu/drm/xe/xe_wait_user_fence.c b/drivers/gpu/drm/xe/xe_wait_user_fence.c new file mode 100644 index 000000000000..a75eeba7bfe5 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_wait_user_fence.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_wait_user_fence.h" + +#include <drm/drm_device.h> +#include <drm/drm_file.h> +#include <drm/drm_utils.h> +#include <drm/xe_drm.h> + +#include "xe_device.h" +#include "xe_gt.h" +#include "xe_macros.h" +#include "xe_exec_queue.h" + +static int do_compare(u64 addr, u64 value, u64 mask, u16 op) +{ + u64 rvalue; + int err; + bool passed; + + err = copy_from_user(&rvalue, u64_to_user_ptr(addr), sizeof(rvalue)); + if (err) + return -EFAULT; + + switch (op) { + case DRM_XE_UFENCE_WAIT_OP_EQ: + passed = (rvalue & mask) == (value & mask); + break; + case DRM_XE_UFENCE_WAIT_OP_NEQ: + passed = (rvalue & mask) != (value & mask); + break; + case DRM_XE_UFENCE_WAIT_OP_GT: + passed = (rvalue & mask) > (value & mask); + break; + case DRM_XE_UFENCE_WAIT_OP_GTE: + passed = (rvalue 
& mask) >= (value & mask); + break; + case DRM_XE_UFENCE_WAIT_OP_LT: + passed = (rvalue & mask) < (value & mask); + break; + case DRM_XE_UFENCE_WAIT_OP_LTE: + passed = (rvalue & mask) <= (value & mask); + break; + default: + XE_WARN_ON("Not possible"); + return -EINVAL; + } + + return passed ? 0 : 1; +} + +#define VALID_FLAGS DRM_XE_UFENCE_WAIT_FLAG_ABSTIME +#define MAX_OP DRM_XE_UFENCE_WAIT_OP_LTE + +static long to_jiffies_timeout(struct xe_device *xe, + struct drm_xe_wait_user_fence *args) +{ + unsigned long long t; + long timeout; + + /* + * For negative timeout we want to wait "forever" by setting + * MAX_SCHEDULE_TIMEOUT. But we have to assign this value also + * to args->timeout to avoid being zeroed on the signal delivery + * (see arithmetics after wait). + */ + if (args->timeout < 0) { + args->timeout = MAX_SCHEDULE_TIMEOUT; + return MAX_SCHEDULE_TIMEOUT; + } + + if (args->timeout == 0) + return 0; + + /* + * Save the timeout to an u64 variable because nsecs_to_jiffies + * might return a value that overflows s32 variable. + */ + if (args->flags & DRM_XE_UFENCE_WAIT_FLAG_ABSTIME) + t = drm_timeout_abs_to_jiffies(args->timeout); + else + t = nsecs_to_jiffies(args->timeout); + + /* + * Anything greater then MAX_SCHEDULE_TIMEOUT is meaningless, + * also we don't want to cap it at MAX_SCHEDULE_TIMEOUT because + * apparently user doesn't mean to wait forever, otherwise the + * args->timeout should have been set to a negative value. + */ + if (t > MAX_SCHEDULE_TIMEOUT) + timeout = MAX_SCHEDULE_TIMEOUT - 1; + else + timeout = t; + + return timeout ?: 1; +} + +int xe_wait_user_fence_ioctl(struct drm_device *dev, void *data, + struct drm_file *file) +{ + struct xe_device *xe = to_xe_device(dev); + struct xe_file *xef = to_xe_file(file); + DEFINE_WAIT_FUNC(w_wait, woken_wake_function); + struct drm_xe_wait_user_fence *args = data; + struct xe_exec_queue *q = NULL; + u64 addr = args->addr; + int err = 0; + long timeout; + ktime_t start; + + if (XE_IOCTL_DBG(xe, args->extensions) || XE_IOCTL_DBG(xe, args->pad) || + XE_IOCTL_DBG(xe, args->pad2) || + XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1])) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, args->flags & ~VALID_FLAGS)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, args->op > MAX_OP)) + return -EINVAL; + + if (XE_IOCTL_DBG(xe, addr & 0x7)) + return -EINVAL; + + if (args->exec_queue_id) { + q = xe_exec_queue_lookup(xef, args->exec_queue_id); + if (XE_IOCTL_DBG(xe, !q)) + return -ENOENT; + } + + timeout = to_jiffies_timeout(xe, args); + + start = ktime_get(); + + add_wait_queue(&xe->ufence_wq, &w_wait); + for (;;) { + err = do_compare(addr, args->value, args->mask, args->op); + if (err <= 0) + break; + + if (signal_pending(current)) { + err = -ERESTARTSYS; + break; + } + + if (q) { + if (q->ops->reset_status(q)) { + drm_info(&xe->drm, "exec gueue reset detected\n"); + err = -EIO; + break; + } + } + + if (!timeout) { + err = -ETIME; + break; + } + + timeout = wait_woken(&w_wait, TASK_INTERRUPTIBLE, timeout); + } + remove_wait_queue(&xe->ufence_wq, &w_wait); + + if (!(args->flags & DRM_XE_UFENCE_WAIT_FLAG_ABSTIME)) { + args->timeout -= ktime_to_ns(ktime_sub(ktime_get(), start)); + if (args->timeout < 0) + args->timeout = 0; + } + + if (!timeout && !(err < 0)) + err = -ETIME; + + if (q) + xe_exec_queue_put(q); + + return err; +} diff --git a/drivers/gpu/drm/xe/xe_wait_user_fence.h b/drivers/gpu/drm/xe/xe_wait_user_fence.h new file mode 100644 index 000000000000..0e268978f9e6 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_wait_user_fence.h @@ -0,0 
+1,15 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef _XE_WAIT_USER_FENCE_H_ +#define _XE_WAIT_USER_FENCE_H_ + +struct drm_device; +struct drm_file; + +int xe_wait_user_fence_ioctl(struct drm_device *dev, void *data, + struct drm_file *file); + +#endif diff --git a/drivers/gpu/drm/xe/xe_wopcm.c b/drivers/gpu/drm/xe/xe_wopcm.c new file mode 100644 index 000000000000..d3a99157e523 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_wopcm.c @@ -0,0 +1,270 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +#include "xe_wopcm.h" + +#include "regs/xe_guc_regs.h" +#include "xe_device.h" +#include "xe_force_wake.h" +#include "xe_gt.h" +#include "xe_mmio.h" +#include "xe_uc_fw.h" + +/** + * DOC: Write Once Protected Content Memory (WOPCM) Layout + * + * The layout of the WOPCM will be fixed after writing to GuC WOPCM size and + * offset registers whose values are calculated and determined by HuC/GuC + * firmware size and set of hardware requirements/restrictions as shown below: + * + * :: + * + * +=========> +====================+ <== WOPCM Top + * ^ | HW contexts RSVD | + * | +===> +====================+ <== GuC WOPCM Top + * | ^ | | + * | | | | + * | | | | + * | GuC | | + * | WOPCM | | + * | Size +--------------------+ + * WOPCM | | GuC FW RSVD | + * | | +--------------------+ + * | | | GuC Stack RSVD | + * | | +------------------- + + * | v | GuC WOPCM RSVD | + * | +===> +====================+ <== GuC WOPCM base + * | | WOPCM RSVD | + * | +------------------- + <== HuC Firmware Top + * v | HuC FW | + * +=========> +====================+ <== WOPCM Base + * + * GuC accessible WOPCM starts at GuC WOPCM base and ends at GuC WOPCM top. + * The top part of the WOPCM is reserved for hardware contexts (e.g. RC6 + * context). + */ + +/* Default WOPCM size is 2MB from Gen11, 1MB on previous platforms */ +/* FIXME: Larger size require for 2 tile PVC, do a proper probe sooner or later */ +#define DGFX_WOPCM_SIZE SZ_4M +/* FIXME: Larger size require for MTL, do a proper probe sooner or later */ +#define MTL_WOPCM_SIZE SZ_4M +#define WOPCM_SIZE SZ_2M + +#define MAX_WOPCM_SIZE SZ_8M + +/* 16KB WOPCM (RSVD WOPCM) is reserved from HuC firmware top. */ +#define WOPCM_RESERVED_SIZE SZ_16K + +/* 16KB reserved at the beginning of GuC WOPCM. */ +#define GUC_WOPCM_RESERVED SZ_16K +/* 8KB from GUC_WOPCM_RESERVED is reserved for GuC stack. */ +#define GUC_WOPCM_STACK_RESERVED SZ_8K + +/* GuC WOPCM Offset value needs to be aligned to 16KB. 
*/ +#define GUC_WOPCM_OFFSET_ALIGNMENT (1UL << GUC_WOPCM_OFFSET_SHIFT) + +/* 36KB WOPCM reserved at the end of WOPCM */ +#define WOPCM_HW_CTX_RESERVED (SZ_32K + SZ_4K) + +static inline struct xe_gt *wopcm_to_gt(struct xe_wopcm *wopcm) +{ + return container_of(wopcm, struct xe_gt, uc.wopcm); +} + +static inline struct xe_device *wopcm_to_xe(struct xe_wopcm *wopcm) +{ + return gt_to_xe(wopcm_to_gt(wopcm)); +} + +static u32 context_reserved_size(void) +{ + return WOPCM_HW_CTX_RESERVED; +} + +static bool __check_layout(struct xe_device *xe, u32 wopcm_size, + u32 guc_wopcm_base, u32 guc_wopcm_size, + u32 guc_fw_size, u32 huc_fw_size) +{ + const u32 ctx_rsvd = context_reserved_size(); + u32 size; + + size = wopcm_size - ctx_rsvd; + if (unlikely(guc_wopcm_base >= size || + guc_wopcm_size > size - guc_wopcm_base)) { + drm_err(&xe->drm, + "WOPCM: invalid GuC region layout: %uK + %uK > %uK\n", + guc_wopcm_base / SZ_1K, guc_wopcm_size / SZ_1K, + size / SZ_1K); + return false; + } + + size = guc_fw_size + GUC_WOPCM_RESERVED + GUC_WOPCM_STACK_RESERVED; + if (unlikely(guc_wopcm_size < size)) { + drm_err(&xe->drm, "WOPCM: no space for %s: %uK < %uK\n", + xe_uc_fw_type_repr(XE_UC_FW_TYPE_GUC), + guc_wopcm_size / SZ_1K, size / SZ_1K); + return false; + } + + size = huc_fw_size + WOPCM_RESERVED_SIZE; + if (unlikely(guc_wopcm_base < size)) { + drm_err(&xe->drm, "WOPCM: no space for %s: %uK < %uK\n", + xe_uc_fw_type_repr(XE_UC_FW_TYPE_HUC), + guc_wopcm_base / SZ_1K, size / SZ_1K); + return false; + } + + return true; +} + +static bool __wopcm_regs_locked(struct xe_gt *gt, + u32 *guc_wopcm_base, u32 *guc_wopcm_size) +{ + u32 reg_base = xe_mmio_read32(gt, DMA_GUC_WOPCM_OFFSET); + u32 reg_size = xe_mmio_read32(gt, GUC_WOPCM_SIZE); + + if (!(reg_size & GUC_WOPCM_SIZE_LOCKED) || + !(reg_base & GUC_WOPCM_OFFSET_VALID)) + return false; + + *guc_wopcm_base = reg_base & GUC_WOPCM_OFFSET_MASK; + *guc_wopcm_size = reg_size & GUC_WOPCM_SIZE_MASK; + return true; +} + +static int __wopcm_init_regs(struct xe_device *xe, struct xe_gt *gt, + struct xe_wopcm *wopcm) +{ + u32 base = wopcm->guc.base; + u32 size = wopcm->guc.size; + u32 huc_agent = xe_uc_fw_is_available(>->uc.huc.fw) ? HUC_LOADING_AGENT_GUC : 0; + u32 mask; + int err; + + XE_WARN_ON(!(base & GUC_WOPCM_OFFSET_MASK)); + XE_WARN_ON(base & ~GUC_WOPCM_OFFSET_MASK); + XE_WARN_ON(!(size & GUC_WOPCM_SIZE_MASK)); + XE_WARN_ON(size & ~GUC_WOPCM_SIZE_MASK); + + mask = GUC_WOPCM_SIZE_MASK | GUC_WOPCM_SIZE_LOCKED; + err = xe_mmio_write32_and_verify(gt, GUC_WOPCM_SIZE, size, mask, + size | GUC_WOPCM_SIZE_LOCKED); + if (err) + goto err_out; + + mask = GUC_WOPCM_OFFSET_MASK | GUC_WOPCM_OFFSET_VALID | huc_agent; + err = xe_mmio_write32_and_verify(gt, DMA_GUC_WOPCM_OFFSET, + base | huc_agent, mask, + base | huc_agent | + GUC_WOPCM_OFFSET_VALID); + if (err) + goto err_out; + + return 0; + +err_out: + drm_notice(&xe->drm, "Failed to init uC WOPCM registers!\n"); + drm_notice(&xe->drm, "%s(%#x)=%#x\n", "DMA_GUC_WOPCM_OFFSET", + DMA_GUC_WOPCM_OFFSET.addr, + xe_mmio_read32(gt, DMA_GUC_WOPCM_OFFSET)); + drm_notice(&xe->drm, "%s(%#x)=%#x\n", "GUC_WOPCM_SIZE", + GUC_WOPCM_SIZE.addr, + xe_mmio_read32(gt, GUC_WOPCM_SIZE)); + + return err; +} + +u32 xe_wopcm_size(struct xe_device *xe) +{ + return IS_DGFX(xe) ? DGFX_WOPCM_SIZE : + xe->info.platform == XE_METEORLAKE ? MTL_WOPCM_SIZE : + WOPCM_SIZE; +} + +/** + * xe_wopcm_init() - Initialize the WOPCM structure. + * @wopcm: pointer to xe_wopcm. 
+ *
+ * This function partitions the WOPCM space based on the GuC and HuC firmware
+ * sizes and allocates the maximum remaining space for GuC use. It also
+ * enforces platform-dependent hardware restrictions on the GuC WOPCM offset
+ * and size. If any of these checks fail, WOPCM initialization fails so that
+ * the subsequent WOPCM register setup and GuC firmware upload are aborted.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int xe_wopcm_init(struct xe_wopcm *wopcm)
+{
+        struct xe_device *xe = wopcm_to_xe(wopcm);
+        struct xe_gt *gt = wopcm_to_gt(wopcm);
+        u32 guc_fw_size = xe_uc_fw_get_upload_size(&gt->uc.guc.fw);
+        u32 huc_fw_size = xe_uc_fw_get_upload_size(&gt->uc.huc.fw);
+        u32 ctx_rsvd = context_reserved_size();
+        u32 guc_wopcm_base;
+        u32 guc_wopcm_size;
+        bool locked;
+        int ret = 0;
+
+        if (!guc_fw_size)
+                return -EINVAL;
+
+        wopcm->size = xe_wopcm_size(xe);
+        drm_dbg(&xe->drm, "WOPCM: %uK\n", wopcm->size / SZ_1K);
+
+        xe_force_wake_assert_held(gt_to_fw(gt), XE_FW_GT);
+        XE_WARN_ON(guc_fw_size >= wopcm->size);
+        XE_WARN_ON(huc_fw_size >= wopcm->size);
+        XE_WARN_ON(ctx_rsvd + WOPCM_RESERVED_SIZE >= wopcm->size);
+
+        locked = __wopcm_regs_locked(gt, &guc_wopcm_base, &guc_wopcm_size);
+        if (locked) {
+                drm_dbg(&xe->drm, "GuC WOPCM is already locked [%uK, %uK)\n",
+                        guc_wopcm_base / SZ_1K, guc_wopcm_size / SZ_1K);
+                /*
+                 * When the GuC WOPCM base and size are preprogrammed by
+                 * BIOS/IFWI, check against the max allowed WOPCM size to
+                 * validate if the programmed values align to the WOPCM layout.
+                 */
+                wopcm->size = MAX_WOPCM_SIZE;
+
+                goto check;
+        }
+
+        /*
+         * The aligned value of guc_wopcm_base determines the WOPCM space
+         * available for the HuC firmware and the mandatory reserved area.
+         */
+        guc_wopcm_base = huc_fw_size + WOPCM_RESERVED_SIZE;
+        guc_wopcm_base = ALIGN(guc_wopcm_base, GUC_WOPCM_OFFSET_ALIGNMENT);
+
+        /*
+         * Clamp guc_wopcm_base now to make sure the following math is
+         * correct. The formal check of the whole WOPCM layout is done below.
+         */
+        guc_wopcm_base = min(guc_wopcm_base, wopcm->size - ctx_rsvd);
+
+        /* The aligned remainder of the usable WOPCM space is assigned to GuC. */
+        guc_wopcm_size = wopcm->size - ctx_rsvd - guc_wopcm_base;
+        guc_wopcm_size &= GUC_WOPCM_SIZE_MASK;
+
+        drm_dbg(&xe->drm, "Calculated GuC WOPCM [%uK, %uK)\n",
+                guc_wopcm_base / SZ_1K, guc_wopcm_size / SZ_1K);
+
+check:
+        if (__check_layout(xe, wopcm->size, guc_wopcm_base, guc_wopcm_size,
+                           guc_fw_size, huc_fw_size)) {
+                wopcm->guc.base = guc_wopcm_base;
+                wopcm->guc.size = guc_wopcm_size;
+                XE_WARN_ON(!wopcm->guc.base);
+                XE_WARN_ON(!wopcm->guc.size);
+        } else {
+                drm_notice(&xe->drm, "Unsuccessful WOPCM partitioning\n");
+                return -E2BIG;
+        }
+
+        if (!locked)
+                ret = __wopcm_init_regs(xe, gt, wopcm);
+
+        return ret;
+}
diff --git a/drivers/gpu/drm/xe/xe_wopcm.h b/drivers/gpu/drm/xe/xe_wopcm.h
new file mode 100644
index 000000000000..0197a282460b
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_wopcm.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2022 Intel Corporation
+ */
+
+#ifndef _XE_WOPCM_H_
+#define _XE_WOPCM_H_
+
+#include "xe_wopcm_types.h"
+
+struct xe_device;
+
+int xe_wopcm_init(struct xe_wopcm *wopcm);
+u32 xe_wopcm_size(struct xe_device *xe);
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_wopcm_types.h b/drivers/gpu/drm/xe/xe_wopcm_types.h
new file mode 100644
index 000000000000..486d850c4084
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_wopcm_types.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2022 Intel Corporation
+ */
+
+#ifndef _XE_WOPCM_TYPES_H_
+#define _XE_WOPCM_TYPES_H_
+
+#include <linux/types.h>
+
+/**
+ * struct xe_wopcm - Overall WOPCM info and WOPCM regions.
+ */
+struct xe_wopcm {
+        /** @size: Size of overall WOPCM */
+        u32 size;
+        /** @guc: GuC WOPCM Region info */
+        struct {
+                /** @base: GuC WOPCM base which is offset from WOPCM base */
+                u32 base;
+                /** @size: Size of the GuC WOPCM region */
+                u32 size;
+        } guc;
+};
+
+#endif
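
The partitioning done by xe_wopcm_init() above is plain integer arithmetic over the constants defined in xe_wopcm.c, so it can be reproduced outside the driver. The following stand-alone C program is a minimal sketch of that math and of the three __check_layout() rules, using hypothetical 300 KiB GuC and 500 KiB HuC images on an integrated, non-MTL part (2 MiB WOPCM). The 16 KiB offset alignment and the 4 KiB size granularity are assumptions standing in for GUC_WOPCM_OFFSET_ALIGNMENT and GUC_WOPCM_SIZE_MASK, whose real values come from regs/xe_guc_regs.h and are not part of this diff.

/* wopcm_calc.c - stand-alone illustration of the WOPCM partitioning math.
 * Build: gcc -Wall -o wopcm_calc wopcm_calc.c
 * NOT driver code; constants from xe_guc_regs.h are approximated below.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_1K                   1024u
#define SZ_4K                   (4u * SZ_1K)
#define SZ_8K                   (8u * SZ_1K)
#define SZ_16K                  (16u * SZ_1K)
#define SZ_32K                  (32u * SZ_1K)
#define SZ_2M                   (2048u * SZ_1K)

#define WOPCM_SIZE              SZ_2M                   /* integrated, non-MTL */
#define WOPCM_RESERVED_SIZE     SZ_16K                  /* above HuC FW */
#define GUC_WOPCM_RESERVED      SZ_16K                  /* start of GuC WOPCM */
#define GUC_WOPCM_STACK_RESERVED SZ_8K                  /* GuC stack */
#define WOPCM_HW_CTX_RESERVED   (SZ_32K + SZ_4K)        /* HW contexts at the top */

#define GUC_WOPCM_OFFSET_ALIGNMENT SZ_16K       /* assumption: "aligned to 16KB" */
#define GUC_WOPCM_SIZE_GRANULARITY SZ_4K        /* assumption: GUC_WOPCM_SIZE_MASK */

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))
#define MIN(a, b)       ((a) < (b) ? (a) : (b))

/* Same three rules as __check_layout() in xe_wopcm.c. */
static bool check_layout(uint32_t wopcm_size, uint32_t base, uint32_t size,
                         uint32_t guc_fw, uint32_t huc_fw)
{
        uint32_t usable = wopcm_size - WOPCM_HW_CTX_RESERVED;

        if (base >= usable || size > usable - base)
                return false;   /* overlaps the HW context area */
        if (size < guc_fw + GUC_WOPCM_RESERVED + GUC_WOPCM_STACK_RESERVED)
                return false;   /* no room for the GuC firmware */
        if (base < huc_fw + WOPCM_RESERVED_SIZE)
                return false;   /* no room for the HuC firmware */
        return true;
}

int main(void)
{
        /* Hypothetical firmware upload sizes, purely for illustration. */
        uint32_t guc_fw = 300 * SZ_1K;
        uint32_t huc_fw = 500 * SZ_1K;

        /* Mirror of the unlocked path in xe_wopcm_init(). */
        uint32_t base = ALIGN_UP(huc_fw + WOPCM_RESERVED_SIZE,
                                 GUC_WOPCM_OFFSET_ALIGNMENT);
        base = MIN(base, WOPCM_SIZE - WOPCM_HW_CTX_RESERVED);

        uint32_t size = WOPCM_SIZE - WOPCM_HW_CTX_RESERVED - base;
        size &= ~(GUC_WOPCM_SIZE_GRANULARITY - 1);      /* drop sub-4K remainder */

        printf("GuC WOPCM base: %uK, size: %uK -> layout %s\n",
               (unsigned)(base / SZ_1K), (unsigned)(size / SZ_1K),
               check_layout(WOPCM_SIZE, base, size, guc_fw, huc_fw) ?
               "OK" : "invalid");
        return 0;
}

With these inputs the sketch reports a 528K base and a 1484K GuC region, which passes all three rules; growing the firmware images or shrinking WOPCM_SIZE eventually violates one of them, which corresponds to the -E2BIG path in xe_wopcm_init().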
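
When the BIOS/IFWI has already programmed and locked the registers, xe_wopcm_init() takes the preprogrammed base and size instead of computing them, widens the assumed WOPCM size to MAX_WOPCM_SIZE, and runs the same layout check. The short sketch below models only the decode step of __wopcm_regs_locked(); the valid/locked flag bits and the field mask are illustrative placeholders, not the real GUC_WOPCM_OFFSET_* / GUC_WOPCM_SIZE_* encodings from regs/xe_guc_regs.h.

/* wopcm_locked.c - decoding a hypothetical preprogrammed GuC WOPCM region.
 * Stand-alone sketch; the field layout is made up and only mirrors the way
 * __wopcm_regs_locked() masks the real registers.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SZ_1K           1024u
#define SZ_8M           (8u * 1024u * SZ_1K)    /* stands in for MAX_WOPCM_SIZE */

/* Illustrative flag/field split, NOT the real xe_guc_regs.h encoding. */
#define OFFSET_VALID    0x1u
#define SIZE_LOCKED     0x1u
#define FIELD_MASK      0xfffff000u             /* base/size in bits [31:12] */

static bool regs_locked(uint32_t reg_base, uint32_t reg_size,
                        uint32_t *base, uint32_t *size)
{
        if (!(reg_size & SIZE_LOCKED) || !(reg_base & OFFSET_VALID))
                return false;

        *base = reg_base & FIELD_MASK;
        *size = reg_size & FIELD_MASK;
        return true;
}

int main(void)
{
        /* Pretend the firmware preprogrammed a 528K base and a 1484K size. */
        uint32_t reg_base = (528u * SZ_1K) | OFFSET_VALID;
        uint32_t reg_size = (1484u * SZ_1K) | SIZE_LOCKED;
        uint32_t base, size;

        if (regs_locked(reg_base, reg_size, &base, &size))
                printf("locked: base %uK size %uK (checked against %uK max)\n",
                       (unsigned)(base / SZ_1K), (unsigned)(size / SZ_1K),
                       (unsigned)(SZ_8M / SZ_1K));
        return 0;
}

If either flag is clear, the driver falls back to the calculated partitioning shown in the previous sketch, exactly as the !locked branch in xe_wopcm_init() does.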