summaryrefslogtreecommitdiff
path: root/include/uapi
diff options
context:
space:
mode:
authorDave Airlie <airlied@redhat.com>2023-03-22 03:35:45 +0300
committerDave Airlie <airlied@redhat.com>2023-03-22 03:35:46 +0300
commitd36d68fd1925d33066d52468b7c7c6aca6521248 (patch)
tree7c5db4e1e9e05d5aee34c2786aea3f5222e6521c /include/uapi
parentd240daa2c40d384aa01d68163ce5c12625b92d10 (diff)
parent75b445753047872a69709cfba7e3939660f0ecc1 (diff)
downloadlinux-d36d68fd1925d33066d52468b7c7c6aca6521248.tar.xz
Merge tag 'drm-habanalabs-next-2023-03-20' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into drm-next
This tag contains habanalabs driver and accel changes for v6.4: - uAPI changes: - Add opcodes to the CS ioctl to allow user to stall/resume specific engines inside Gaudi2. This is to allow the user to perform power testing/measurements when training different topologies. - Expose in the INFO ioctl the amount of device memory that the driver and f/w reserve for themselves. - Expose in the INFO ioctl a bit-mask of the available rotator engines in Gaudi2. This is to align with other engines that are already exposed. - Expose in the INFO ioctl the register's address of the f/w that should be used to trigger interrupts from within the user's code running in the compute engines. - Add a critical-event bit in the eventfd bitmask so the user will know the event that was received was critical, and a reset will now occur - Expose in the INFO ioctl two new opcodes to fetch information on h/w and f/w events. The events recorded are the events that were reported in the eventfd. - New features and improvements: - Add a dedicated interrupt ID in MSI-X in the device to the notification of an unexpected user-related event in Gaudi2. Handle it in the driver by reporting this event. - Allow the user to fetch the device memory current usage even when the device is undergoing compute-reset (a reset type that only clears the compute engines). - Enable graceful reset mechanism for compute-reset. This will give the user a few seconds before the device is reset. For example, the user can, during that time, perform certain device operations (dump data for debug) or close the device in an orderly fashion. - Align the decoder with the rest of the engines in regard to notification to the user about interrupts and in regard to performing graceful reset when needed (instead of immediate reset). - Add support for assert interrupt from the TPC engine. - Get the reset type that is necessary to perform per event from the auto-generated irq_map array. - Print the specific reason why a device is still in use when notifying to the user about it (after the user closed the device's FD). - Move to threaded IRQ when handling interrupts of workload completions. - Firmware related fixes: - Fix RAZWI event handler to match newest f/w version. - Read error cause register in dma core events because the f/w doesn't do that. - Increase maximum time to wait for completion of Gaudi2 reset due to f/w bug. - Align to the latest firmware specs. - Enforce the release order of the compute device and dma-buf. i.e increment the device file refcount for any dma-buf that was exported for that device. This will make sure the compute device release function won't be called until the user closes all the FDs of the relevant dma-bufs. Without this change, closing the device's FD before/without closing the dma-buf's FD would always lead to hard-reset of the device. - Fix a link in the drm documentation to correctly point to the accel section. - Compilation warnings cleanups - Misc bug fixes and code cleanups Signed-off-by: Dave Airlie <airlied@redhat.com> # -----BEGIN PGP SIGNATURE----- # # iQEzBAABCgAdFiEE7TEboABC71LctBLFZR1NuKta54AFAmQYfcAACgkQZR1NuKta # 54DB4Af/SuiHZkVXwr+yHPv9El726rz9ZQD7mQtzNmehWGonwAvz15yqocNMUSbF # JbqE/vrZjvbXrP1Uv5UrlRVdnFHSPV18VnHU4BMS/WOm19SsR6vZ0QOXOoa6/AUb # w+kF3D//DbFI4/mTGfpH5/pzwu51ti8aVktosPFlHIa8iI8CB4/4IV+ivQ8UW4oK # HyDRkIvHdRmER7vGOfhwhsr4zdqSlJBYrv3C3Z1dkSYBPW/5ICbiM1UlKycwdYKI # cajQBSdUQwUCWnI+i8RmSy3kjNO6OE4XRUvTv89F2bQeyK/1rJLG2m2xZR/Ml/o5 # 7Cgvbn0hWZyeqe7OObYiBlSOBSehCA== # =wclm # -----END PGP SIGNATURE----- # gpg: Signature made Tue 21 Mar 2023 01:37:36 AEST # gpg: using RSA key ED311BA00042EF52DCB412C5651D4DB8AB5AE780 # gpg: Can't check signature: No public key From: Oded Gabbay <ogabbay@kernel.org> Link: https://patchwork.freedesktop.org/patch/msgid/20230320154026.GA766126@ogabbay-vm-u20.habana-labs.com
Diffstat (limited to 'include/uapi')
-rw-r--r--include/uapi/drm/habanalabs_accel.h102
1 files changed, 91 insertions, 11 deletions
diff --git a/include/uapi/drm/habanalabs_accel.h b/include/uapi/drm/habanalabs_accel.h
index 331567ec9e79..c139aab17c8a 100644
--- a/include/uapi/drm/habanalabs_accel.h
+++ b/include/uapi/drm/habanalabs_accel.h
@@ -723,6 +723,10 @@ enum hl_server_type {
* HL_NOTIFIER_EVENT_GENERAL_HW_ERR - Indicates device HW error
* HL_NOTIFIER_EVENT_RAZWI - Indicates razwi happened
* HL_NOTIFIER_EVENT_PAGE_FAULT - Indicates page fault happened
+ * HL_NOTIFIER_EVENT_CRITICAL_HW_ERR - Indicates a HW error that requires SW abort and
+ * HW reset
+ * HL_NOTIFIER_EVENT_CRITICAL_FW_ERR - Indicates a FW error that requires SW abort and
+ * HW reset
*/
#define HL_NOTIFIER_EVENT_TPC_ASSERT (1ULL << 0)
#define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE (1ULL << 1)
@@ -733,6 +737,8 @@ enum hl_server_type {
#define HL_NOTIFIER_EVENT_GENERAL_HW_ERR (1ULL << 6)
#define HL_NOTIFIER_EVENT_RAZWI (1ULL << 7)
#define HL_NOTIFIER_EVENT_PAGE_FAULT (1ULL << 8)
+#define HL_NOTIFIER_EVENT_CRITICL_HW_ERR (1ULL << 9)
+#define HL_NOTIFIER_EVENT_CRITICL_FW_ERR (1ULL << 10)
/* Opcode for management ioctl
*
@@ -790,6 +796,8 @@ enum hl_server_type {
* HL_INFO_PAGE_FAULT_EVENT - Retrieve parameters of captured page fault.
* HL_INFO_USER_MAPPINGS - Retrieve user mappings, captured after page fault event.
* HL_INFO_FW_GENERIC_REQ - Send generic request to FW.
+ * HL_INFO_HW_ERR_EVENT - Retrieve information on the reported HW error.
+ * HL_INFO_FW_ERR_EVENT - Retrieve information on the reported FW error.
*/
#define HL_INFO_HW_IP_INFO 0
#define HL_INFO_HW_EVENTS 1
@@ -824,6 +832,8 @@ enum hl_server_type {
#define HL_INFO_PAGE_FAULT_EVENT 33
#define HL_INFO_USER_MAPPINGS 34
#define HL_INFO_FW_GENERIC_REQ 35
+#define HL_INFO_HW_ERR_EVENT 36
+#define HL_INFO_FW_ERR_EVENT 37
#define HL_INFO_VERSION_MAX_LEN 128
#define HL_INFO_CARD_NAME_MAX_LEN 16
@@ -875,6 +885,12 @@ enum hl_server_type {
* application to use. Relevant for Gaudi2 and later.
* @device_mem_alloc_default_page_size: default page size used in device memory allocation.
* @revision_id: PCI revision ID of the ASIC.
+ * @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host.
+ * @rotator_enabled_mask: Bit-mask that represents which rotators are enabled.
+ * Relevant for Gaudi3 and later.
+ * @engine_core_interrupt_reg_addr: interrupt register address for engine core to use
+ * in order to raise events toward FW.
+ * @reserved_dram_size: DRAM size reserved for driver and firmware.
*/
struct hl_info_hw_ip_info {
__u64 sram_base_address;
@@ -902,15 +918,20 @@ struct hl_info_hw_ip_info {
__u64 dram_page_size;
__u32 edma_enabled_mask;
__u16 number_of_user_interrupts;
- __u16 pad2;
- __u64 reserved4;
+ __u8 reserved1;
+ __u8 reserved2;
+ __u64 reserved3;
__u64 device_mem_alloc_default_page_size;
+ __u64 reserved4;
__u64 reserved5;
- __u64 reserved6;
- __u32 reserved7;
- __u8 reserved8;
+ __u32 reserved6;
+ __u8 reserved7;
__u8 revision_id;
- __u8 pad[2];
+ __u16 tpc_interrupt_id;
+ __u32 rotator_enabled_mask;
+ __u32 reserved9;
+ __u64 engine_core_interrupt_reg_addr;
+ __u64 reserved_dram_size;
};
struct hl_info_dram_usage {
@@ -1162,6 +1183,39 @@ struct hl_info_undefined_opcode_event {
};
/**
+ * struct hl_info_hw_err_event - info about HW error
+ * @timestamp: timestamp of error occurrence
+ * @event_id: The async event ID (specific to each device type).
+ * @pad: size padding for u64 granularity.
+ */
+struct hl_info_hw_err_event {
+ __s64 timestamp;
+ __u16 event_id;
+ __u16 pad[3];
+};
+
+/* FW error definition for event_type in struct hl_info_fw_err_event */
+enum hl_info_fw_err_type {
+ HL_INFO_FW_HEARTBEAT_ERR,
+ HL_INFO_FW_REPORTED_ERR,
+};
+
+/**
+ * struct hl_info_fw_err_event - info about FW error
+ * @timestamp: time-stamp of error occurrence
+ * @err_type: The type of event as defined in hl_info_fw_err_type.
+ * @event_id: The async event ID (specific to each device type, applicable only when event type is
+ * HL_INFO_FW_REPORTED_ERR).
+ * @pad: size padding for u64 granularity.
+ */
+struct hl_info_fw_err_event {
+ __s64 timestamp;
+ __u16 err_type;
+ __u16 event_id;
+ __u32 pad;
+};
+
+/**
* struct hl_info_dev_memalloc_page_sizes - valid page sizes in device mem alloc information.
* @page_order_bitmask: bitmap in which a set bit represents the order of the supported page size
* (e.g. 0x2100000 means that 1MB and 32MB pages are supported).
@@ -1486,17 +1540,31 @@ struct hl_cs_chunk {
*/
#define HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES 0x8000
+/*
+ * The engines CS is merged into the existing CS ioctls.
+ * Use it to control engines modes.
+ */
+#define HL_CS_FLAGS_ENGINES_COMMAND 0x10000
+
#define HL_CS_STATUS_SUCCESS 0
#define HL_MAX_JOBS_PER_CS 512
-/* HL_ENGINE_CORE_ values
+/*
+ * enum hl_engine_command - engine command
*
- * HL_ENGINE_CORE_HALT: engine core halt
- * HL_ENGINE_CORE_RUN: engine core run
+ * @HL_ENGINE_CORE_HALT: engine core halt
+ * @HL_ENGINE_CORE_RUN: engine core run
+ * @HL_ENGINE_STALL: user engine/s stall
+ * @HL_ENGINE_RESUME: user engine/s resume
*/
-#define HL_ENGINE_CORE_HALT (1 << 0)
-#define HL_ENGINE_CORE_RUN (1 << 1)
+enum hl_engine_command {
+ HL_ENGINE_CORE_HALT = 1,
+ HL_ENGINE_CORE_RUN = 2,
+ HL_ENGINE_STALL = 3,
+ HL_ENGINE_RESUME = 4,
+ HL_ENGINE_COMMAND_MAX
+};
struct hl_cs_in {
@@ -1520,6 +1588,18 @@ struct hl_cs_in {
/* the core command to be sent towards engine cores */
__u32 core_command;
};
+
+ /* Valid only when HL_CS_FLAGS_ENGINES_COMMAND is set */
+ struct {
+ /* this holds address of array of uint32 for engines */
+ __u64 engines;
+
+ /* number of engines in engines array */
+ __u32 num_engines;
+
+ /* the engine command to be sent towards engines */
+ __u32 engine_command;
+ };
};
union {