diff options
Diffstat (limited to 'drivers/misc/habanalabs/common/habanalabs.h')
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs.h | 280 |
1 files changed, 245 insertions, 35 deletions
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 6579f8767abd..6b3cdd7e068a 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -48,6 +48,7 @@ #define HL_PENDING_RESET_LONG_SEC 60 #define HL_HARD_RESET_MAX_TIMEOUT 120 +#define HL_PLDM_HARD_RESET_MAX_TIMEOUT (HL_HARD_RESET_MAX_TIMEOUT * 3) #define HL_DEVICE_TIMEOUT_USEC 1000000 /* 1 s */ @@ -115,10 +116,18 @@ enum hl_mmu_page_table_location { * * - HL_RESET_HEARTBEAT * Set if reset is due to heartbeat + * + * - HL_RESET_TDR + * Set if reset is due to TDR + * + * - HL_RESET_DEVICE_RELEASE + * Set if reset is due to device release */ #define HL_RESET_HARD (1 << 0) #define HL_RESET_FROM_RESET_THREAD (1 << 1) #define HL_RESET_HEARTBEAT (1 << 2) +#define HL_RESET_TDR (1 << 3) +#define HL_RESET_DEVICE_RELEASE (1 << 4) #define HL_MAX_SOBS_PER_MONITOR 8 @@ -178,12 +187,14 @@ enum hl_pci_match_mode { /** * enum hl_fw_component - F/W components to read version through registers. - * @FW_COMP_UBOOT: u-boot. + * @FW_COMP_BOOT_FIT: boot fit. * @FW_COMP_PREBOOT: preboot. + * @FW_COMP_LINUX: linux. */ enum hl_fw_component { - FW_COMP_UBOOT, - FW_COMP_PREBOOT + FW_COMP_BOOT_FIT, + FW_COMP_PREBOOT, + FW_COMP_LINUX, }; /** @@ -420,12 +431,24 @@ struct hl_mmu_properties { * @cb_pool_cb_size: size of each CB in the CB pool. * @max_pending_cs: maximum of concurrent pending command submissions * @max_queues: maximum amount of queues in the system - * @fw_boot_cpu_security_map: bitmap representation of boot cpu security status - * reported by FW, bit description can be found in - * CPU_BOOT_DEV_STS* - * @fw_app_security_map: bitmap representation of application security status - * reported by FW, bit description can be found in - * CPU_BOOT_DEV_STS* + * @fw_preboot_cpu_boot_dev_sts0: bitmap representation of preboot cpu + * capabilities reported by FW, bit description + * can be found in CPU_BOOT_DEV_STS0 + * @fw_preboot_cpu_boot_dev_sts1: bitmap representation of preboot cpu + * capabilities reported by FW, bit description + * can be found in CPU_BOOT_DEV_STS1 + * @fw_bootfit_cpu_boot_dev_sts0: bitmap representation of boot cpu security + * status reported by FW, bit description can be + * found in CPU_BOOT_DEV_STS0 + * @fw_bootfit_cpu_boot_dev_sts1: bitmap representation of boot cpu security + * status reported by FW, bit description can be + * found in CPU_BOOT_DEV_STS1 + * @fw_app_cpu_boot_dev_sts0: bitmap representation of application security + * status reported by FW, bit description can be + * found in CPU_BOOT_DEV_STS0 + * @fw_app_cpu_boot_dev_sts1: bitmap representation of application security + * status reported by FW, bit description can be + * found in CPU_BOOT_DEV_STS1 * @collective_first_sob: first sync object available for collective use * @collective_first_mon: first monitor available for collective use * @sync_stream_first_sob: first sync object available for sync stream use @@ -438,14 +461,19 @@ struct hl_mmu_properties { * @user_interrupt_count: number of user interrupts. * @tpc_enabled_mask: which TPCs are enabled. * @completion_queues_count: number of completion queues. - * @fw_security_disabled: true if security measures are disabled in firmware, - * false otherwise - * @fw_security_status_valid: security status bits are valid and can be fetched - * from BOOT_DEV_STS0 + * @fw_security_enabled: true if security measures are enabled in firmware, + * false otherwise + * @fw_cpu_boot_dev_sts0_valid: status bits are valid and can be fetched from + * BOOT_DEV_STS0 + * @fw_cpu_boot_dev_sts1_valid: status bits are valid and can be fetched from + * BOOT_DEV_STS1 * @dram_supports_virtual_memory: is there an MMU towards the DRAM * @hard_reset_done_by_fw: true if firmware is handling hard reset flow * @num_functional_hbms: number of functional HBMs in each DCORE. * @iatu_done_by_fw: true if iATU configuration is being done by FW. + * @dynamic_fw_load: is dynamic FW load is supported. + * @gic_interrupts_enable: true if FW is not blocking GIC controller, + * false otherwise. */ struct asic_fixed_properties { struct hw_queue_properties *hw_queues_props; @@ -491,8 +519,12 @@ struct asic_fixed_properties { u32 cb_pool_cb_size; u32 max_pending_cs; u32 max_queues; - u32 fw_boot_cpu_security_map; - u32 fw_app_security_map; + u32 fw_preboot_cpu_boot_dev_sts0; + u32 fw_preboot_cpu_boot_dev_sts1; + u32 fw_bootfit_cpu_boot_dev_sts0; + u32 fw_bootfit_cpu_boot_dev_sts1; + u32 fw_app_cpu_boot_dev_sts0; + u32 fw_app_cpu_boot_dev_sts1; u16 collective_first_sob; u16 collective_first_mon; u16 sync_stream_first_sob; @@ -504,12 +536,15 @@ struct asic_fixed_properties { u16 user_interrupt_count; u8 tpc_enabled_mask; u8 completion_queues_count; - u8 fw_security_disabled; - u8 fw_security_status_valid; + u8 fw_security_enabled; + u8 fw_cpu_boot_dev_sts0_valid; + u8 fw_cpu_boot_dev_sts1_valid; u8 dram_supports_virtual_memory; u8 hard_reset_done_by_fw; u8 num_functional_hbms; u8 iatu_done_by_fw; + u8 dynamic_fw_load; + u8 gic_interrupts_enable; }; /** @@ -750,12 +785,19 @@ struct hl_user_pending_interrupt { * @kernel_address: holds the queue's kernel virtual address * @bus_address: holds the queue's DMA address * @ci: ci inside the queue + * @prev_eqe_index: the index of the previous event queue entry. The index of + * the current entry's index must be +1 of the previous one. + * @check_eqe_index: do we need to check the index of the current entry vs. the + * previous one. This is for backward compatibility with older + * firmwares */ struct hl_eq { struct hl_device *hdev; void *kernel_address; dma_addr_t bus_address; u32 ci; + u32 prev_eqe_index; + bool check_eqe_index; }; @@ -812,6 +854,132 @@ enum div_select_defs { DIV_SEL_DIVIDED_PLL = 3, }; +enum pci_region { + PCI_REGION_CFG, + PCI_REGION_SRAM, + PCI_REGION_DRAM, + PCI_REGION_SP_SRAM, + PCI_REGION_NUMBER, +}; + +/** + * struct pci_mem_region - describe memory region in a PCI bar + * @region_base: region base address + * @region_size: region size + * @bar_size: size of the BAR + * @offset_in_bar: region offset into the bar + * @bar_id: bar ID of the region + * @used: if used 1, otherwise 0 + */ +struct pci_mem_region { + u64 region_base; + u64 region_size; + u64 bar_size; + u32 offset_in_bar; + u8 bar_id; + u8 used; +}; + +/** + * struct static_fw_load_mgr - static FW load manager + * @preboot_version_max_off: max offset to preboot version + * @boot_fit_version_max_off: max offset to boot fit version + * @kmd_msg_to_cpu_reg: register address for KDM->CPU messages + * @cpu_cmd_status_to_host_reg: register address for CPU command status response + * @cpu_boot_status_reg: boot status register + * @cpu_boot_dev_status0_reg: boot device status register 0 + * @cpu_boot_dev_status1_reg: boot device status register 1 + * @boot_err0_reg: boot error register 0 + * @boot_err1_reg: boot error register 1 + * @preboot_version_offset_reg: SRAM offset to preboot version register + * @boot_fit_version_offset_reg: SRAM offset to boot fit version register + * @sram_offset_mask: mask for getting offset into the SRAM + * @cpu_reset_wait_msec: used when setting WFE via kmd_msg_to_cpu_reg + */ +struct static_fw_load_mgr { + u64 preboot_version_max_off; + u64 boot_fit_version_max_off; + u32 kmd_msg_to_cpu_reg; + u32 cpu_cmd_status_to_host_reg; + u32 cpu_boot_status_reg; + u32 cpu_boot_dev_status0_reg; + u32 cpu_boot_dev_status1_reg; + u32 boot_err0_reg; + u32 boot_err1_reg; + u32 preboot_version_offset_reg; + u32 boot_fit_version_offset_reg; + u32 sram_offset_mask; + u32 cpu_reset_wait_msec; +}; + +/** + * struct fw_response - FW response to LKD command + * @ram_offset: descriptor offset into the RAM + * @ram_type: RAM type containing the descriptor (SRAM/DRAM) + * @status: command status + */ +struct fw_response { + u32 ram_offset; + u8 ram_type; + u8 status; +}; + +/** + * struct dynamic_fw_load_mgr - dynamic FW load manager + * @response: FW to LKD response + * @comm_desc: the communication descriptor with FW + * @image_region: region to copy the FW image to + * @fw_image_size: size of FW image to load + * @wait_for_bl_timeout: timeout for waiting for boot loader to respond + */ +struct dynamic_fw_load_mgr { + struct fw_response response; + struct lkd_fw_comms_desc comm_desc; + struct pci_mem_region *image_region; + size_t fw_image_size; + u32 wait_for_bl_timeout; +}; + +/** + * struct fw_image_props - properties of FW image + * @image_name: name of the image + * @src_off: offset in src FW to copy from + * @copy_size: amount of bytes to copy (0 to copy the whole binary) + */ +struct fw_image_props { + char *image_name; + u32 src_off; + u32 copy_size; +}; + +/** + * struct fw_load_mgr - manager FW loading process + * @dynamic_loader: specific structure for dynamic load + * @static_loader: specific structure for static load + * @boot_fit_img: boot fit image properties + * @linux_img: linux image properties + * @cpu_timeout: CPU response timeout in usec + * @boot_fit_timeout: Boot fit load timeout in usec + * @skip_bmc: should BMC be skipped + * @sram_bar_id: SRAM bar ID + * @dram_bar_id: DRAM bar ID + * @linux_loaded: true if linux was loaded so far + */ +struct fw_load_mgr { + union { + struct dynamic_fw_load_mgr dynamic_loader; + struct static_fw_load_mgr static_loader; + }; + struct fw_image_props boot_fit_img; + struct fw_image_props linux_img; + u32 cpu_timeout; + u32 boot_fit_timeout; + u8 skip_bmc; + u8 sram_bar_id; + u8 dram_bar_id; + u8 linux_loaded; +}; + /** * struct hl_asic_funcs - ASIC specific functions that are can be called from * common code. @@ -901,8 +1069,6 @@ enum div_select_defs { * @ctx_fini: context dependent cleanup. * @get_clk_rate: Retrieve the ASIC current and maximum clock rate in MHz * @get_queue_id_for_cq: Get the H/W queue id related to the given CQ index. - * @read_device_fw_version: read the device's firmware versions that are - * contained in registers * @load_firmware_to_device: load the firmware to the device's memory * @load_boot_fit_to_device: load boot fit to device's memory * @get_signal_cb_size: Get signal CB size. @@ -933,6 +1099,8 @@ enum div_select_defs { * @get_msi_info: Retrieve asic-specific MSI ID of the f/w async event * @map_pll_idx_to_fw_idx: convert driver specific per asic PLL index to * generic f/w compatible PLL Indexes + * @init_firmware_loader: initialize data for FW loader. + * @init_cpu_scrambler_dram: Enable CPU specific DRAM scrambling */ struct hl_asic_funcs { int (*early_init)(struct hl_device *hdev); @@ -1006,7 +1174,7 @@ struct hl_asic_funcs { int (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard, u32 flags); int (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard, - u32 asid, u64 va, u64 size); + u32 flags, u32 asid, u64 va, u64 size); int (*send_heartbeat)(struct hl_device *hdev); void (*set_clock_gating)(struct hl_device *hdev); void (*disable_clock_gating)(struct hl_device *hdev); @@ -1030,8 +1198,6 @@ struct hl_asic_funcs { void (*ctx_fini)(struct hl_ctx *ctx); int (*get_clk_rate)(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk); u32 (*get_queue_id_for_cq)(struct hl_device *hdev, u32 cq_idx); - int (*read_device_fw_version)(struct hl_device *hdev, - enum hl_fw_component fwc); int (*load_firmware_to_device)(struct hl_device *hdev); int (*load_boot_fit_to_device)(struct hl_device *hdev); u32 (*get_signal_cb_size)(struct hl_device *hdev); @@ -1056,8 +1222,10 @@ struct hl_asic_funcs { int (*hw_block_mmap)(struct hl_device *hdev, struct vm_area_struct *vma, u32 block_id, u32 block_size); void (*enable_events_from_fw)(struct hl_device *hdev); - void (*get_msi_info)(u32 *table); + void (*get_msi_info)(__le32 *table); int (*map_pll_idx_to_fw_idx)(u32 pll_idx); + void (*init_firmware_loader)(struct hl_device *hdev); + void (*init_cpu_scrambler_dram)(struct hl_device *hdev); }; @@ -1262,6 +1430,7 @@ struct hl_userptr { * @staged_sequence: the sequence of the staged submission this CS is part of, * relevant only if staged_cs is set. * @timeout_jiffies: cs timeout in jiffies. + * @submission_time_jiffies: submission time of the cs * @type: CS_TYPE_*. * @submitted: true if CS was submitted to H/W. * @completed: true if CS was completed by device. @@ -1274,6 +1443,8 @@ struct hl_userptr { * @staged_first: true if this is the first staged CS and we need to receive * timeout for this CS. * @staged_cs: true if this CS is part of a staged submission. + * @skip_reset_on_timeout: true if we shall not reset the device in case + * timeout occurs (debug scenario). */ struct hl_cs { u16 *jobs_in_queue_cnt; @@ -1291,6 +1462,7 @@ struct hl_cs { u64 sequence; u64 staged_sequence; u64 timeout_jiffies; + u64 submission_time_jiffies; enum hl_cs_type type; u8 submitted; u8 completed; @@ -1301,6 +1473,7 @@ struct hl_cs { u8 staged_last; u8 staged_first; u8 staged_cs; + u8 skip_reset_on_timeout; }; /** @@ -1922,7 +2095,7 @@ struct hl_mmu_funcs { * @kernel_queues: array of hl_hw_queue. * @cs_mirror_list: CS mirror list for TDR. * @cs_mirror_lock: protects cs_mirror_list. - * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CGs. + * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CBs. * @event_queue: event queue for IRQ from CPU-CP. * @dma_pool: DMA pool for small allocations. * @cpu_accessible_dma_mem: Host <-> CPU-CP shared memory CPU address. @@ -1954,6 +2127,8 @@ struct hl_mmu_funcs { * @aggregated_cs_counters: aggregated cs counters among all contexts * @mmu_priv: device-specific MMU data. * @mmu_func: device-related MMU functions. + * @fw_loader: FW loader manager. + * @pci_mem_region: array of memory regions in the PCI * @dram_used_mem: current DRAM memory consumption. * @timeout_jiffies: device CS timeout value. * @max_power: the max power of the device, as configured by the sysadmin. This @@ -1968,6 +2143,11 @@ struct hl_mmu_funcs { * the error will be ignored by the driver during * device initialization. Mainly used to debug and * workaround firmware bugs + * @last_successful_open_jif: timestamp (jiffies) of the last successful + * device open. + * @last_open_session_duration_jif: duration (jiffies) of the last device open + * session. + * @open_counter: number of successful device open operations. * @in_reset: is device in reset flow. * @curr_pll_profile: current PLL profile. * @card_type: Various ASICs have several card types. This indicates the card @@ -2007,6 +2187,8 @@ struct hl_mmu_funcs { * @collective_mon_idx: helper index for collective initialization * @supports_coresight: is CoreSight supported. * @supports_soft_reset: is soft reset supported. + * @allow_external_soft_reset: true if soft reset initiated by user or TDR is + * allowed. * @supports_cb_mapping: is mapping a CB to the device's MMU supported. * @needs_reset: true if reset_on_lockup is false and device should be reset * due to lockup. @@ -2015,6 +2197,14 @@ struct hl_mmu_funcs { * @device_fini_pending: true if device_fini was called and might be * waiting for the reset thread to finish * @supports_staged_submission: true if staged submissions are supported + * @curr_reset_cause: saves an enumerated reset cause when a hard reset is + * triggered, and cleared after it is shared with preboot. + * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to + * complete instead. + * @device_cpu_is_halted: Flag to indicate whether the device CPU was already + * halted. We can't halt it again because the COMMS + * protocol will throw an error. Relevant only for + * cases where Linux was not loaded to device CPU */ struct hl_device { struct pci_dev *pdev; @@ -2079,11 +2269,18 @@ struct hl_device { struct hl_mmu_priv mmu_priv; struct hl_mmu_funcs mmu_func[MMU_NUM_PGT_LOCATIONS]; + struct fw_load_mgr fw_loader; + + struct pci_mem_region pci_mem_region[PCI_REGION_NUMBER]; + atomic64_t dram_used_mem; u64 timeout_jiffies; u64 max_power; u64 clock_gating_mask; u64 boot_error_status_mask; + u64 last_successful_open_jif; + u64 last_open_session_duration_jif; + u64 open_counter; atomic_t in_reset; enum hl_pll_frequency curr_pll_profile; enum cpucp_card_types card_type; @@ -2116,11 +2313,15 @@ struct hl_device { u8 collective_mon_idx; u8 supports_coresight; u8 supports_soft_reset; + u8 allow_external_soft_reset; u8 supports_cb_mapping; u8 needs_reset; u8 process_kill_trial_cnt; u8 device_fini_pending; u8 supports_staged_submission; + u8 curr_reset_cause; + u8 skip_reset_on_timeout; + u8 device_cpu_is_halted; /* Parameters for bring-up */ u64 nic_ports_mask; @@ -2138,6 +2339,7 @@ struct hl_device { u8 rl_enable; u8 reset_on_preboot_fail; u8 reset_upon_device_release; + u8 reset_if_device_not_idle; }; @@ -2384,11 +2586,13 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, void *vaddr); int hl_fw_send_heartbeat(struct hl_device *hdev); int hl_fw_cpucp_info_get(struct hl_device *hdev, - u32 cpu_security_boot_status_reg, - u32 boot_err0_reg); + u32 sts_boot_dev_sts0_reg, + u32 sts_boot_dev_sts1_reg, u32 boot_err0_reg, + u32 boot_err1_reg); int hl_fw_cpucp_handshake(struct hl_device *hdev, - u32 cpu_security_boot_status_reg, - u32 boot_err0_reg); + u32 sts_boot_dev_sts0_reg, + u32 sts_boot_dev_sts1_reg, u32 boot_err0_reg, + u32 boot_err1_reg); int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size); int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev, struct hl_info_pci_counters *counters); @@ -2399,14 +2603,17 @@ int get_used_pll_index(struct hl_device *hdev, u32 input_pll_index, int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u32 pll_index, u16 *pll_freq_arr); int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power); -int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg, - u32 msg_to_cpu_reg, u32 cpu_msg_status_reg, - u32 cpu_security_boot_status_reg, u32 boot_err0_reg, - bool skip_bmc, u32 cpu_timeout, u32 boot_fit_timeout); +void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev); +void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev); +int hl_fw_init_cpu(struct hl_device *hdev); int hl_fw_read_preboot_status(struct hl_device *hdev, u32 cpu_boot_status_reg, - u32 cpu_security_boot_status_reg, u32 boot_err0_reg, - u32 timeout); - + u32 sts_boot_dev_sts0_reg, + u32 sts_boot_dev_sts1_reg, u32 boot_err0_reg, + u32 boot_err1_reg, u32 timeout); +int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev, + struct fw_load_mgr *fw_loader, + enum comms_cmd cmd, unsigned int size, + bool wait_ok, u32 timeout); int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3], bool is_wc[3]); int hl_pci_elbi_read(struct hl_device *hdev, u64 addr, u32 *data); @@ -2415,6 +2622,7 @@ int hl_pci_set_inbound_region(struct hl_device *hdev, u8 region, struct hl_inbound_pci_region *pci_region); int hl_pci_set_outbound_region(struct hl_device *hdev, struct hl_outbound_pci_region *pci_region); +enum pci_region hl_get_pci_memory_region(struct hl_device *hdev, u64 addr); int hl_pci_init(struct hl_device *hdev); void hl_pci_fini(struct hl_device *hdev); @@ -2443,6 +2651,8 @@ int hl_set_voltage(struct hl_device *hdev, int hl_set_current(struct hl_device *hdev, int sensor_index, u32 attr, long value); void hl_release_pending_user_interrupts(struct hl_device *hdev); +int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx, + struct hl_hw_sob **hw_sob, u32 count); #ifdef CONFIG_DEBUG_FS |