diff options
Diffstat (limited to 'drivers/misc/habanalabs/include/common/cpucp_if.h')
-rw-r--r-- | drivers/misc/habanalabs/include/common/cpucp_if.h | 297 |
1 files changed, 279 insertions, 18 deletions
diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h index 38e44b6cf581..abf40e1c4965 100644 --- a/drivers/misc/habanalabs/include/common/cpucp_if.h +++ b/drivers/misc/habanalabs/include/common/cpucp_if.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 * - * Copyright 2020-2021 HabanaLabs, Ltd. + * Copyright 2020-2022 HabanaLabs, Ltd. * All Rights Reserved. * */ @@ -68,7 +68,8 @@ struct hl_eq_ecc_data { __le64 ecc_address; __le64 ecc_syndrom; __u8 memory_wrapper_idx; - __u8 pad[7]; + __u8 is_critical; + __u8 pad[6]; }; enum hl_sm_sei_cause { @@ -98,27 +99,265 @@ struct hl_eq_fw_alive { __u8 pad[7]; }; -enum hl_pcie_addr_dec_cause { - PCIE_ADDR_DEC_HBW_ERR_RESP, - PCIE_ADDR_DEC_LBW_ERR_RESP, - PCIE_ADDR_DEC_TLP_BLOCKED_BY_RR +struct hl_eq_intr_cause { + __le64 intr_cause_data; }; -struct hl_eq_pcie_addr_dec_data { - /* enum hl_pcie_addr_dec_cause */ - __u8 addr_dec_cause; - __u8 pad[7]; +struct hl_eq_pcie_drain_ind_data { + struct hl_eq_intr_cause intr_cause; + __le64 drain_wr_addr_lbw; + __le64 drain_rd_addr_lbw; + __le64 drain_wr_addr_hbw; + __le64 drain_rd_addr_hbw; +}; + +struct hl_eq_razwi_lbw_info_regs { + __le32 rr_aw_razwi_reg; + __le32 rr_aw_razwi_id_reg; + __le32 rr_ar_razwi_reg; + __le32 rr_ar_razwi_id_reg; +}; + +struct hl_eq_razwi_hbw_info_regs { + __le32 rr_aw_razwi_hi_reg; + __le32 rr_aw_razwi_lo_reg; + __le32 rr_aw_razwi_id_reg; + __le32 rr_ar_razwi_hi_reg; + __le32 rr_ar_razwi_lo_reg; + __le32 rr_ar_razwi_id_reg; +}; + +/* razwi_happened masks */ +#define RAZWI_HAPPENED_HBW 0x1 +#define RAZWI_HAPPENED_LBW 0x2 +#define RAZWI_HAPPENED_AW 0x4 +#define RAZWI_HAPPENED_AR 0x8 + +struct hl_eq_razwi_info { + __le32 razwi_happened_mask; + union { + struct hl_eq_razwi_lbw_info_regs lbw; + struct hl_eq_razwi_hbw_info_regs hbw; + }; + __le32 pad; +}; + +struct hl_eq_razwi_with_intr_cause { + struct hl_eq_razwi_info razwi_info; + struct hl_eq_intr_cause intr_cause; +}; + +#define HBM_CA_ERR_CMD_LIFO_LEN 8 +#define HBM_RD_ERR_DATA_LIFO_LEN 8 +#define HBM_WR_PAR_CMD_LIFO_LEN 11 + +enum hl_hbm_sei_cause { + /* Command/address parity error event is split into 2 events due to + * size limitation: ODD suffix for odd HBM CK_t cycles and EVEN suffix + * for even HBM CK_t cycles + */ + HBM_SEI_CMD_PARITY_EVEN, + HBM_SEI_CMD_PARITY_ODD, + /* Read errors can be reflected as a combination of SERR/DERR/parity + * errors. Therefore, we define one event for all read error types. + * LKD will perform further proccessing. + */ + HBM_SEI_READ_ERR, + HBM_SEI_WRITE_DATA_PARITY_ERR, + HBM_SEI_CATTRIP, + HBM_SEI_MEM_BIST_FAIL, + HBM_SEI_DFI, + HBM_SEI_INV_TEMP_READ_OUT, + HBM_SEI_BIST_FAIL, +}; + +/* Masks for parsing hl_hbm_sei_headr fields */ +#define HBM_ECC_SERR_CNTR_MASK 0xFF +#define HBM_ECC_DERR_CNTR_MASK 0xFF00 +#define HBM_RD_PARITY_CNTR_MASK 0xFF0000 + +/* HBM index and MC index are known by the event_id */ +struct hl_hbm_sei_header { + union { + /* relevant only in case of HBM read error */ + struct { + __u8 ecc_serr_cnt; + __u8 ecc_derr_cnt; + __u8 read_par_cnt; + __u8 reserved; + }; + /* All other cases */ + __le32 cnt; + }; + __u8 sei_cause; /* enum hl_hbm_sei_cause */ + __u8 mc_channel; /* range: 0-3 */ + __u8 mc_pseudo_channel; /* range: 0-7 */ + __u8 is_critical; +}; + +#define HBM_RD_ADDR_SID_SHIFT 0 +#define HBM_RD_ADDR_SID_MASK 0x1 +#define HBM_RD_ADDR_BG_SHIFT 1 +#define HBM_RD_ADDR_BG_MASK 0x6 +#define HBM_RD_ADDR_BA_SHIFT 3 +#define HBM_RD_ADDR_BA_MASK 0x18 +#define HBM_RD_ADDR_COL_SHIFT 5 +#define HBM_RD_ADDR_COL_MASK 0x7E0 +#define HBM_RD_ADDR_ROW_SHIFT 11 +#define HBM_RD_ADDR_ROW_MASK 0x3FFF800 + +struct hbm_rd_addr { + union { + /* bit fields are only for FW use */ + struct { + u32 dbg_rd_err_addr_sid:1; + u32 dbg_rd_err_addr_bg:2; + u32 dbg_rd_err_addr_ba:2; + u32 dbg_rd_err_addr_col:6; + u32 dbg_rd_err_addr_row:15; + u32 reserved:6; + }; + __le32 rd_addr_val; + }; +}; + +#define HBM_RD_ERR_BEAT_SHIFT 2 +/* dbg_rd_err_misc fields: */ +/* Read parity is calculated per DW on every beat */ +#define HBM_RD_ERR_PAR_ERR_BEAT0_SHIFT 0 +#define HBM_RD_ERR_PAR_ERR_BEAT0_MASK 0x3 +#define HBM_RD_ERR_PAR_DATA_BEAT0_SHIFT 8 +#define HBM_RD_ERR_PAR_DATA_BEAT0_MASK 0x300 +/* ECC is calculated per PC on every beat */ +#define HBM_RD_ERR_SERR_BEAT0_SHIFT 16 +#define HBM_RD_ERR_SERR_BEAT0_MASK 0x10000 +#define HBM_RD_ERR_DERR_BEAT0_SHIFT 24 +#define HBM_RD_ERR_DERR_BEAT0_MASK 0x100000 + +struct hl_eq_hbm_sei_read_err_intr_info { + /* DFI_RD_ERR_REP_ADDR */ + struct hbm_rd_addr dbg_rd_err_addr; + /* DFI_RD_ERR_REP_ERR */ + union { + struct { + /* bit fields are only for FW use */ + u32 dbg_rd_err_par:8; + u32 dbg_rd_err_par_data:8; + u32 dbg_rd_err_serr:4; + u32 dbg_rd_err_derr:4; + u32 reserved:8; + }; + __le32 dbg_rd_err_misc; + }; + /* DFI_RD_ERR_REP_DM */ + __le32 dbg_rd_err_dm; + /* DFI_RD_ERR_REP_SYNDROME */ + __le32 dbg_rd_err_syndrome; + /* DFI_RD_ERR_REP_DATA */ + __le32 dbg_rd_err_data[HBM_RD_ERR_DATA_LIFO_LEN]; +}; + +struct hl_eq_hbm_sei_ca_par_intr_info { + /* 14 LSBs */ + __le16 dbg_row[HBM_CA_ERR_CMD_LIFO_LEN]; + /* 18 LSBs */ + __le32 dbg_col[HBM_CA_ERR_CMD_LIFO_LEN]; +}; + +#define WR_PAR_LAST_CMD_COL_SHIFT 0 +#define WR_PAR_LAST_CMD_COL_MASK 0x3F +#define WR_PAR_LAST_CMD_BG_SHIFT 6 +#define WR_PAR_LAST_CMD_BG_MASK 0xC0 +#define WR_PAR_LAST_CMD_BA_SHIFT 8 +#define WR_PAR_LAST_CMD_BA_MASK 0x300 +#define WR_PAR_LAST_CMD_SID_SHIFT 10 +#define WR_PAR_LAST_CMD_SID_MASK 0x400 + +/* Row address isn't latched */ +struct hbm_sei_wr_cmd_address { + /* DFI_DERR_LAST_CMD */ + union { + struct { + /* bit fields are only for FW use */ + u32 col:6; + u32 bg:2; + u32 ba:2; + u32 sid:1; + u32 reserved:21; + }; + __le32 dbg_wr_cmd_addr; + }; +}; + +struct hl_eq_hbm_sei_wr_par_intr_info { + /* entry 0: WR command address from the 1st cycle prior to the error + * entry 1: WR command address from the 2nd cycle prior to the error + * and so on... + */ + struct hbm_sei_wr_cmd_address dbg_last_wr_cmds[HBM_WR_PAR_CMD_LIFO_LEN]; + /* derr[0:1] - 1st HBM cycle DERR output + * derr[2:3] - 2nd HBM cycle DERR output + */ + __u8 dbg_derr; + /* extend to reach 8B */ + __u8 pad[3]; +}; + +/* + * this struct represents the following sei causes: + * command parity, ECC double error, ECC single error, dfi error, cattrip, + * temperature read-out, read parity error and write parity error. + * some only use the header while some have extra data. + */ +struct hl_eq_hbm_sei_data { + struct hl_hbm_sei_header hdr; + union { + struct hl_eq_hbm_sei_ca_par_intr_info ca_parity_even_info; + struct hl_eq_hbm_sei_ca_par_intr_info ca_parity_odd_info; + struct hl_eq_hbm_sei_read_err_intr_info read_err_info; + struct hl_eq_hbm_sei_wr_par_intr_info wr_parity_info; + }; +}; + +/* Engine/farm arc interrupt type */ +enum hl_engine_arc_interrupt_type { + /* Qman/farm ARC DCCM QUEUE FULL interrupt type */ + ENGINE_ARC_DCCM_QUEUE_FULL_IRQ = 1 +}; + +/* Data structure specifies details of payload of DCCM QUEUE FULL interrupt */ +struct hl_engine_arc_dccm_queue_full_irq { + /* Queue index value which caused DCCM QUEUE FULL */ + __le32 queue_index; + __le32 pad; +}; + +/* Data structure specifies details of QM/FARM ARC interrupt */ +struct hl_eq_engine_arc_intr_data { + /* ARC engine id e.g. DCORE0_TPC0_QM_ARC, DCORE0_TCP1_QM_ARC */ + __le32 engine_id; + __le32 intr_type; /* enum hl_engine_arc_interrupt_type */ + /* More info related to the interrupt e.g. queue index + * incase of DCCM_QUEUE_FULL interrupt. + */ + __le64 payload; + __le64 pad[5]; }; struct hl_eq_entry { struct hl_eq_header hdr; union { struct hl_eq_ecc_data ecc_data; - struct hl_eq_hbm_ecc_data hbm_ecc_data; + struct hl_eq_hbm_ecc_data hbm_ecc_data; /* Gaudi1 HBM */ struct hl_eq_sm_sei_data sm_sei_data; struct cpucp_pkt_sync_err pkt_sync_err; struct hl_eq_fw_alive fw_alive; - struct hl_eq_pcie_addr_dec_data pcie_addr_dec_data; + struct hl_eq_intr_cause intr_cause; + struct hl_eq_pcie_drain_ind_data pcie_drain_ind_data; + struct hl_eq_razwi_info razwi_info; + struct hl_eq_razwi_with_intr_cause razwi_with_intr_cause; + struct hl_eq_hbm_sei_data sei_data; /* Gaudi2 HBM */ + struct hl_eq_engine_arc_intr_data arc_data; __le64 data[7]; }; }; @@ -792,10 +1031,23 @@ struct cpucp_security_info { * @infineon_second_stage_version: Infineon 2nd stage DC-DC version. * @dram_size: available DRAM size. * @card_name: card name that will be displayed in HWMON subsystem on the host + * @tpc_binning_mask: TPC binning mask, 1 bit per TPC instance + * (0 = functional, 1 = binned) + * @decoder_binning_mask: Decoder binning mask, 1 bit per decoder instance + * (0 = functional, 1 = binned), maximum 1 per dcore + * @sram_binning: Categorize SRAM functionality + * (0 = fully functional, 1 = lower-half is not functional, + * 2 = upper-half is not functional) * @sec_info: security information * @pll_map: Bit map of supported PLLs for current ASIC version. * @mme_binning_mask: MME binning mask, - * (0 = functional, 1 = binned) + * bits [0:6] <==> dcore0 mme fma + * bits [7:13] <==> dcore1 mme fma + * bits [14:20] <==> dcore0 mme ima + * bits [21:27] <==> dcore1 mme ima + * For each group, if the 6th bit is set then first 5 bits + * represent the col's idx [0-31], otherwise these bits are + * ignored, and col idx 32 is binned. 7th bit is don't care. * @dram_binning_mask: DRAM binning mask, 1 bit per dram instance * (0 = functional 1 = binned) * @memory_repair_flag: eFuse flag indicating memory repair @@ -803,6 +1055,8 @@ struct cpucp_security_info { * (0 = functional 1 = binned) * @xbar_binning_mask: Xbar binning mask, 1 bit per Xbar instance * (0 = functional 1 = binned) + * @interposer_version: Interposer version programmed in eFuse + * @substrate_version: Substrate version programmed in eFuse * @fw_os_version: Firmware OS Version */ struct cpucp_info { @@ -819,16 +1073,18 @@ struct cpucp_info { __le32 infineon_second_stage_version; __le64 dram_size; char card_name[CARD_NAME_MAX_LEN]; - __le64 reserved3; - __le64 reserved4; - __u8 reserved5; + __le64 tpc_binning_mask; + __le64 decoder_binning_mask; + __u8 sram_binning; __u8 dram_binning_mask; __u8 memory_repair_flag; __u8 edma_binning_mask; __u8 xbar_binning_mask; - __u8 pad[3]; + __u8 interposer_version; + __u8 substrate_version; + __u8 reserved2; struct cpucp_security_info sec_info; - __le32 reserved6; + __le32 reserved3; __u8 pll_map[PLL_MAP_LEN]; __le64 mme_binning_mask; __u8 fw_os_version[VERSION_MAX_LEN]; @@ -932,6 +1188,11 @@ struct cpucp_hbm_row_replaced_rows_info { struct cpucp_hbm_row_info replaced_rows[CPUCP_HBM_ROW_REPLACE_MAX]; }; +enum cpu_reset_status { + CPU_RST_STATUS_NA = 0, + CPU_RST_STATUS_SOFT_RST_DONE = 1, +}; + /* * struct dcore_monitor_regs_data - DCORE monitor regs data. * the structure follows sync manager block layout. relevant only to Gaudi. |