diff options
Diffstat (limited to 'drivers/misc/habanalabs/gaudi/gaudi.c')
-rw-r--r-- | drivers/misc/habanalabs/gaudi/gaudi.c | 313 |
1 files changed, 221 insertions, 92 deletions
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 825737dfe381..013c6da2e3ca 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright 2016-2020 HabanaLabs, Ltd. + * Copyright 2016-2021 HabanaLabs, Ltd. * All Rights Reserved. */ @@ -593,26 +593,27 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) else prop->mmu_pgt_size = MMU_PAGE_TABLES_SIZE; prop->mmu_pte_size = HL_PTE_SIZE; - prop->mmu_hop_table_size = HOP_TABLE_SIZE; - prop->mmu_hop0_tables_total_size = HOP0_TABLES_TOTAL_SIZE; + prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE; + prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE; prop->dram_page_size = PAGE_SIZE_2MB; prop->dram_supports_virtual_memory = false; - prop->pmmu.hop0_shift = HOP0_SHIFT; - prop->pmmu.hop1_shift = HOP1_SHIFT; - prop->pmmu.hop2_shift = HOP2_SHIFT; - prop->pmmu.hop3_shift = HOP3_SHIFT; - prop->pmmu.hop4_shift = HOP4_SHIFT; - prop->pmmu.hop0_mask = HOP0_MASK; - prop->pmmu.hop1_mask = HOP1_MASK; - prop->pmmu.hop2_mask = HOP2_MASK; - prop->pmmu.hop3_mask = HOP3_MASK; - prop->pmmu.hop4_mask = HOP4_MASK; + prop->pmmu.hop0_shift = MMU_V1_1_HOP0_SHIFT; + prop->pmmu.hop1_shift = MMU_V1_1_HOP1_SHIFT; + prop->pmmu.hop2_shift = MMU_V1_1_HOP2_SHIFT; + prop->pmmu.hop3_shift = MMU_V1_1_HOP3_SHIFT; + prop->pmmu.hop4_shift = MMU_V1_1_HOP4_SHIFT; + prop->pmmu.hop0_mask = MMU_V1_1_HOP0_MASK; + prop->pmmu.hop1_mask = MMU_V1_1_HOP1_MASK; + prop->pmmu.hop2_mask = MMU_V1_1_HOP2_MASK; + prop->pmmu.hop3_mask = MMU_V1_1_HOP3_MASK; + prop->pmmu.hop4_mask = MMU_V1_1_HOP4_MASK; prop->pmmu.start_addr = VA_HOST_SPACE_START; prop->pmmu.end_addr = (VA_HOST_SPACE_START + VA_HOST_SPACE_SIZE / 2) - 1; prop->pmmu.page_size = PAGE_SIZE_4KB; prop->pmmu.num_hops = MMU_ARCH_5_HOPS; + prop->pmmu.last_mask = LAST_MASK; /* PMMU and HPMMU are the same except of page size */ memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu)); @@ -664,6 +665,8 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) prop->clk_pll_index = HL_GAUDI_MME_PLL; prop->max_freq_value = GAUDI_MAX_CLK_FREQ; + prop->use_get_power_for_reset_history = true; + return 0; } @@ -878,6 +881,11 @@ static int gaudi_fetch_psoc_frequency(struct hl_device *hdev) int rc; if (hdev->asic_prop.fw_security_enabled) { + struct gaudi_device *gaudi = hdev->asic_specific; + + if (!(gaudi->hw_cap_initialized & HW_CAP_CPU_Q)) + return 0; + rc = hl_fw_cpucp_pll_info_get(hdev, HL_GAUDI_CPU_PLL, pll_freq_arr); if (rc) @@ -1273,6 +1281,7 @@ static int gaudi_collective_wait_init_cs(struct hl_cs *cs) container_of(cs->signal_fence, struct hl_cs_compl, base_fence); struct hl_cs_compl *cs_cmpl = container_of(cs->fence, struct hl_cs_compl, base_fence); + struct hl_cs_encaps_sig_handle *handle = cs->encaps_sig_hdl; struct gaudi_collective_properties *cprop; u32 stream, queue_id, sob_group_offset; struct gaudi_device *gaudi; @@ -1285,10 +1294,16 @@ static int gaudi_collective_wait_init_cs(struct hl_cs *cs) gaudi = hdev->asic_specific; cprop = &gaudi->collective_props; - /* In encaps signals case the SOB info will be retrieved from - * the handle in gaudi_collective_slave_init_job. - */ - if (!cs->encaps_signals) { + if (cs->encaps_signals) { + cs_cmpl->hw_sob = handle->hw_sob; + /* at this checkpoint we only need the hw_sob pointer + * for the completion check before start going over the jobs + * of the master/slaves, the sob_value will be taken later on + * in gaudi_collective_slave_init_job depends on each + * job wait offset value. + */ + cs_cmpl->sob_val = 0; + } else { /* copy the SOB id and value of the signal CS */ cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob; cs_cmpl->sob_val = signal_cs_cmpl->sob_val; @@ -1621,6 +1636,8 @@ static int gaudi_late_init(struct hl_device *hdev) */ gaudi_mmu_prepare(hdev, 1); + hdev->asic_funcs->set_pll_profile(hdev, PLL_LAST); + return 0; disable_pci_access: @@ -4006,7 +4023,7 @@ static void gaudi_init_firmware_loader(struct hl_device *hdev) struct fw_load_mgr *fw_loader = &hdev->fw_loader; /* fill common fields */ - fw_loader->linux_loaded = false; + fw_loader->fw_comp_loaded = FW_TYPE_NONE; fw_loader->boot_fit_img.image_name = GAUDI_BOOT_FIT_FILE; fw_loader->linux_img.image_name = GAUDI_LINUX_FW_FILE; fw_loader->cpu_timeout = GAUDI_CPU_TIMEOUT_USEC; @@ -4289,13 +4306,31 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset * via the GIC. Otherwise, we need to use COMMS or the MSG_TO_CPU * registers in case of old F/Ws */ - if (hdev->fw_loader.linux_loaded) { + if (hdev->fw_loader.fw_comp_loaded & FW_TYPE_LINUX) { irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ? mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR : le32_to_cpu(dyn_regs->gic_host_halt_irq); WREG32(irq_handler_offset, gaudi_irq_map_table[GAUDI_EVENT_HALT_MACHINE].cpu_id); + + /* This is a hail-mary attempt to revive the card in the small chance that the + * f/w has experienced a watchdog event, which caused it to return back to preboot. + * In that case, triggering reset through GIC won't help. We need to trigger the + * reset as if Linux wasn't loaded. + * + * We do it only if the reset cause was HB, because that would be the indication + * of such an event. + * + * In case watchdog hasn't expired but we still got HB, then this won't do any + * damage. + */ + if (hdev->reset_info.curr_reset_cause == HL_RESET_CAUSE_HEARTBEAT) { + if (hdev->asic_prop.hard_reset_done_by_fw) + hl_fw_ask_hard_reset_without_linux(hdev); + else + hl_fw_ask_halt_machine_without_linux(hdev); + } } else { if (hdev->asic_prop.hard_reset_done_by_fw) hl_fw_ask_hard_reset_without_linux(hdev); @@ -6412,6 +6447,7 @@ static int gaudi_debugfs_read_dma(struct hl_device *hdev, u64 addr, u32 size, { u32 dma_core_sts0, err_cause, cfg1, size_left, pos, size_to_dma; struct gaudi_device *gaudi = hdev->asic_specific; + u32 qm_glbl_sts0, qm_cgm_sts; u64 dma_offset, qm_offset; dma_addr_t dma_addr; void *kernel_addr; @@ -6436,14 +6472,20 @@ static int gaudi_debugfs_read_dma(struct hl_device *hdev, u64 addr, u32 size, dma_offset = dma_id * DMA_CORE_OFFSET; qm_offset = dma_id * DMA_QMAN_OFFSET; dma_core_sts0 = RREG32(mmDMA0_CORE_STS0 + dma_offset); - is_eng_idle = IS_DMA_IDLE(dma_core_sts0); + qm_glbl_sts0 = RREG32(mmDMA0_QM_GLBL_STS0 + qm_offset); + qm_cgm_sts = RREG32(mmDMA0_QM_CGM_STS + qm_offset); + is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts) && + IS_DMA_IDLE(dma_core_sts0); if (!is_eng_idle) { dma_id = gaudi_dma_assignment[GAUDI_PCI_DMA_2]; dma_offset = dma_id * DMA_CORE_OFFSET; qm_offset = dma_id * DMA_QMAN_OFFSET; dma_core_sts0 = RREG32(mmDMA0_CORE_STS0 + dma_offset); - is_eng_idle = IS_DMA_IDLE(dma_core_sts0); + qm_glbl_sts0 = RREG32(mmDMA0_QM_GLBL_STS0 + qm_offset); + qm_cgm_sts = RREG32(mmDMA0_QM_CGM_STS + qm_offset); + is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts) && + IS_DMA_IDLE(dma_core_sts0); if (!is_eng_idle) { dev_err_ratelimited(hdev->dev, @@ -6522,7 +6564,7 @@ static u64 gaudi_read_pte(struct hl_device *hdev, u64 addr) { struct gaudi_device *gaudi = hdev->asic_specific; - if (hdev->hard_reset_pending) + if (hdev->reset_info.hard_reset_pending) return U64_MAX; return readq(hdev->pcie_bar[HBM_BAR_ID] + @@ -6533,7 +6575,7 @@ static void gaudi_write_pte(struct hl_device *hdev, u64 addr, u64 val) { struct gaudi_device *gaudi = hdev->asic_specific; - if (hdev->hard_reset_pending) + if (hdev->reset_info.hard_reset_pending) return; writeq(val, hdev->pcie_bar[HBM_BAR_ID] + @@ -6935,8 +6977,9 @@ event_not_supported: snprintf(desc, size, "N/A"); } -static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev, - u32 x_y, bool is_write) +static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev, u32 x_y, + bool is_write, s32 *engine_id_1, + s32 *engine_id_2) { u32 dma_id[2], dma_offset, err_cause[2], mask, i; @@ -6976,44 +7019,64 @@ static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev, switch (x_y) { case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_S_0: case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_S_1: - if ((err_cause[0] & mask) && !(err_cause[1] & mask)) + if ((err_cause[0] & mask) && !(err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_0; return "DMA0"; - else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) + } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_2; return "DMA2"; - else + } else { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_0; + *engine_id_2 = GAUDI_ENGINE_ID_DMA_2; return "DMA0 or DMA2"; + } case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_S_0: case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_S_1: - if ((err_cause[0] & mask) && !(err_cause[1] & mask)) + if ((err_cause[0] & mask) && !(err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_1; return "DMA1"; - else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) + } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_3; return "DMA3"; - else + } else { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_1; + *engine_id_2 = GAUDI_ENGINE_ID_DMA_3; return "DMA1 or DMA3"; + } case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_0: case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_1: - if ((err_cause[0] & mask) && !(err_cause[1] & mask)) + if ((err_cause[0] & mask) && !(err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_4; return "DMA4"; - else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) + } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_6; return "DMA6"; - else + } else { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_4; + *engine_id_2 = GAUDI_ENGINE_ID_DMA_6; return "DMA4 or DMA6"; + } case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_0: case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_1: - if ((err_cause[0] & mask) && !(err_cause[1] & mask)) + if ((err_cause[0] & mask) && !(err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_5; return "DMA5"; - else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) + } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_7; return "DMA7"; - else + } else { + *engine_id_1 = GAUDI_ENGINE_ID_DMA_5; + *engine_id_2 = GAUDI_ENGINE_ID_DMA_7; return "DMA5 or DMA7"; + } } unknown_initiator: return "unknown initiator"; } -static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, - bool is_write) +static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, bool is_write, + u32 *engine_id_1, u32 *engine_id_2) { u32 val, x_y, axi_id; @@ -7026,24 +7089,35 @@ static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, switch (x_y) { case RAZWI_INITIATOR_ID_X_Y_TPC0_NIC0: - if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) { + *engine_id_1 = GAUDI_ENGINE_ID_TPC_0; return "TPC0"; - if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) + } + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) { + *engine_id_1 = GAUDI_ENGINE_ID_NIC_0; return "NIC0"; + } break; case RAZWI_INITIATOR_ID_X_Y_TPC1: + *engine_id_1 = GAUDI_ENGINE_ID_TPC_1; return "TPC1"; case RAZWI_INITIATOR_ID_X_Y_MME0_0: case RAZWI_INITIATOR_ID_X_Y_MME0_1: + *engine_id_1 = GAUDI_ENGINE_ID_MME_0; return "MME0"; case RAZWI_INITIATOR_ID_X_Y_MME1_0: case RAZWI_INITIATOR_ID_X_Y_MME1_1: + *engine_id_1 = GAUDI_ENGINE_ID_MME_1; return "MME1"; case RAZWI_INITIATOR_ID_X_Y_TPC2: + *engine_id_1 = GAUDI_ENGINE_ID_TPC_2; return "TPC2"; case RAZWI_INITIATOR_ID_X_Y_TPC3_PCI_CPU_PSOC: - if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) { + *engine_id_1 = GAUDI_ENGINE_ID_TPC_3; return "TPC3"; + } + /* PCI, CPU or PSOC does not have engine id*/ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_PCI)) return "PCI"; if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_CPU)) @@ -7059,32 +7133,49 @@ static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_1: case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_0: case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_1: - return gaudi_get_razwi_initiator_dma_name(hdev, x_y, is_write); + return gaudi_get_razwi_initiator_dma_name(hdev, x_y, is_write, + engine_id_1, engine_id_2); case RAZWI_INITIATOR_ID_X_Y_TPC4_NIC1_NIC2: - if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) { + *engine_id_1 = GAUDI_ENGINE_ID_TPC_4; return "TPC4"; - if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) + } + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) { + *engine_id_1 = GAUDI_ENGINE_ID_NIC_1; return "NIC1"; - if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT)) + } + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT)) { + *engine_id_1 = GAUDI_ENGINE_ID_NIC_2; return "NIC2"; + } break; case RAZWI_INITIATOR_ID_X_Y_TPC5: + *engine_id_1 = GAUDI_ENGINE_ID_TPC_5; return "TPC5"; case RAZWI_INITIATOR_ID_X_Y_MME2_0: case RAZWI_INITIATOR_ID_X_Y_MME2_1: + *engine_id_1 = GAUDI_ENGINE_ID_MME_2; return "MME2"; case RAZWI_INITIATOR_ID_X_Y_MME3_0: case RAZWI_INITIATOR_ID_X_Y_MME3_1: + *engine_id_1 = GAUDI_ENGINE_ID_MME_3; return "MME3"; case RAZWI_INITIATOR_ID_X_Y_TPC6: + *engine_id_1 = GAUDI_ENGINE_ID_TPC_6; return "TPC6"; case RAZWI_INITIATOR_ID_X_Y_TPC7_NIC4_NIC5: - if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) { + *engine_id_1 = GAUDI_ENGINE_ID_TPC_7; return "TPC7"; - if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) + } + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) { + *engine_id_1 = GAUDI_ENGINE_ID_NIC_4; return "NIC4"; - if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT)) + } + if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT)) { + *engine_id_1 = GAUDI_ENGINE_ID_NIC_5; return "NIC5"; + } break; default: break; @@ -7101,27 +7192,28 @@ static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, return "unknown initiator"; } -static void gaudi_print_razwi_info(struct hl_device *hdev) +static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_id_1, + u32 *engine_id_2) { + if (RREG32(mmMMU_UP_RAZWI_WRITE_VLD)) { dev_err_ratelimited(hdev->dev, "RAZWI event caused by illegal write of %s\n", - gaudi_get_razwi_initiator_name(hdev, true)); + gaudi_get_razwi_initiator_name(hdev, true, engine_id_1, engine_id_2)); WREG32(mmMMU_UP_RAZWI_WRITE_VLD, 0); } if (RREG32(mmMMU_UP_RAZWI_READ_VLD)) { dev_err_ratelimited(hdev->dev, "RAZWI event caused by illegal read of %s\n", - gaudi_get_razwi_initiator_name(hdev, false)); + gaudi_get_razwi_initiator_name(hdev, false, engine_id_1, engine_id_2)); WREG32(mmMMU_UP_RAZWI_READ_VLD, 0); } } -static void gaudi_print_mmu_error_info(struct hl_device *hdev) +static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr, u8 *type) { struct gaudi_device *gaudi = hdev->asic_specific; - u64 addr; u32 val; if (!(gaudi->hw_cap_initialized & HW_CAP_MMU)) @@ -7129,24 +7221,24 @@ static void gaudi_print_mmu_error_info(struct hl_device *hdev) val = RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE); if (val & MMU_UP_PAGE_ERROR_CAPTURE_ENTRY_VALID_MASK) { - addr = val & MMU_UP_PAGE_ERROR_CAPTURE_VA_49_32_MASK; - addr <<= 32; - addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA); + *addr = val & MMU_UP_PAGE_ERROR_CAPTURE_VA_49_32_MASK; + *addr <<= 32; + *addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA); - dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", - addr); + dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", *addr); + *type = HL_RAZWI_PAGE_FAULT; WREG32(mmMMU_UP_PAGE_ERROR_CAPTURE, 0); } val = RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE); if (val & MMU_UP_ACCESS_ERROR_CAPTURE_ENTRY_VALID_MASK) { - addr = val & MMU_UP_ACCESS_ERROR_CAPTURE_VA_49_32_MASK; - addr <<= 32; - addr |= RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE_VA); + *addr = val & MMU_UP_ACCESS_ERROR_CAPTURE_VA_49_32_MASK; + *addr <<= 32; + *addr |= RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE_VA); - dev_err_ratelimited(hdev->dev, - "MMU access error on va 0x%llx\n", addr); + dev_err_ratelimited(hdev->dev, "MMU access error on va 0x%llx\n", *addr); + *type = HL_RAZWI_MMU_ACCESS_ERROR; WREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE, 0); } @@ -7665,15 +7757,46 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type) static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type, bool razwi) { + u32 engine_id_1, engine_id_2; char desc[64] = ""; + u64 razwi_addr = 0; + u8 razwi_type; + int rc; + + /* + * Init engine id by default as not valid and only if razwi initiated from engine with + * engine id it will get valid value. + * Init razwi type to default, will be changed only if razwi caused by page fault of + * MMU access error + */ + engine_id_1 = U16_MAX; + engine_id_2 = U16_MAX; + razwi_type = U8_MAX; gaudi_get_event_desc(event_type, desc, sizeof(desc)); dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n", event_type, desc); if (razwi) { - gaudi_print_razwi_info(hdev); - gaudi_print_mmu_error_info(hdev); + gaudi_print_and_get_razwi_info(hdev, &engine_id_1, &engine_id_2); + gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, &razwi_type); + + /* In case it's the first razwi, save its parameters*/ + rc = atomic_cmpxchg(&hdev->last_error.razwi_write_disable, 0, 1); + if (!rc) { + hdev->last_error.open_dev_timestamp = hdev->last_successful_open_ktime; + hdev->last_error.razwi_timestamp = ktime_get(); + hdev->last_error.razwi_addr = razwi_addr; + hdev->last_error.razwi_engine_id_1 = engine_id_1; + hdev->last_error.razwi_engine_id_2 = engine_id_2; + /* + * If first engine id holds non valid value the razwi initiator + * does not have engine id + */ + hdev->last_error.razwi_non_engine_initiator = (engine_id_1 == U16_MAX); + hdev->last_error.razwi_type = razwi_type; + + } } } @@ -7696,14 +7819,10 @@ static void gaudi_print_fw_alive_info(struct hl_device *hdev, fw_alive->thread_id, fw_alive->uptime_seconds); } -static int gaudi_soft_reset_late_init(struct hl_device *hdev) +static int gaudi_non_hard_reset_late_init(struct hl_device *hdev) { - struct gaudi_device *gaudi = hdev->asic_specific; - - /* Unmask all IRQs since some could have been received - * during the soft reset - */ - return hl_fw_unmask_irq_arr(hdev, gaudi->events, sizeof(gaudi->events)); + /* GAUDI doesn't support any reset except hard-reset */ + return -EPERM; } static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device, @@ -7897,27 +8016,39 @@ static int tpc_krn_event_to_tpc_id(u16 tpc_dec_event_type) static void gaudi_print_clk_change_info(struct hl_device *hdev, u16 event_type) { + ktime_t zero_time = ktime_set(0, 0); + + mutex_lock(&hdev->clk_throttling.lock); + switch (event_type) { case GAUDI_EVENT_FIX_POWER_ENV_S: - hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER; + hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_POWER; + hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_POWER; + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get(); + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time; dev_info_ratelimited(hdev->dev, "Clock throttling due to power consumption\n"); break; case GAUDI_EVENT_FIX_POWER_ENV_E: - hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER; + hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_POWER; + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get(); dev_info_ratelimited(hdev->dev, "Power envelop is safe, back to optimal clock\n"); break; case GAUDI_EVENT_FIX_THERMAL_ENV_S: - hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL; + hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_THERMAL; + hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL; + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get(); + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time; dev_info_ratelimited(hdev->dev, "Clock throttling due to overheating\n"); break; case GAUDI_EVENT_FIX_THERMAL_ENV_E: - hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL; + hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL; + hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get(); dev_info_ratelimited(hdev->dev, "Thermal envelop is safe, back to optimal clock\n"); break; @@ -7927,6 +8058,8 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev, event_type); break; } + + mutex_unlock(&hdev->clk_throttling.lock); } static void gaudi_handle_eqe(struct hl_device *hdev, @@ -7975,7 +8108,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR: gaudi_print_irq_info(hdev, event_type, true); gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data); - fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR; + fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR; goto reset_device; case GAUDI_EVENT_GIC500: @@ -7983,7 +8116,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, case GAUDI_EVENT_L2_RAM_ECC: case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17: gaudi_print_irq_info(hdev, event_type, false); - fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR; + fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR; goto reset_device; case GAUDI_EVENT_HBM0_SPI_0: @@ -7994,7 +8127,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, gaudi_hbm_read_interrupts(hdev, gaudi_hbm_event_to_dev(event_type), &eq_entry->hbm_ecc_data); - fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR; + fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR; goto reset_device; case GAUDI_EVENT_HBM0_SPI_1: @@ -8177,9 +8310,11 @@ static void gaudi_handle_eqe(struct hl_device *hdev, reset_device: if (hdev->asic_prop.fw_security_enabled) - hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FW | fw_fatal_err_flag); + hl_device_reset(hdev, HL_DRV_RESET_HARD + | HL_DRV_RESET_BYPASS_REQ_TO_FW + | fw_fatal_err_flag); else if (hdev->hard_reset_on_fw_events) - hl_device_reset(hdev, HL_RESET_HARD | fw_fatal_err_flag); + hl_device_reset(hdev, HL_DRV_RESET_HARD | fw_fatal_err_flag); else hl_fw_unmask_irq(hdev, event_type); } @@ -8206,7 +8341,7 @@ static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, int rc; if (!(gaudi->hw_cap_initialized & HW_CAP_MMU) || - hdev->hard_reset_pending) + hdev->reset_info.hard_reset_pending) return 0; if (hdev->pldm) @@ -8229,12 +8364,6 @@ static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, WREG32(mmSTLB_INV_SET, 0); - if (rc) { - dev_err_ratelimited(hdev->dev, - "MMU cache invalidation timeout\n"); - hl_device_reset(hdev, HL_RESET_HARD); - } - return rc; } @@ -8662,7 +8791,7 @@ static int gaudi_internal_cb_pool_init(struct hl_device *hdev, hdev->internal_cb_pool_dma_addr, HOST_SPACE_INTERNAL_CB_SZ); - hdev->asic_funcs->mmu_invalidate_cache(hdev, false, VM_TYPE_USERPTR); + hdev->asic_funcs->mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR); mutex_unlock(&ctx->mmu_lock); if (rc) @@ -8697,7 +8826,7 @@ static void gaudi_internal_cb_pool_fini(struct hl_device *hdev, HOST_SPACE_INTERNAL_CB_SZ); hl_unreserve_va_block(hdev, ctx, hdev->internal_cb_va_base, HOST_SPACE_INTERNAL_CB_SZ); - hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR); + hdev->asic_funcs->mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR); mutex_unlock(&ctx->mmu_lock); gen_pool_destroy(hdev->internal_cb_pool); @@ -9458,7 +9587,7 @@ static const struct hl_asic_funcs gaudi_funcs = { .disable_clock_gating = gaudi_disable_clock_gating, .debug_coresight = gaudi_debug_coresight, .is_device_idle = gaudi_is_device_idle, - .soft_reset_late_init = gaudi_soft_reset_late_init, + .non_hard_reset_late_init = gaudi_non_hard_reset_late_init, .hw_queues_lock = gaudi_hw_queues_lock, .hw_queues_unlock = gaudi_hw_queues_unlock, .get_pci_id = gaudi_get_pci_id, |