diff options
Diffstat (limited to 'drivers/accel/habanalabs/common/device.c')
-rw-r--r-- | drivers/accel/habanalabs/common/device.c | 48 |
1 files changed, 26 insertions, 22 deletions
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 68eebed3b050..999c92d7036e 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -1066,28 +1066,11 @@ static bool is_pci_link_healthy(struct hl_device *hdev) return (device_id == hdev->pdev->device); } -static void stringify_time_of_last_heartbeat(struct hl_device *hdev, char *time_str, size_t size, - bool is_pq_hb) -{ - time64_t seconds = is_pq_hb ? hdev->heartbeat_debug_info.last_pq_heartbeat_ts - : hdev->heartbeat_debug_info.last_eq_heartbeat_ts; - struct tm tm; - - if (!seconds) - return; - - time64_to_tm(seconds, 0, &tm); - - snprintf(time_str, size, "%ld-%02d-%02d %02d:%02d:%02d (UTC)", - tm.tm_year + 1900, tm.tm_mon, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec); -} - static bool hl_device_eq_heartbeat_received(struct hl_device *hdev) { struct eq_heartbeat_debug_info *heartbeat_debug_info = &hdev->heartbeat_debug_info; u32 cpu_q_id = heartbeat_debug_info->cpu_queue_id, pq_pi_mask = (HL_QUEUE_LENGTH << 1) - 1; struct asic_fixed_properties *prop = &hdev->asic_prop; - char pq_time_str[64] = "N/A", eq_time_str[64] = "N/A"; if (!prop->cpucp_info.eq_health_check_supported) return true; @@ -1095,17 +1078,15 @@ static bool hl_device_eq_heartbeat_received(struct hl_device *hdev) if (!hdev->eq_heartbeat_received) { dev_err(hdev->dev, "EQ heartbeat event was not received!\n"); - stringify_time_of_last_heartbeat(hdev, pq_time_str, sizeof(pq_time_str), true); - stringify_time_of_last_heartbeat(hdev, eq_time_str, sizeof(eq_time_str), false); dev_err(hdev->dev, - "EQ: {CI %u, HB counter %u, last HB time: %s}, PQ: {PI: %u, CI: %u (%u), last HB time: %s}\n", + "EQ: {CI %u, HB counter %u, last HB time: %ptTs}, PQ: {PI: %u, CI: %u (%u), last HB time: %ptTs}\n", hdev->event_queue.ci, heartbeat_debug_info->heartbeat_event_counter, - eq_time_str, + &hdev->heartbeat_debug_info.last_eq_heartbeat_ts, hdev->kernel_queues[cpu_q_id].pi, atomic_read(&hdev->kernel_queues[cpu_q_id].ci), atomic_read(&hdev->kernel_queues[cpu_q_id].ci) & pq_pi_mask, - pq_time_str); + &hdev->heartbeat_debug_info.last_pq_heartbeat_ts); hl_eq_dump(hdev, &hdev->event_queue); @@ -1649,6 +1630,11 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) from_watchdog_thread = !!(flags & HL_DRV_RESET_FROM_WD_THR); reset_upon_device_release = hdev->reset_upon_device_release && from_dev_release; + if (hdev->cpld_shutdown) { + dev_err(hdev->dev, "Cannot reset device, cpld is shutdown! Device is NOT usable\n"); + return -EIO; + } + if (!hard_reset && (hl_device_status(hdev) == HL_DEVICE_STATUS_MALFUNCTION)) { dev_dbg(hdev->dev, "soft-reset isn't supported on a malfunctioning device\n"); return 0; @@ -2595,6 +2581,14 @@ void hl_device_fini(struct hl_device *hdev) if (rc) dev_err(hdev->dev, "hw_fini failed in device fini while removing device %d\n", rc); + /* Reset the H/W (if it accessible). It will be in idle state after this returns */ + if (!hdev->cpld_shutdown) { + rc = hdev->asic_funcs->hw_fini(hdev, true, false); + if (rc) + dev_err(hdev->dev, + "hw_fini failed in device fini while removing device %d\n", rc); + } + hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE; /* Release kernel context */ @@ -2962,3 +2956,13 @@ void hl_handle_clk_change_event(struct hl_device *hdev, u16 event_type, u64 *eve mutex_unlock(&clk_throttle->lock); } + +void hl_eq_cpld_shutdown_event_handle(struct hl_device *hdev, u16 event_id, u64 *event_mask) +{ + hl_handle_critical_hw_err(hdev, event_id, event_mask); + *event_mask |= HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE; + + /* Avoid any new accesses to the H/W */ + hdev->disabled = true; + hdev->cpld_shutdown = true; +} |