summaryrefslogtreecommitdiff
path: root/drivers/accel/habanalabs/common/device.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/accel/habanalabs/common/device.c')
-rw-r--r--drivers/accel/habanalabs/common/device.c48
1 files changed, 26 insertions, 22 deletions
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 68eebed3b050..999c92d7036e 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1066,28 +1066,11 @@ static bool is_pci_link_healthy(struct hl_device *hdev)
return (device_id == hdev->pdev->device);
}
-static void stringify_time_of_last_heartbeat(struct hl_device *hdev, char *time_str, size_t size,
- bool is_pq_hb)
-{
- time64_t seconds = is_pq_hb ? hdev->heartbeat_debug_info.last_pq_heartbeat_ts
- : hdev->heartbeat_debug_info.last_eq_heartbeat_ts;
- struct tm tm;
-
- if (!seconds)
- return;
-
- time64_to_tm(seconds, 0, &tm);
-
- snprintf(time_str, size, "%ld-%02d-%02d %02d:%02d:%02d (UTC)",
- tm.tm_year + 1900, tm.tm_mon, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec);
-}
-
static bool hl_device_eq_heartbeat_received(struct hl_device *hdev)
{
struct eq_heartbeat_debug_info *heartbeat_debug_info = &hdev->heartbeat_debug_info;
u32 cpu_q_id = heartbeat_debug_info->cpu_queue_id, pq_pi_mask = (HL_QUEUE_LENGTH << 1) - 1;
struct asic_fixed_properties *prop = &hdev->asic_prop;
- char pq_time_str[64] = "N/A", eq_time_str[64] = "N/A";
if (!prop->cpucp_info.eq_health_check_supported)
return true;
@@ -1095,17 +1078,15 @@ static bool hl_device_eq_heartbeat_received(struct hl_device *hdev)
if (!hdev->eq_heartbeat_received) {
dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
- stringify_time_of_last_heartbeat(hdev, pq_time_str, sizeof(pq_time_str), true);
- stringify_time_of_last_heartbeat(hdev, eq_time_str, sizeof(eq_time_str), false);
dev_err(hdev->dev,
- "EQ: {CI %u, HB counter %u, last HB time: %s}, PQ: {PI: %u, CI: %u (%u), last HB time: %s}\n",
+ "EQ: {CI %u, HB counter %u, last HB time: %ptTs}, PQ: {PI: %u, CI: %u (%u), last HB time: %ptTs}\n",
hdev->event_queue.ci,
heartbeat_debug_info->heartbeat_event_counter,
- eq_time_str,
+ &hdev->heartbeat_debug_info.last_eq_heartbeat_ts,
hdev->kernel_queues[cpu_q_id].pi,
atomic_read(&hdev->kernel_queues[cpu_q_id].ci),
atomic_read(&hdev->kernel_queues[cpu_q_id].ci) & pq_pi_mask,
- pq_time_str);
+ &hdev->heartbeat_debug_info.last_pq_heartbeat_ts);
hl_eq_dump(hdev, &hdev->event_queue);
@@ -1649,6 +1630,11 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
from_watchdog_thread = !!(flags & HL_DRV_RESET_FROM_WD_THR);
reset_upon_device_release = hdev->reset_upon_device_release && from_dev_release;
+ if (hdev->cpld_shutdown) {
+ dev_err(hdev->dev, "Cannot reset device, cpld is shutdown! Device is NOT usable\n");
+ return -EIO;
+ }
+
if (!hard_reset && (hl_device_status(hdev) == HL_DEVICE_STATUS_MALFUNCTION)) {
dev_dbg(hdev->dev, "soft-reset isn't supported on a malfunctioning device\n");
return 0;
@@ -2595,6 +2581,14 @@ void hl_device_fini(struct hl_device *hdev)
if (rc)
dev_err(hdev->dev, "hw_fini failed in device fini while removing device %d\n", rc);
+ /* Reset the H/W (if it accessible). It will be in idle state after this returns */
+ if (!hdev->cpld_shutdown) {
+ rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+ if (rc)
+ dev_err(hdev->dev,
+ "hw_fini failed in device fini while removing device %d\n", rc);
+ }
+
hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
/* Release kernel context */
@@ -2962,3 +2956,13 @@ void hl_handle_clk_change_event(struct hl_device *hdev, u16 event_type, u64 *eve
mutex_unlock(&clk_throttle->lock);
}
+
+void hl_eq_cpld_shutdown_event_handle(struct hl_device *hdev, u16 event_id, u64 *event_mask)
+{
+ hl_handle_critical_hw_err(hdev, event_id, event_mask);
+ *event_mask |= HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
+
+ /* Avoid any new accesses to the H/W */
+ hdev->disabled = true;
+ hdev->cpld_shutdown = true;
+}