summaryrefslogtreecommitdiff
path: root/drivers/accel/habanalabs/common/device.c
diff options
context:
space:
mode:
authorOfir Bitton <obitton@habana.ai>2023-04-18 14:48:22 +0300
committerOded Gabbay <ogabbay@kernel.org>2023-06-08 12:35:54 +0300
commitd8b9cea584661b30305cf341bf9f675dc0a25471 (patch)
tree39425aecf5c4735bf8676c0cf564437fbf639fe6 /drivers/accel/habanalabs/common/device.c
parent3d21ec6424e6d38f284c37d77e7ec524c1a454f2 (diff)
downloadlinux-d8b9cea584661b30305cf341bf9f675dc0a25471.tar.xz
accel/habanalabs: add pci health check during heartbeat
Currently upon a heartbeat failure, we don't know if the failure is due to firmware hang or due to a bad PCI link. Hence, we are reading a PCI config space register with a known value (vendor ID) so we will know which of the two possibilities caused the heartbeat failure. Signed-off-by: Ofir Bitton <obitton@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Diffstat (limited to 'drivers/accel/habanalabs/common/device.c')
-rw-r--r--drivers/accel/habanalabs/common/device.c15
1 files changed, 14 insertions, 1 deletions
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index c8f4a34d2831..98b46b2e1898 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -981,6 +981,18 @@ static void device_early_fini(struct hl_device *hdev)
hdev->asic_funcs->early_fini(hdev);
}
+static bool is_pci_link_healthy(struct hl_device *hdev)
+{
+ u16 vendor_id;
+
+ if (!hdev->pdev)
+ return false;
+
+ pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id);
+
+ return (vendor_id == PCI_VENDOR_ID_HABANALABS);
+}
+
static void hl_device_heartbeat(struct work_struct *work)
{
struct hl_device *hdev = container_of(work, struct hl_device,
@@ -995,7 +1007,8 @@ static void hl_device_heartbeat(struct work_struct *work)
goto reschedule;
if (hl_device_operational(hdev, NULL))
- dev_err(hdev->dev, "Device heartbeat failed!\n");
+ dev_err(hdev->dev, "Device heartbeat failed! PCI link is %s\n",
+ is_pci_link_healthy(hdev) ? "healthy" : "broken");
info.err_type = HL_INFO_FW_HEARTBEAT_ERR;
info.event_mask = &event_mask;