diff options
Diffstat (limited to 'drivers/misc/habanalabs/common/habanalabs_drv.c')
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs_drv.c | 44 |
1 files changed, 28 insertions, 16 deletions
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index f733ead605e7..112632afe7d5 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -14,6 +14,9 @@ #include <linux/aer.h> #include <linux/module.h> +#define CREATE_TRACE_POINTS +#include <trace/events/habanalabs.h> + #define HL_DRIVER_AUTHOR "HabanaLabs Kernel Driver Team" #define HL_DRIVER_DESC "Driver for HabanaLabs's AI Accelerators" @@ -27,7 +30,10 @@ static struct class *hl_class; static DEFINE_IDR(hl_devs_idr); static DEFINE_MUTEX(hl_devs_idr_lock); -static int timeout_locked = 30; +#define HL_DEFAULT_TIMEOUT_LOCKED 30 /* 30 seconds */ +#define GAUDI_DEFAULT_TIMEOUT_LOCKED 600 /* 10 minutes */ + +static int timeout_locked = HL_DEFAULT_TIMEOUT_LOCKED; static int reset_on_lockup = 1; static int memory_scrub; static ulong boot_error_status_mask = ULONG_MAX; @@ -55,14 +61,12 @@ MODULE_PARM_DESC(boot_error_status_mask, #define PCI_IDS_GAUDI_SEC 0x1010 #define PCI_IDS_GAUDI2 0x1020 -#define PCI_IDS_GAUDI2_SEC 0x1030 static const struct pci_device_id ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), }, { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), }, { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), }, { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI2), }, - { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI2_SEC), }, { 0, } }; MODULE_DEVICE_TABLE(pci, ids); @@ -92,9 +96,6 @@ static enum hl_asic_type get_asic_type(u16 device) case PCI_IDS_GAUDI2: asic_type = ASIC_GAUDI2; break; - case PCI_IDS_GAUDI2_SEC: - asic_type = ASIC_GAUDI2_SEC; - break; default: asic_type = ASIC_INVALID; break; @@ -107,7 +108,6 @@ static bool is_asic_secured(enum hl_asic_type asic_type) { switch (asic_type) { case ASIC_GAUDI_SEC: - case ASIC_GAUDI2_SEC: return true; default: return false; @@ -161,7 +161,7 @@ int hl_device_open(struct inode *inode, struct file *filp) mutex_lock(&hdev->fpriv_list_lock); if (!hl_device_operational(hdev, &status)) { - dev_err_ratelimited(hdev->dev, + dev_dbg_ratelimited(hdev->dev, "Can't open %s because it is %s\n", dev_name(hdev->dev), hdev->status[status]); @@ -207,11 +207,13 @@ int hl_device_open(struct inode *inode, struct file *filp) list_add(&hpriv->dev_node, &hdev->fpriv_list); mutex_unlock(&hdev->fpriv_list_lock); + hdev->asic_funcs->send_device_activity(hdev, true); + hl_debugfs_add_file(hpriv); - atomic_set(&hdev->last_error.cs_timeout.write_enable, 1); - atomic_set(&hdev->last_error.razwi.write_enable, 1); - hdev->last_error.undef_opcode.write_enable = true; + atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1); + atomic_set(&hdev->captured_err_info.razwi.write_enable, 1); + hdev->captured_err_info.undef_opcode.write_enable = true; hdev->open_counter++; hdev->last_successful_open_jif = jiffies; @@ -269,7 +271,7 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp) mutex_lock(&hdev->fpriv_ctrl_list_lock); if (!hl_device_operational(hdev, NULL)) { - dev_err_ratelimited(hdev->dev_ctrl, + dev_dbg_ratelimited(hdev->dev_ctrl, "Can't open %s because it is disabled or in reset\n", dev_name(hdev->dev_ctrl)); rc = -EPERM; @@ -314,12 +316,22 @@ static void copy_kernel_module_params_to_device(struct hl_device *hdev) hdev->boot_error_status_mask = boot_error_status_mask; } -static void fixup_device_params_per_asic(struct hl_device *hdev) +static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout) { switch (hdev->asic_type) { - case ASIC_GOYA: case ASIC_GAUDI: case ASIC_GAUDI_SEC: + /* If user didn't request a different timeout than the default one, we have + * a different default timeout for Gaudi + */ + if (timeout == HL_DEFAULT_TIMEOUT_LOCKED) + hdev->timeout_jiffies = msecs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED * + MSEC_PER_SEC); + + hdev->reset_upon_device_release = 0; + break; + + case ASIC_GOYA: hdev->reset_upon_device_release = 0; break; @@ -339,7 +351,7 @@ static int fixup_device_params(struct hl_device *hdev) hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC; if (tmp_timeout) - hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * 1000); + hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * MSEC_PER_SEC); else hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT; @@ -360,7 +372,7 @@ static int fixup_device_params(struct hl_device *hdev) if (!hdev->cpu_queues_enable) hdev->heartbeat = 0; - fixup_device_params_per_asic(hdev); + fixup_device_params_per_asic(hdev, tmp_timeout); return 0; } |