Diffstat (limited to 'drivers/accel/habanalabs/common/device.c')
-rw-r--r-- | drivers/accel/habanalabs/common/device.c | 367
1 file changed, 234 insertions, 133 deletions
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index 9933e5858a36..fabfc501ef54 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -22,7 +22,6 @@ enum dma_alloc_type {
 	DMA_ALLOC_COHERENT,
-	DMA_ALLOC_CPU_ACCESSIBLE,
 	DMA_ALLOC_POOL,
 };
 
@@ -121,9 +120,6 @@ static void *hl_dma_alloc_common(struct hl_device *hdev, size_t size, dma_addr_t
 	case DMA_ALLOC_COHERENT:
 		ptr = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, size, dma_handle, flag);
 		break;
-	case DMA_ALLOC_CPU_ACCESSIBLE:
-		ptr = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, size, dma_handle);
-		break;
 	case DMA_ALLOC_POOL:
 		ptr = hdev->asic_funcs->asic_dma_pool_zalloc(hdev, size, flag, dma_handle);
 		break;
@@ -147,9 +143,6 @@ static void hl_asic_dma_free_common(struct hl_device *hdev, size_t size, void *c
 	case DMA_ALLOC_COHERENT:
 		hdev->asic_funcs->asic_dma_free_coherent(hdev, size, cpu_addr, dma_handle);
 		break;
-	case DMA_ALLOC_CPU_ACCESSIBLE:
-		hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, size, cpu_addr);
-		break;
 	case DMA_ALLOC_POOL:
 		hdev->asic_funcs->asic_dma_pool_free(hdev, cpu_addr, dma_handle);
 		break;
@@ -170,18 +163,6 @@ void hl_asic_dma_free_coherent_caller(struct hl_device *hdev, size_t size, void
 	hl_asic_dma_free_common(hdev, size, cpu_addr, dma_handle, DMA_ALLOC_COHERENT, caller);
 }
 
-void *hl_cpu_accessible_dma_pool_alloc_caller(struct hl_device *hdev, size_t size,
-						dma_addr_t *dma_handle, const char *caller)
-{
-	return hl_dma_alloc_common(hdev, size, dma_handle, 0, DMA_ALLOC_CPU_ACCESSIBLE, caller);
-}
-
-void hl_cpu_accessible_dma_pool_free_caller(struct hl_device *hdev, size_t size, void *vaddr,
-						const char *caller)
-{
-	hl_asic_dma_free_common(hdev, size, vaddr, 0, DMA_ALLOC_CPU_ACCESSIBLE, caller);
-}
-
 void *hl_asic_dma_pool_zalloc_caller(struct hl_device *hdev, size_t size, gfp_t mem_flags,
 					dma_addr_t *dma_handle, const char *caller)
 {
@@ -194,6 +175,16 @@ void hl_asic_dma_pool_free_caller(struct hl_device *hdev, void *vaddr, dma_addr_
 	hl_asic_dma_free_common(hdev, 0, vaddr, dma_addr, DMA_ALLOC_POOL, caller);
 }
 
+void *hl_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle)
+{
+	return hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, size, dma_handle);
+}
+
+void hl_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, void *vaddr)
+{
+	hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, size, vaddr);
+}
+
 int hl_dma_map_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
@@ -389,18 +380,17 @@ bool hl_ctrl_device_operational(struct hl_device *hdev,
 static void print_idle_status_mask(struct hl_device *hdev, const char *message,
 					u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE])
 {
-	u32 pad_width[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {};
-
-	BUILD_BUG_ON(HL_BUSY_ENGINES_MASK_EXT_SIZE != 4);
-
-	pad_width[3] = idle_mask[3] ? 16 : 0;
-	pad_width[2] = idle_mask[2] || pad_width[3] ? 16 : 0;
-	pad_width[1] = idle_mask[1] || pad_width[2] ? 16 : 0;
-	pad_width[0] = idle_mask[0] || pad_width[1] ? 16 : 0;
-
-	dev_err(hdev->dev, "%s (mask %0*llx_%0*llx_%0*llx_%0*llx)\n",
-		message, pad_width[3], idle_mask[3], pad_width[2], idle_mask[2],
-		pad_width[1], idle_mask[1], pad_width[0], idle_mask[0]);
+	if (idle_mask[3])
+		dev_err(hdev->dev, "%s (mask %#llx_%016llx_%016llx_%016llx)\n",
+			message, idle_mask[3], idle_mask[2], idle_mask[1], idle_mask[0]);
+	else if (idle_mask[2])
+		dev_err(hdev->dev, "%s (mask %#llx_%016llx_%016llx)\n",
+			message, idle_mask[2], idle_mask[1], idle_mask[0]);
+	else if (idle_mask[1])
+		dev_err(hdev->dev, "%s (mask %#llx_%016llx)\n",
+			message, idle_mask[1], idle_mask[0]);
+	else
+		dev_err(hdev->dev, "%s (mask %#llx)\n", message, idle_mask[0]);
 }
 
 static void hpriv_release(struct kref *ref)
@@ -423,6 +413,9 @@ static void hpriv_release(struct kref *ref)
 	mutex_destroy(&hpriv->ctx_lock);
 	mutex_destroy(&hpriv->restore_phase_mutex);
 
+	/* There should be no memory buffers at this point and handles IDR can be destroyed */
+	hl_mem_mgr_idr_destroy(&hpriv->mem_mgr);
+
 	/* Device should be reset if reset-upon-device-release is enabled, or if there is a pending
 	 * reset that waits for device release.
 	 */
@@ -492,6 +485,36 @@ int hl_hpriv_put(struct hl_fpriv *hpriv)
 	return kref_put(&hpriv->refcount, hpriv_release);
 }
 
+static void print_device_in_use_info(struct hl_device *hdev, const char *message)
+{
+	u32 active_cs_num, dmabuf_export_cnt;
+	bool unknown_reason = true;
+	char buf[128];
+	size_t size;
+	int offset;
+
+	size = sizeof(buf);
+	offset = 0;
+
+	active_cs_num = hl_get_active_cs_num(hdev);
+	if (active_cs_num) {
+		unknown_reason = false;
+		offset += scnprintf(buf + offset, size - offset, " [%u active CS]", active_cs_num);
+	}
+
+	dmabuf_export_cnt = atomic_read(&hdev->dmabuf_export_cnt);
+	if (dmabuf_export_cnt) {
+		unknown_reason = false;
+		offset += scnprintf(buf + offset, size - offset, " [%u exported dma-buf]",
+					dmabuf_export_cnt);
+	}
+
+	if (unknown_reason)
+		scnprintf(buf + offset, size - offset, " [unknown reason]");
+
+	dev_notice(hdev->dev, "%s%s\n", message, buf);
+}
+
 /*
  * hl_device_release - release function for habanalabs device
  *
@@ -514,17 +537,20 @@ static int hl_device_release(struct inode *inode, struct file *filp)
 	}
 
 	hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr);
+
+	/* Memory buffers might be still in use at this point and thus the handles IDR destruction
+	 * is postponed to hpriv_release().
+	 */
 	hl_mem_mgr_fini(&hpriv->mem_mgr);
 
 	hdev->compute_ctx_in_release = 1;
 
 	if (!hl_hpriv_put(hpriv)) {
-		dev_notice(hdev->dev, "User process closed FD but device still in use\n");
+		print_device_in_use_info(hdev, "User process closed FD but device still in use");
 		hl_device_reset(hdev, HL_DRV_RESET_HARD);
 	}
 
-	hdev->last_open_session_duration_jif =
-		jiffies - hdev->last_successful_open_jif;
+	hdev->last_open_session_duration_jif = jiffies - hdev->last_successful_open_jif;
 
 	return 0;
 }
@@ -617,7 +643,7 @@ static void device_release_func(struct device *dev)
  * device_init_cdev - Initialize cdev and device for habanalabs device
  *
  * @hdev: pointer to habanalabs device structure
- * @hclass: pointer to the class object of the device
+ * @class: pointer to the class object of the device
  * @minor: minor number of the specific device
 * @fpos: file operations to install for this device
 * @name: name of the device as it will appear in the filesystem
@@ -626,7 +652,7 @@ static void device_release_func(struct device *dev)
  *
  * Initialize a cdev and a Linux device for habanalabs's device.
  */
-static int device_init_cdev(struct hl_device *hdev, struct class *hclass,
+static int device_init_cdev(struct hl_device *hdev, struct class *class,
 				int minor, const struct file_operations *fops,
 				char *name, struct cdev *cdev,
 				struct device **dev)
@@ -640,7 +666,7 @@ static int device_init_cdev(struct hl_device *hdev, struct class *hclass,
 
 	device_initialize(*dev);
 	(*dev)->devt = MKDEV(hdev->major, minor);
-	(*dev)->class = hclass;
+	(*dev)->class = class;
 	(*dev)->release = device_release_func;
 	dev_set_drvdata(*dev, hdev);
 	dev_set_name(*dev, "%s", name);
@@ -733,14 +759,14 @@ static void device_hard_reset_pending(struct work_struct *work)
 
 static void device_release_watchdog_func(struct work_struct *work)
 {
-	struct hl_device_reset_work *device_release_watchdog_work =
-			container_of(work, struct hl_device_reset_work, reset_work.work);
-	struct hl_device *hdev = device_release_watchdog_work->hdev;
+	struct hl_device_reset_work *watchdog_work =
+			container_of(work, struct hl_device_reset_work, reset_work.work);
+	struct hl_device *hdev = watchdog_work->hdev;
 	u32 flags;
 
-	dev_dbg(hdev->dev, "Device wasn't released in time. Initiate device reset.\n");
+	dev_dbg(hdev->dev, "Device wasn't released in time. Initiate hard-reset.\n");
 
-	flags = device_release_watchdog_work->flags | HL_DRV_RESET_FROM_WD_THR;
+	flags = watchdog_work->flags | HL_DRV_RESET_HARD | HL_DRV_RESET_FROM_WD_THR;
 
 	hl_device_reset(hdev, flags);
 }
@@ -805,7 +831,7 @@ static int device_early_init(struct hl_device *hdev)
 	}
 
 	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) {
-		snprintf(workq_name, 32, "hl-free-jobs-%u", (u32) i);
+		snprintf(workq_name, 32, "hl%u-free-jobs-%u", hdev->cdev_idx, (u32) i);
 		hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
 		if (hdev->cq_wq[i] == NULL) {
 			dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
@@ -814,14 +840,16 @@ static int device_early_init(struct hl_device *hdev)
 		}
 	}
 
-	hdev->eq_wq = create_singlethread_workqueue("hl-events");
+	snprintf(workq_name, 32, "hl%u-events", hdev->cdev_idx);
+	hdev->eq_wq = create_singlethread_workqueue(workq_name);
 	if (hdev->eq_wq == NULL) {
 		dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
 		rc = -ENOMEM;
 		goto free_cq_wq;
 	}
 
-	hdev->cs_cmplt_wq = alloc_workqueue("hl-cs-completions", WQ_UNBOUND, 0);
+	snprintf(workq_name, 32, "hl%u-cs-completions", hdev->cdev_idx);
+	hdev->cs_cmplt_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
 	if (!hdev->cs_cmplt_wq) {
 		dev_err(hdev->dev,
 			"Failed to allocate CS completions workqueue\n");
@@ -829,7 +857,8 @@ static int device_early_init(struct hl_device *hdev)
 		goto free_eq_wq;
 	}
 
-	hdev->ts_free_obj_wq = alloc_workqueue("hl-ts-free-obj", WQ_UNBOUND, 0);
+	snprintf(workq_name, 32, "hl%u-ts-free-obj", hdev->cdev_idx);
+	hdev->ts_free_obj_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
 	if (!hdev->ts_free_obj_wq) {
 		dev_err(hdev->dev,
 			"Failed to allocate Timestamp registration free workqueue\n");
@@ -837,15 +866,15 @@ static int device_early_init(struct hl_device *hdev)
 		goto free_cs_cmplt_wq;
 	}
 
-	hdev->prefetch_wq = alloc_workqueue("hl-prefetch", WQ_UNBOUND, 0);
+	snprintf(workq_name, 32, "hl%u-prefetch", hdev->cdev_idx);
+	hdev->prefetch_wq = alloc_workqueue(workq_name, WQ_UNBOUND, 0);
 	if (!hdev->prefetch_wq) {
 		dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n");
 		rc = -ENOMEM;
 		goto free_ts_free_wq;
 	}
 
-	hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info),
-					GFP_KERNEL);
+	hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info), GFP_KERNEL);
 	if (!hdev->hl_chip_info) {
 		rc = -ENOMEM;
 		goto free_prefetch_wq;
@@ -857,7 +886,8 @@ static int device_early_init(struct hl_device *hdev)
 
 	hl_mem_mgr_init(hdev->dev, &hdev->kernel_mem_mgr);
 
-	hdev->reset_wq = create_singlethread_workqueue("hl_device_reset");
+	snprintf(workq_name, 32, "hl%u_device_reset", hdev->cdev_idx);
+	hdev->reset_wq = create_singlethread_workqueue(workq_name);
 	if (!hdev->reset_wq) {
 		rc = -ENOMEM;
 		dev_err(hdev->dev, "Failed to create device reset WQ\n");
@@ -887,6 +917,7 @@ static int device_early_init(struct hl_device *hdev)
 
 free_cb_mgr:
 	hl_mem_mgr_fini(&hdev->kernel_mem_mgr);
+	hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr);
 free_chip_info:
 	kfree(hdev->hl_chip_info);
 free_prefetch_wq:
@@ -930,6 +961,7 @@ static void device_early_fini(struct hl_device *hdev)
 	mutex_destroy(&hdev->clk_throttling.lock);
 
 	hl_mem_mgr_fini(&hdev->kernel_mem_mgr);
+	hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr);
 
 	kfree(hdev->hl_chip_info);
 
@@ -953,6 +985,8 @@ static void hl_device_heartbeat(struct work_struct *work)
 {
 	struct hl_device *hdev = container_of(work, struct hl_device,
 						work_heartbeat.work);
+	struct hl_info_fw_err_info info = {0};
+	u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
 
 	if (!hl_device_operational(hdev, NULL))
 		goto reschedule;
@@ -963,7 +997,10 @@ static void hl_device_heartbeat(struct work_struct *work)
 	if (hl_device_operational(hdev, NULL))
 		dev_err(hdev->dev, "Device heartbeat failed!\n");
 
-	hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT);
+	info.err_type = HL_INFO_FW_HEARTBEAT_ERR;
+	info.event_mask = &event_mask;
+	hl_handle_fw_err(hdev, &info);
+	hl_device_cond_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT, event_mask);
 
 	return;
 
@@ -1234,7 +1271,6 @@ int hl_device_resume(struct hl_device *hdev)
 	return 0;
 
 disable_device:
-	pci_clear_master(hdev->pdev);
 	pci_disable_device(hdev->pdev);
 
 	return rc;
@@ -1344,6 +1380,34 @@ static void device_disable_open_processes(struct hl_device *hdev, bool control_d
 	mutex_unlock(fd_lock);
 }
 
+static void send_disable_pci_access(struct hl_device *hdev, u32 flags)
+{
+	/* If reset is due to heartbeat, device CPU is no responsive in
+	 * which case no point sending PCI disable message to it.
+	 */
+	if ((flags & HL_DRV_RESET_HARD) &&
+			!(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
+		/* Disable PCI access from device F/W so he won't send
+		 * us additional interrupts. We disable MSI/MSI-X at
+		 * the halt_engines function and we can't have the F/W
+		 * sending us interrupts after that. We need to disable
+		 * the access here because if the device is marked
+		 * disable, the message won't be send. Also, in case
+		 * of heartbeat, the device CPU is marked as disable
+		 * so this message won't be sent
+		 */
+		if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) {
+			dev_warn(hdev->dev, "Failed to disable FW's PCI access\n");
+			return;
+		}
+
+		/* verify that last EQs are handled before disabled is set */
+		if (hdev->cpu_queues_enable)
+			synchronize_irq(pci_irq_vector(hdev->pdev,
+					hdev->asic_prop.eq_interrupt_id));
+	}
+}
+
 static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
 {
 	u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
@@ -1382,28 +1446,6 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
 	} else {
 		hdev->reset_info.reset_trigger_repeated = 1;
 	}
-
-	/* If reset is due to heartbeat, device CPU is no responsive in
-	 * which case no point sending PCI disable message to it.
-	 *
-	 * If F/W is performing the reset, no need to send it a message to disable
-	 * PCI access
-	 */
-	if ((flags & HL_DRV_RESET_HARD) &&
-			!(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
-		/* Disable PCI access from device F/W so he won't send
-		 * us additional interrupts. We disable MSI/MSI-X at
-		 * the halt_engines function and we can't have the F/W
-		 * sending us interrupts after that. We need to disable
-		 * the access here because if the device is marked
-		 * disable, the message won't be send. Also, in case
-		 * of heartbeat, the device CPU is marked as disable
-		 * so this message won't be sent
-		 */
-		if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0))
-			dev_warn(hdev->dev,
-				"Failed to disable PCI access by F/W\n");
-	}
 }
 
 /*
@@ -1424,12 +1466,11 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
  */
 int hl_device_reset(struct hl_device *hdev, u32 flags)
 {
-	bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false,
-		reset_upon_device_release = false, schedule_hard_reset = false,
-		delay_reset, from_dev_release, from_watchdog_thread;
+	bool hard_reset, from_hard_reset_thread, fw_reset, reset_upon_device_release,
+		schedule_hard_reset = false, delay_reset, from_dev_release, from_watchdog_thread;
 	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
 	struct hl_ctx *ctx;
-	int i, rc;
+	int i, rc, hw_fini_rc;
 
 	if (!hdev->init_done) {
 		dev_err(hdev->dev, "Can't reset before initialization is done\n");
@@ -1442,6 +1483,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	from_dev_release = !!(flags & HL_DRV_RESET_DEV_RELEASE);
 	delay_reset = !!(flags & HL_DRV_RESET_DELAY);
 	from_watchdog_thread = !!(flags & HL_DRV_RESET_FROM_WD_THR);
+	reset_upon_device_release = hdev->reset_upon_device_release && from_dev_release;
 
 	if (!hard_reset && (hl_device_status(hdev) == HL_DEVICE_STATUS_MALFUNCTION)) {
 		dev_dbg(hdev->dev, "soft-reset isn't supported on a malfunctioning device\n");
@@ -1449,30 +1491,26 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
 	}
 
 	if (!hard_reset && !hdev->asic_prop.supports_compute_reset) {
-		hard_instead_soft = true;
+		dev_dbg(hdev->dev, "asic doesn't support compute reset - do hard-reset instead\n");
 		hard_reset = true;
 	}
 
-	if (hdev->reset_upon_device_release && from_dev_release) {
+	if (reset_upon_device_release) {
 		if (hard_reset) {
 			dev_crit(hdev->dev,
 				"Aborting reset because hard-reset is mutually exclusive with reset-on-device-release\n");
 			return -EINVAL;
 		}
 
-		reset_upon_device_release = true;
-
 		goto do_reset;
 	}
 
 	if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) {
-		hard_instead_soft = true;
+		dev_dbg(hdev->dev,
+			"asic doesn't allow inference soft reset - do hard-reset instead\n");
 		hard_reset = true;
 	}
 
-	if (hard_instead_soft)
-		dev_dbg(hdev->dev, "Doing hard-reset instead of compute reset\n");
-
 do_reset:
 	/* Re-entry of reset thread */
 	if (from_hard_reset_thread && hdev->process_kill_trial_cnt)
@@ -1480,14 +1518,14 @@ do_reset:
 
 	/*
 	 * Prevent concurrency in this function - only one reset should be
-	 * done at any given time. Only need to perform this if we didn't
-	 * get from the dedicated hard reset thread
+	 * done at any given time. We need to perform this only if we didn't
+	 * get here from a dedicated hard reset thread.
	 */
 	if (!from_hard_reset_thread) {
 		/* Block future CS/VM/JOB completion operations */
 		spin_lock(&hdev->reset_info.lock);
 		if (hdev->reset_info.in_reset) {
-			/* We only allow scheduling of a hard reset during compute reset */
+			/* We allow scheduling of a hard reset only during a compute reset */
 			if (hard_reset && hdev->reset_info.in_compute_reset)
 				hdev->reset_info.hard_reset_schedule_flags = flags;
 			spin_unlock(&hdev->reset_info.lock);
@@ -1505,15 +1543,17 @@ do_reset:
 
 	/* Cancel the device release watchdog work if required.
 	 * In case of reset-upon-device-release while the release watchdog work is
-	 * scheduled, do hard-reset instead of compute-reset.
+	 * scheduled due to a hard-reset, do hard-reset instead of compute-reset.
	 */
 	if ((hard_reset || from_dev_release) && hdev->reset_info.watchdog_active) {
+		struct hl_device_reset_work *watchdog_work =
+				&hdev->device_release_watchdog_work;
+
 		hdev->reset_info.watchdog_active = 0;
 		if (!from_watchdog_thread)
-			cancel_delayed_work_sync(
-					&hdev->device_release_watchdog_work.reset_work);
+			cancel_delayed_work_sync(&watchdog_work->reset_work);
 
-		if (from_dev_release) {
+		if (from_dev_release && (watchdog_work->flags & HL_DRV_RESET_HARD)) {
 			hdev->reset_info.in_compute_reset = 0;
 			flags |= HL_DRV_RESET_HARD;
 			flags &= ~HL_DRV_RESET_DEV_RELEASE;
@@ -1524,7 +1564,9 @@ do_reset:
 	if (delay_reset)
 		usleep_range(HL_RESET_DELAY_USEC, HL_RESET_DELAY_USEC << 1);
 
+escalate_reset_flow:
 	handle_reset_trigger(hdev, flags);
+	send_disable_pci_access(hdev, flags);
 
 	/* This also blocks future CS/VM/JOB completion operations */
 	hdev->disabled = true;
@@ -1539,7 +1581,6 @@ do_reset:
 		dev_dbg(hdev->dev, "Going to reset engines of inference device\n");
 	}
 
-again:
 	if ((hard_reset) && (!from_hard_reset_thread)) {
 		hdev->reset_info.hard_reset_pending = true;
 
@@ -1592,7 +1633,7 @@ kill_processes:
 	}
 
 	/* Reset the H/W. It will be in idle state after this returns */
-	hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
+	hw_fini_rc = hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
 
 	if (hard_reset) {
 		hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
@@ -1619,6 +1660,10 @@ kill_processes:
 		hl_ctx_put(ctx);
 	}
 
+	if (hw_fini_rc) {
+		rc = hw_fini_rc;
+		goto out_err;
+	}
 	/* Finished tear-down, starting to re-initialize */
 
 	if (hard_reset) {
@@ -1784,10 +1829,8 @@ kill_processes:
 			dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n");
 			flags = hdev->reset_info.hard_reset_schedule_flags;
 			hdev->reset_info.hard_reset_schedule_flags = 0;
-			hdev->disabled = true;
 			hard_reset = true;
-			handle_reset_trigger(hdev, flags);
-			goto again;
+			goto escalate_reset_flow;
 		}
 	}
 
@@ -1804,20 +1847,19 @@ out_err:
 			"%s Failed to reset! Device is NOT usable\n",
 			dev_name(&(hdev)->pdev->dev));
 		hdev->reset_info.hard_reset_cnt++;
-	} else if (reset_upon_device_release) {
-		spin_unlock(&hdev->reset_info.lock);
-		dev_err(hdev->dev, "Failed to reset device after user release\n");
-		flags |= HL_DRV_RESET_HARD;
-		flags &= ~HL_DRV_RESET_DEV_RELEASE;
-		hard_reset = true;
-		goto again;
 	} else {
+		if (reset_upon_device_release) {
+			dev_err(hdev->dev, "Failed to reset device after user release\n");
+			flags &= ~HL_DRV_RESET_DEV_RELEASE;
+		} else {
+			dev_err(hdev->dev, "Failed to do compute reset\n");
+			hdev->reset_info.compute_reset_cnt++;
+		}
+
 		spin_unlock(&hdev->reset_info.lock);
-		dev_err(hdev->dev, "Failed to do compute reset\n");
-		hdev->reset_info.compute_reset_cnt++;
 		flags |= HL_DRV_RESET_HARD;
 		hard_reset = true;
-		goto again;
+		goto escalate_reset_flow;
 	}
 
 	hdev->reset_info.in_reset = 0;
@@ -1840,10 +1882,6 @@ int hl_device_cond_reset(struct hl_device *hdev, u32 flags, u64 event_mask)
 {
 	struct hl_ctx *ctx = NULL;
 
-	/* Device release watchdog is only for hard reset */
-	if (!(flags & HL_DRV_RESET_HARD) && hdev->asic_prop.allow_inference_soft_reset)
-		goto device_reset;
-
 	/* F/W reset cannot be postponed */
 	if (flags & HL_DRV_RESET_BYPASS_REQ_TO_FW)
 		goto device_reset;
@@ -1871,7 +1909,7 @@ int hl_device_cond_reset(struct hl_device *hdev, u32 flags, u64 event_mask)
 		goto out;
 
 	hdev->device_release_watchdog_work.flags = flags;
-	dev_dbg(hdev->dev, "Device is going to be reset in %u sec unless being released\n",
+	dev_dbg(hdev->dev, "Device is going to be hard-reset in %u sec unless being released\n",
 		hdev->device_release_watchdog_timeout_sec);
 	schedule_delayed_work(&hdev->device_release_watchdog_work.reset_work,
 		msecs_to_jiffies(hdev->device_release_watchdog_timeout_sec * 1000));
@@ -1939,37 +1977,27 @@ void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask)
 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
 }
 
-/*
- * hl_device_init - main initialization function for habanalabs device
- *
- * @hdev: pointer to habanalabs device structure
- *
- * Allocate an id for the device, do early initialization and then call the
- * ASIC specific initialization functions. Finally, create the cdev and the
- * Linux device to expose it to the user
- */
-int hl_device_init(struct hl_device *hdev, struct class *hclass)
+static int create_cdev(struct hl_device *hdev)
 {
-	int i, rc, cq_cnt, user_interrupt_cnt, cq_ready_cnt;
 	char *name;
-	bool add_cdev_sysfs_on_err = false;
+	int rc;
 
 	hdev->cdev_idx = hdev->id / 2;
 
 	name = kasprintf(GFP_KERNEL, "hl%d", hdev->cdev_idx);
 	if (!name) {
 		rc = -ENOMEM;
-		goto out_disabled;
+		goto out_err;
 	}
 
 	/* Initialize cdev and device structures */
-	rc = device_init_cdev(hdev, hclass, hdev->id, &hl_ops, name,
+	rc = device_init_cdev(hdev, hdev->hclass, hdev->id, &hl_ops, name,
				&hdev->cdev, &hdev->dev);
 
 	kfree(name);
 
 	if (rc)
-		goto out_disabled;
+		goto out_err;
 
 	name = kasprintf(GFP_KERNEL, "hl_controlD%d", hdev->cdev_idx);
 	if (!name) {
@@ -1978,7 +2006,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 	}
 
 	/* Initialize cdev and device structures for control device */
-	rc = device_init_cdev(hdev, hclass, hdev->id_control, &hl_ctrl_ops,
+	rc = device_init_cdev(hdev, hdev->hclass, hdev->id_control, &hl_ctrl_ops,
				name, &hdev->cdev_ctrl, &hdev->dev_ctrl);
 
 	kfree(name);
@@ -1986,10 +2014,36 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 	if (rc)
 		goto free_dev;
 
+	return 0;
+
+free_dev:
+	put_device(hdev->dev);
+out_err:
+	return rc;
+}
+
+/*
+ * hl_device_init - main initialization function for habanalabs device
+ *
+ * @hdev: pointer to habanalabs device structure
+ *
+ * Allocate an id for the device, do early initialization and then call the
+ * ASIC specific initialization functions. Finally, create the cdev and the
+ * Linux device to expose it to the user
+ */
+int hl_device_init(struct hl_device *hdev)
+{
+	int i, rc, cq_cnt, user_interrupt_cnt, cq_ready_cnt;
+	bool add_cdev_sysfs_on_err = false;
+
+	rc = create_cdev(hdev);
+	if (rc)
+		goto out_disabled;
+
 	/* Initialize ASIC function pointers and perform early init */
 	rc = device_early_init(hdev);
 	if (rc)
-		goto free_dev_ctrl;
+		goto free_dev;
 
 	user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count +
 				hdev->asic_prop.user_interrupt_count;
@@ -2241,9 +2295,8 @@ free_usr_intr_mem:
 	kfree(hdev->user_interrupt);
 early_fini:
 	device_early_fini(hdev);
-free_dev_ctrl:
-	put_device(hdev->dev_ctrl);
 free_dev:
+	put_device(hdev->dev_ctrl);
 	put_device(hdev->dev);
 out_disabled:
 	hdev->disabled = true;
@@ -2364,7 +2417,9 @@ void hl_device_fini(struct hl_device *hdev)
 	hl_cb_pool_fini(hdev);
 
 	/* Reset the H/W. It will be in idle state after this returns */
-	hdev->asic_funcs->hw_fini(hdev, true, false);
+	rc = hdev->asic_funcs->hw_fini(hdev, true, false);
+	if (rc)
+		dev_err(hdev->dev, "hw_fini failed in device fini while removing device %d\n", rc);
 
 	hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
 
@@ -2566,3 +2621,49 @@ void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_
 	if (event_mask)
 		*event_mask |= HL_NOTIFIER_EVENT_PAGE_FAULT;
 }
+
+static void hl_capture_hw_err(struct hl_device *hdev, u16 event_id)
+{
+	struct hw_err_info *info = &hdev->captured_err_info.hw_err;
+
+	/* Capture only the first HW err */
+	if (atomic_cmpxchg(&info->event_detected, 0, 1))
+		return;
+
+	info->event.timestamp = ktime_to_ns(ktime_get());
+	info->event.event_id = event_id;
+
+	info->event_info_available = true;
+}
+
+void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_mask)
+{
+	hl_capture_hw_err(hdev, event_id);
+
+	if (event_mask)
+		*event_mask |= HL_NOTIFIER_EVENT_CRITICL_HW_ERR;
+}
+
+static void hl_capture_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *fw_info)
+{
+	struct fw_err_info *info = &hdev->captured_err_info.fw_err;
+
+	/* Capture only the first FW error */
+	if (atomic_cmpxchg(&info->event_detected, 0, 1))
+		return;
+
+	info->event.timestamp = ktime_to_ns(ktime_get());
+	info->event.err_type = fw_info->err_type;
+	if (fw_info->err_type == HL_INFO_FW_REPORTED_ERR)
+		info->event.event_id = fw_info->event_id;
+
+	info->event_info_available = true;
+}
+
+void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info)
+{
+	hl_capture_fw_err(hdev, info);
+
+	if (info->event_mask)
+		*info->event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR;
+}
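A note on the string-building pattern used by the new print_device_in_use_info() above: the kernel's scnprintf() returns the number of characters actually stored, capped by the remaining space, so the repeated offset += scnprintf(buf + offset, size - offset, ...) calls can never run past the 128-byte buffer; the reason list simply truncates if it grows too long. Below is a minimal userspace sketch of the same idea; scnprintf_like() and the example counters are illustrative stand-ins, not driver code.

#include <stdarg.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's scnprintf(): returns the number of
 * characters actually stored (excluding the NUL), never more than size - 1,
 * so accumulating into "offset" cannot run past the buffer.
 */
static size_t scnprintf_like(char *buf, size_t size, const char *fmt, ...)
{
	va_list args;
	int ret;

	if (size == 0)
		return 0;

	va_start(args, fmt);
	ret = vsnprintf(buf, size, fmt, args);
	va_end(args);

	if (ret < 0)
		return 0;
	return (size_t)ret < size ? (size_t)ret : size - 1;
}

int main(void)
{
	char buf[128];
	size_t size = sizeof(buf), offset = 0;
	unsigned int active_cs_num = 2, dmabuf_export_cnt = 1; /* example values */

	if (active_cs_num)
		offset += scnprintf_like(buf + offset, size - offset,
					 " [%u active CS]", active_cs_num);
	if (dmabuf_export_cnt)
		offset += scnprintf_like(buf + offset, size - offset,
					 " [%u exported dma-buf]", dmabuf_export_cnt);
	if (offset == 0)
		scnprintf_like(buf, size, " [unknown reason]");

	printf("User process closed FD but device still in use%s\n", buf);
	return 0;
}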
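The reworked print_idle_status_mask() drops the computed pad-width trick in favour of four explicit cases: it prints only up to the highest non-zero 64-bit word of the engine mask, formatting the top word with %#llx and every lower word zero-padded to 16 hex digits. A standalone sketch of that selection logic, assuming the same four-word mask layout (fmt_idle_mask is an invented name, not the driver function):

#include <inttypes.h>
#include <stdio.h>

/* Print only the populated words of a 4 x u64 mask, highest word first */
static void fmt_idle_mask(const uint64_t m[4])
{
	if (m[3])
		printf("mask %#" PRIx64 "_%016" PRIx64 "_%016" PRIx64 "_%016" PRIx64 "\n",
		       m[3], m[2], m[1], m[0]);
	else if (m[2])
		printf("mask %#" PRIx64 "_%016" PRIx64 "_%016" PRIx64 "\n", m[2], m[1], m[0]);
	else if (m[1])
		printf("mask %#" PRIx64 "_%016" PRIx64 "\n", m[1], m[0]);
	else
		printf("mask %#" PRIx64 "\n", m[0]);
}

int main(void)
{
	uint64_t mask[4] = { 0xdeadbeef, 0x1, 0, 0 }; /* only words 0-1 populated */

	fmt_idle_mask(mask); /* prints: mask 0x1_00000000deadbeef */
	return 0;
}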
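The new hl_capture_hw_err()/hl_capture_fw_err() helpers rely on atomic_cmpxchg() so that only the first reported error fills the capture record; any later report loses the compare-and-swap and bails out early, keeping the record consistent without a lock. A hedged C11 analogue of that capture-once pattern (hw_err_record and capture_first_err are invented names, and <stdatomic.h> stands in for the kernel's atomic API):

#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

struct hw_err_record {
	atomic_int event_detected; /* 0 = empty slot, 1 = already captured */
	long long timestamp_ns;
	unsigned short event_id;
};

/* Keep only the first error: the CAS winner fills the record, later callers return */
static void capture_first_err(struct hw_err_record *rec, unsigned short event_id)
{
	int expected = 0;
	struct timespec ts;

	if (!atomic_compare_exchange_strong(&rec->event_detected, &expected, 1))
		return; /* an earlier report already won the race */

	clock_gettime(CLOCK_MONOTONIC, &ts);
	rec->timestamp_ns = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
	rec->event_id = event_id;
}

int main(void)
{
	struct hw_err_record rec = { 0 };

	capture_first_err(&rec, 42);
	capture_first_err(&rec, 99); /* ignored: slot already taken */
	printf("captured event %u\n", rec.event_id); /* prints 42 */
	return 0;
}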