diff options
-rw-r--r-- | drivers/accel/habanalabs/common/device.c | 55 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/habanalabs.h | 2 | ||||
-rw-r--r-- | drivers/accel/habanalabs/gaudi/gaudi.c | 3 | ||||
-rw-r--r-- | drivers/accel/habanalabs/gaudi2/gaudi2.c | 1 | ||||
-rw-r--r-- | drivers/accel/habanalabs/goya/goya.c | 1 |
5 files changed, 37 insertions, 25 deletions
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 3c1af9d43b65..fabfc501ef54 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -1380,13 +1380,41 @@ static void device_disable_open_processes(struct hl_device *hdev, bool control_d mutex_unlock(fd_lock); } +static void send_disable_pci_access(struct hl_device *hdev, u32 flags) +{ + /* If reset is due to heartbeat, device CPU is no responsive in + * which case no point sending PCI disable message to it. + */ + if ((flags & HL_DRV_RESET_HARD) && + !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) { + /* Disable PCI access from device F/W so he won't send + * us additional interrupts. We disable MSI/MSI-X at + * the halt_engines function and we can't have the F/W + * sending us interrupts after that. We need to disable + * the access here because if the device is marked + * disable, the message won't be send. Also, in case + * of heartbeat, the device CPU is marked as disable + * so this message won't be sent + */ + if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) { + dev_warn(hdev->dev, "Failed to disable FW's PCI access\n"); + return; + } + + /* verify that last EQs are handled before disabled is set */ + if (hdev->cpu_queues_enable) + synchronize_irq(pci_irq_vector(hdev->pdev, + hdev->asic_prop.eq_interrupt_id)); + } +} + static void handle_reset_trigger(struct hl_device *hdev, u32 flags) { u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT; /* No consecutive mechanism when user context exists */ if (hdev->is_compute_ctx_active) - goto disable_pci; + return; /* * 'reset cause' is being updated here, because getting here @@ -1418,30 +1446,6 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags) } else { hdev->reset_info.reset_trigger_repeated = 1; } - - /* If reset is due to heartbeat, device CPU is no responsive in - * which case no point sending PCI disable message to it. - * - * If F/W is performing the reset, no need to send it a message to disable - * PCI access - */ - -disable_pci: - if ((flags & HL_DRV_RESET_HARD) && - !(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) { - /* Disable PCI access from device F/W so he won't send - * us additional interrupts. We disable MSI/MSI-X at - * the halt_engines function and we can't have the F/W - * sending us interrupts after that. We need to disable - * the access here because if the device is marked - * disable, the message won't be send. Also, in case - * of heartbeat, the device CPU is marked as disable - * so this message won't be sent - */ - if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) - dev_warn(hdev->dev, - "Failed to disable FW's PCI access\n"); - } } /* @@ -1562,6 +1566,7 @@ do_reset: escalate_reset_flow: handle_reset_trigger(hdev, flags); + send_disable_pci_access(hdev, flags); /* This also blocks future CS/VM/JOB completion operations */ hdev->disabled = true; diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 7b6ad3d7dbaa..8c3bcc50e560 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -662,6 +662,7 @@ struct hl_hints_range { * @user_interrupt_count: number of user interrupts. * @user_dec_intr_count: number of decoder interrupts exposed to user. * @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host. + * @eq_interrupt_id: interrupt id for EQ, uses to synchronize EQ interrupts in hard-reset. * @unexpected_user_error_interrupt_id: interrupt id used to indicate an unexpected user error. * @cache_line_size: device cache line size. * @server_type: Server type that the ASIC is currently installed in. @@ -793,6 +794,7 @@ struct asic_fixed_properties { u16 user_interrupt_count; u16 user_dec_intr_count; u16 tpc_interrupt_id; + u16 eq_interrupt_id; u16 unexpected_user_error_interrupt_id; u16 cache_line_size; u16 server_type; diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c index 08a4b1cf2b42..2ad8e4efce7f 100644 --- a/drivers/accel/habanalabs/gaudi/gaudi.c +++ b/drivers/accel/habanalabs/gaudi/gaudi.c @@ -682,6 +682,9 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) prop->first_available_user_interrupt = USHRT_MAX; prop->tpc_interrupt_id = USHRT_MAX; + /* single msi */ + prop->eq_interrupt_id = 0; + for (i = 0 ; i < HL_MAX_DCORES ; i++) prop->first_available_cq[i] = USHRT_MAX; diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index ce85308d03e9..554020026da8 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -2439,6 +2439,7 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->first_available_user_interrupt = GAUDI2_IRQ_NUM_USER_FIRST; prop->tpc_interrupt_id = GAUDI2_IRQ_NUM_TPC_ASSERT; + prop->eq_interrupt_id = GAUDI2_IRQ_NUM_EVENT_QUEUE; prop->unexpected_user_error_interrupt_id = GAUDI2_IRQ_NUM_UNEXPECTED_ERROR; prop->first_available_cq[0] = GAUDI2_RESERVED_CQ_NUMBER; diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c index 07d67878eac5..fb0ac9df841a 100644 --- a/drivers/accel/habanalabs/goya/goya.c +++ b/drivers/accel/habanalabs/goya/goya.c @@ -473,6 +473,7 @@ int goya_set_fixed_properties(struct hl_device *hdev) prop->first_available_user_interrupt = USHRT_MAX; prop->tpc_interrupt_id = USHRT_MAX; + prop->eq_interrupt_id = GOYA_EVENT_QUEUE_MSIX_IDX; for (i = 0 ; i < HL_MAX_DCORES ; i++) prop->first_available_cq[i] = USHRT_MAX; |