summaryrefslogtreecommitdiff
path: root/drivers/misc/habanalabs/common/device.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/misc/habanalabs/common/device.c')
-rw-r--r--drivers/misc/habanalabs/common/device.c163
1 files changed, 86 insertions, 77 deletions
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index ff4cbde289c0..97c7c86580e6 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -7,11 +7,11 @@
#define pr_fmt(fmt) "habanalabs: " fmt
+#include <uapi/misc/habanalabs.h>
#include "habanalabs.h"
#include <linux/pci.h>
#include <linux/hwmon.h>
-#include <uapi/misc/habanalabs.h>
enum hl_device_status hl_device_status(struct hl_device *hdev)
{
@@ -23,6 +23,8 @@ enum hl_device_status hl_device_status(struct hl_device *hdev)
status = HL_DEVICE_STATUS_NEEDS_RESET;
else if (hdev->disabled)
status = HL_DEVICE_STATUS_MALFUNCTION;
+ else if (!hdev->init_done)
+ status = HL_DEVICE_STATUS_IN_DEVICE_CREATION;
else
status = HL_DEVICE_STATUS_OPERATIONAL;
@@ -44,6 +46,7 @@ bool hl_device_operational(struct hl_device *hdev,
case HL_DEVICE_STATUS_NEEDS_RESET:
return false;
case HL_DEVICE_STATUS_OPERATIONAL:
+ case HL_DEVICE_STATUS_IN_DEVICE_CREATION:
default:
return true;
}
@@ -129,8 +132,8 @@ static int hl_device_release(struct inode *inode, struct file *filp)
hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr);
if (!hl_hpriv_put(hpriv))
- dev_warn(hdev->dev,
- "Device is still in use because there are live CS and/or memory mappings\n");
+ dev_notice(hdev->dev,
+ "User process closed FD but device still in use\n");
hdev->last_open_session_duration_jif =
jiffies - hdev->last_successful_open_jif;
@@ -308,9 +311,15 @@ static void device_hard_reset_pending(struct work_struct *work)
container_of(work, struct hl_device_reset_work,
reset_work.work);
struct hl_device *hdev = device_reset_work->hdev;
+ u32 flags;
int rc;
- rc = hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FROM_RESET_THREAD);
+ flags = HL_RESET_HARD | HL_RESET_FROM_RESET_THREAD;
+
+ if (device_reset_work->fw_reset)
+ flags |= HL_RESET_FW;
+
+ rc = hl_device_reset(hdev, flags);
if ((rc == -EBUSY) && !hdev->device_fini_pending) {
dev_info(hdev->dev,
"Could not reset device. will try again in %u seconds",
@@ -682,6 +691,44 @@ out:
return rc;
}
+static void take_release_locks(struct hl_device *hdev)
+{
+ /* Flush anyone that is inside the critical section of enqueue
+ * jobs to the H/W
+ */
+ hdev->asic_funcs->hw_queues_lock(hdev);
+ hdev->asic_funcs->hw_queues_unlock(hdev);
+
+ /* Flush processes that are sending message to CPU */
+ mutex_lock(&hdev->send_cpu_message_lock);
+ mutex_unlock(&hdev->send_cpu_message_lock);
+
+ /* Flush anyone that is inside device open */
+ mutex_lock(&hdev->fpriv_list_lock);
+ mutex_unlock(&hdev->fpriv_list_lock);
+}
+
+static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset)
+{
+ if (hard_reset)
+ device_late_fini(hdev);
+
+ /*
+ * Halt the engines and disable interrupts so we won't get any more
+ * completions from H/W and we won't have any accesses from the
+ * H/W to the host machine
+ */
+ hdev->asic_funcs->halt_engines(hdev, hard_reset, fw_reset);
+
+ /* Go over all the queues, release all CS and their jobs */
+ hl_cs_rollback_all(hdev);
+
+ /* Release all pending user interrupts, each pending user interrupt
+ * holds a reference to user context
+ */
+ hl_release_pending_user_interrupts(hdev);
+}
+
/*
* hl_device_suspend - initiate device suspend
*
@@ -707,16 +754,7 @@ int hl_device_suspend(struct hl_device *hdev)
/* This blocks all other stuff that is not blocked by in_reset */
hdev->disabled = true;
- /*
- * Flush anyone that is inside the critical section of enqueue
- * jobs to the H/W
- */
- hdev->asic_funcs->hw_queues_lock(hdev);
- hdev->asic_funcs->hw_queues_unlock(hdev);
-
- /* Flush processes that are sending message to CPU */
- mutex_lock(&hdev->send_cpu_message_lock);
- mutex_unlock(&hdev->send_cpu_message_lock);
+ take_release_locks(hdev);
rc = hdev->asic_funcs->suspend(hdev);
if (rc)
@@ -819,6 +857,11 @@ static int device_kill_open_processes(struct hl_device *hdev, u32 timeout)
usleep_range(1000, 10000);
put_task_struct(task);
+ } else {
+ dev_warn(hdev->dev,
+ "Can't get task struct for PID so giving up on killing process\n");
+ mutex_unlock(&hdev->fpriv_list_lock);
+ return -ETIME;
}
}
@@ -885,7 +928,7 @@ static void device_disable_open_processes(struct hl_device *hdev)
int hl_device_reset(struct hl_device *hdev, u32 flags)
{
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
- bool hard_reset, from_hard_reset_thread, hard_instead_soft = false;
+ bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false;
int i, rc;
if (!hdev->init_done) {
@@ -894,8 +937,9 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
return 0;
}
- hard_reset = (flags & HL_RESET_HARD) != 0;
- from_hard_reset_thread = (flags & HL_RESET_FROM_RESET_THREAD) != 0;
+ hard_reset = !!(flags & HL_RESET_HARD);
+ from_hard_reset_thread = !!(flags & HL_RESET_FROM_RESET_THREAD);
+ fw_reset = !!(flags & HL_RESET_FW);
if (!hard_reset && !hdev->supports_soft_reset) {
hard_instead_soft = true;
@@ -947,11 +991,13 @@ do_reset:
else
hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
- /*
- * if reset is due to heartbeat, device CPU is no responsive in
- * which case no point sending PCI disable message to it
+ /* If reset is due to heartbeat, device CPU is no responsive in
+ * which case no point sending PCI disable message to it.
+ *
+ * If F/W is performing the reset, no need to send it a message to disable
+ * PCI access
*/
- if (hard_reset && !(flags & HL_RESET_HEARTBEAT)) {
+ if (hard_reset && !(flags & (HL_RESET_HEARTBEAT | HL_RESET_FW))) {
/* Disable PCI access from device F/W so he won't send
* us additional interrupts. We disable MSI/MSI-X at
* the halt_engines function and we can't have the F/W
@@ -970,15 +1016,7 @@ do_reset:
/* This also blocks future CS/VM/JOB completion operations */
hdev->disabled = true;
- /* Flush anyone that is inside the critical section of enqueue
- * jobs to the H/W
- */
- hdev->asic_funcs->hw_queues_lock(hdev);
- hdev->asic_funcs->hw_queues_unlock(hdev);
-
- /* Flush anyone that is inside device open */
- mutex_lock(&hdev->fpriv_list_lock);
- mutex_unlock(&hdev->fpriv_list_lock);
+ take_release_locks(hdev);
dev_err(hdev->dev, "Going to RESET device!\n");
}
@@ -989,6 +1027,8 @@ again:
hdev->process_kill_trial_cnt = 0;
+ hdev->device_reset_work.fw_reset = fw_reset;
+
/*
* Because the reset function can't run from heartbeat work,
* we need to call the reset function from a dedicated work.
@@ -999,31 +1039,7 @@ again:
return 0;
}
- if (hard_reset) {
- device_late_fini(hdev);
-
- /*
- * Now that the heartbeat thread is closed, flush processes
- * which are sending messages to CPU
- */
- mutex_lock(&hdev->send_cpu_message_lock);
- mutex_unlock(&hdev->send_cpu_message_lock);
- }
-
- /*
- * Halt the engines and disable interrupts so we won't get any more
- * completions from H/W and we won't have any accesses from the
- * H/W to the host machine
- */
- hdev->asic_funcs->halt_engines(hdev, hard_reset);
-
- /* Go over all the queues, release all CS and their jobs */
- hl_cs_rollback_all(hdev);
-
- /* Release all pending user interrupts, each pending user interrupt
- * holds a reference to user context
- */
- hl_release_pending_user_interrupts(hdev);
+ cleanup_resources(hdev, hard_reset, fw_reset);
kill_processes:
if (hard_reset) {
@@ -1057,12 +1073,15 @@ kill_processes:
}
/* Reset the H/W. It will be in idle state after this returns */
- hdev->asic_funcs->hw_fini(hdev, hard_reset);
+ hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
if (hard_reset) {
+ hdev->fw_loader.linux_loaded = false;
+
/* Release kernel context */
if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1)
hdev->kernel_ctx = NULL;
+
hl_vm_fini(hdev);
hl_mmu_fini(hdev);
hl_eq_reset(hdev, &hdev->event_queue);
@@ -1292,6 +1311,10 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
if (rc)
goto user_interrupts_fini;
+
+ /* initialize completion structure for multi CS wait */
+ hl_multi_cs_completion_init(hdev);
+
/*
* Initialize the H/W queues. Must be done before hw_init, because
* there the addresses of the kernel queue are being written to the
@@ -1361,6 +1384,8 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
hdev->compute_ctx = NULL;
+ hdev->asic_funcs->state_dump_init(hdev);
+
hl_debugfs_add_device(hdev);
/* debugfs nodes are created in hl_ctx_init so it must be called after
@@ -1567,31 +1592,13 @@ void hl_device_fini(struct hl_device *hdev)
/* Mark device as disabled */
hdev->disabled = true;
- /* Flush anyone that is inside the critical section of enqueue
- * jobs to the H/W
- */
- hdev->asic_funcs->hw_queues_lock(hdev);
- hdev->asic_funcs->hw_queues_unlock(hdev);
-
- /* Flush anyone that is inside device open */
- mutex_lock(&hdev->fpriv_list_lock);
- mutex_unlock(&hdev->fpriv_list_lock);
+ take_release_locks(hdev);
hdev->hard_reset_pending = true;
hl_hwmon_fini(hdev);
- device_late_fini(hdev);
-
- /*
- * Halt the engines and disable interrupts so we won't get any more
- * completions from H/W and we won't have any accesses from the
- * H/W to the host machine
- */
- hdev->asic_funcs->halt_engines(hdev, true);
-
- /* Go over all the queues, release all CS and their jobs */
- hl_cs_rollback_all(hdev);
+ cleanup_resources(hdev, true, false);
/* Kill processes here after CS rollback. This is because the process
* can't really exit until all its CSs are done, which is what we
@@ -1610,7 +1617,9 @@ void hl_device_fini(struct hl_device *hdev)
hl_cb_pool_fini(hdev);
/* Reset the H/W. It will be in idle state after this returns */
- hdev->asic_funcs->hw_fini(hdev, true);
+ hdev->asic_funcs->hw_fini(hdev, true, false);
+
+ hdev->fw_loader.linux_loaded = false;
/* Release kernel context */
if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))