summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorOded Gabbay <oded.gabbay@gmail.com>2019-02-28 11:46:12 +0300
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2019-02-28 15:04:59 +0300
commita28ce422a6d926c11d7e72a83ccaa4f9b06077ea (patch)
tree65f08f69e8eec0cf7d5272a87e2e612f975c04ec
parent27ca384cb7c44b8b16ea43f9aed930664140360e (diff)
downloadlinux-a28ce422a6d926c11d7e72a83ccaa4f9b06077ea.tar.xz
habanalabs: disable CPU access on timeouts
This patch provides a workaround for a bug in the F/W where the response time for a request from KMD may take more than 100ms. This could cause the queue between KMD and the F/W to get out of sync. The WA is to: 1. Increase the timeout of ALL requests to 1s. 2. In case a request isn't answered in time, mark the state as "cpu_disabled" and prevent sending further requests from KMD to the F/W. This will eventually lead to a heartbeat failure and hard reset of the device. Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r--drivers/misc/habanalabs/debugfs.c6
-rw-r--r--drivers/misc/habanalabs/device.c2
-rw-r--r--drivers/misc/habanalabs/goya/goya.c9
-rw-r--r--drivers/misc/habanalabs/habanalabs.h2
-rw-r--r--drivers/misc/habanalabs/hwmon.c2
-rw-r--r--drivers/misc/habanalabs/sysfs.c4
6 files changed, 18 insertions, 7 deletions
diff --git a/drivers/misc/habanalabs/debugfs.c b/drivers/misc/habanalabs/debugfs.c
index f472b572faea..1d2bbcf90f16 100644
--- a/drivers/misc/habanalabs/debugfs.c
+++ b/drivers/misc/habanalabs/debugfs.c
@@ -723,7 +723,7 @@ static ssize_t hl_device_read(struct file *f, char __user *buf,
return 0;
sprintf(tmp_buf,
- "Valid values are: disable, enable, suspend, resume\n");
+ "Valid values: disable, enable, suspend, resume, cpu_timeout\n");
rc = simple_read_from_buffer(buf, strlen(tmp_buf) + 1, ppos, tmp_buf,
strlen(tmp_buf) + 1);
@@ -751,9 +751,11 @@ static ssize_t hl_device_write(struct file *f, const char __user *buf,
hdev->asic_funcs->suspend(hdev);
} else if (strncmp("resume", data, strlen("resume")) == 0) {
hdev->asic_funcs->resume(hdev);
+ } else if (strncmp("cpu_timeout", data, strlen("cpu_timeout")) == 0) {
+ hdev->device_cpu_disabled = true;
} else {
dev_err(hdev->dev,
- "Valid values are: disable, enable, suspend, resume\n");
+ "Valid values: disable, enable, suspend, resume, cpu_timeout\n");
count = -EINVAL;
}
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 120d30a13afb..de46aa6ed154 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -636,6 +636,8 @@ again:
/* Finished tear-down, starting to re-initialize */
if (hard_reset) {
+ hdev->device_cpu_disabled = false;
+
/* Allocate the kernel context */
hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
GFP_KERNEL);
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 7c2edabe20bd..5780041abe32 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -3232,6 +3232,11 @@ int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
if (hdev->disabled)
goto out;
+ if (hdev->device_cpu_disabled) {
+ rc = -EIO;
+ goto out;
+ }
+
rc = hl_hw_queue_send_cb_no_cmpl(hdev, GOYA_QUEUE_ID_CPU_PQ, len,
pkt_dma_addr);
if (rc) {
@@ -3245,8 +3250,8 @@ int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
hl_hw_queue_inc_ci_kernel(hdev, GOYA_QUEUE_ID_CPU_PQ);
if (rc == -ETIMEDOUT) {
- dev_err(hdev->dev,
- "Timeout while waiting for CPU packet fence\n");
+ dev_err(hdev->dev, "Timeout while waiting for device CPU\n");
+ hdev->device_cpu_disabled = true;
goto out;
}
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 59b25c6fae00..a7c95e9f9b9a 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -1079,6 +1079,7 @@ struct hl_device_reset_work {
* @dram_default_page_mapping: is DRAM default page mapping enabled.
* @init_done: is the initialization of the device done.
* @mmu_enable: is MMU enabled.
+ * @device_cpu_disabled: is the device CPU disabled (due to timeouts)
*/
struct hl_device {
struct pci_dev *pdev;
@@ -1146,6 +1147,7 @@ struct hl_device {
u8 dram_supports_virtual_memory;
u8 dram_default_page_mapping;
u8 init_done;
+ u8 device_cpu_disabled;
/* Parameters for bring-up */
u8 mmu_enable;
diff --git a/drivers/misc/habanalabs/hwmon.c b/drivers/misc/habanalabs/hwmon.c
index 9c359a1dd868..7eec21f9b96e 100644
--- a/drivers/misc/habanalabs/hwmon.c
+++ b/drivers/misc/habanalabs/hwmon.c
@@ -10,7 +10,7 @@
#include <linux/pci.h>
#include <linux/hwmon.h>
-#define SENSORS_PKT_TIMEOUT 100000 /* 100ms */
+#define SENSORS_PKT_TIMEOUT 1000000 /* 1s */
#define HWMON_NR_SENSOR_TYPES (hwmon_pwm + 1)
int hl_build_hwmon_channel_info(struct hl_device *hdev,
diff --git a/drivers/misc/habanalabs/sysfs.c b/drivers/misc/habanalabs/sysfs.c
index 6d80e7e0885c..12c782112a8c 100644
--- a/drivers/misc/habanalabs/sysfs.c
+++ b/drivers/misc/habanalabs/sysfs.c
@@ -9,8 +9,8 @@
#include <linux/pci.h>
-#define SET_CLK_PKT_TIMEOUT 200000 /* 200ms */
-#define SET_PWR_PKT_TIMEOUT 400000 /* 400ms */
+#define SET_CLK_PKT_TIMEOUT 1000000 /* 1s */
+#define SET_PWR_PKT_TIMEOUT 1000000 /* 1s */
long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
{