summaryrefslogtreecommitdiff
path: root/drivers/misc/habanalabs
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/misc/habanalabs')
-rw-r--r--drivers/misc/habanalabs/Makefile11
-rw-r--r--drivers/misc/habanalabs/common/Makefile7
-rw-r--r--drivers/misc/habanalabs/common/asid.c (renamed from drivers/misc/habanalabs/asid.c)0
-rw-r--r--drivers/misc/habanalabs/common/command_buffer.c (renamed from drivers/misc/habanalabs/command_buffer.c)82
-rw-r--r--drivers/misc/habanalabs/common/command_submission.c (renamed from drivers/misc/habanalabs/command_submission.c)113
-rw-r--r--drivers/misc/habanalabs/common/context.c (renamed from drivers/misc/habanalabs/context.c)39
-rw-r--r--drivers/misc/habanalabs/common/debugfs.c (renamed from drivers/misc/habanalabs/debugfs.c)29
-rw-r--r--drivers/misc/habanalabs/common/device.c (renamed from drivers/misc/habanalabs/device.c)90
-rw-r--r--drivers/misc/habanalabs/common/firmware_if.c (renamed from drivers/misc/habanalabs/firmware_if.c)114
-rw-r--r--drivers/misc/habanalabs/common/habanalabs.h (renamed from drivers/misc/habanalabs/habanalabs.h)191
-rw-r--r--drivers/misc/habanalabs/common/habanalabs_drv.c (renamed from drivers/misc/habanalabs/habanalabs_drv.c)3
-rw-r--r--drivers/misc/habanalabs/common/habanalabs_ioctl.c (renamed from drivers/misc/habanalabs/habanalabs_ioctl.c)24
-rw-r--r--drivers/misc/habanalabs/common/hw_queue.c (renamed from drivers/misc/habanalabs/hw_queue.c)165
-rw-r--r--drivers/misc/habanalabs/common/hwmon.c (renamed from drivers/misc/habanalabs/hwmon.c)19
-rw-r--r--drivers/misc/habanalabs/common/irq.c (renamed from drivers/misc/habanalabs/irq.c)38
-rw-r--r--drivers/misc/habanalabs/common/memory.c (renamed from drivers/misc/habanalabs/memory.c)5
-rw-r--r--drivers/misc/habanalabs/common/mmu.c (renamed from drivers/misc/habanalabs/mmu.c)3
-rw-r--r--drivers/misc/habanalabs/common/pci.c (renamed from drivers/misc/habanalabs/pci.c)151
-rw-r--r--drivers/misc/habanalabs/common/sysfs.c (renamed from drivers/misc/habanalabs/sysfs.c)14
-rw-r--r--drivers/misc/habanalabs/gaudi/Makefile2
-rw-r--r--drivers/misc/habanalabs/gaudi/gaudi.c1069
-rw-r--r--drivers/misc/habanalabs/gaudi/gaudiP.h27
-rw-r--r--drivers/misc/habanalabs/gaudi/gaudi_coresight.c12
-rw-r--r--drivers/misc/habanalabs/gaudi/gaudi_hwmgr.c2
-rw-r--r--drivers/misc/habanalabs/gaudi/gaudi_security.c5
-rw-r--r--drivers/misc/habanalabs/goya/Makefile2
-rw-r--r--drivers/misc/habanalabs/goya/goya.c216
-rw-r--r--drivers/misc/habanalabs/goya/goyaP.h24
-rw-r--r--drivers/misc/habanalabs/goya/goya_coresight.c15
-rw-r--r--drivers/misc/habanalabs/goya/goya_security.c2
-rw-r--r--drivers/misc/habanalabs/include/common/armcp_if.h (renamed from drivers/misc/habanalabs/include/armcp_if.h)14
-rw-r--r--drivers/misc/habanalabs/include/common/hl_boot_if.h (renamed from drivers/misc/habanalabs/include/hl_boot_if.h)14
-rw-r--r--drivers/misc/habanalabs/include/common/qman_if.h (renamed from drivers/misc/habanalabs/include/qman_if.h)0
-rw-r--r--drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h21
-rw-r--r--drivers/misc/habanalabs/include/gaudi/asic_reg/psoc_cpu_pll_regs.h114
-rw-r--r--drivers/misc/habanalabs/include/gaudi/gaudi_masks.h3
-rw-r--r--drivers/misc/habanalabs/include/gaudi/gaudi_packets.h7
37 files changed, 1493 insertions, 1154 deletions
diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile
index 421ebd903069..a786c0a7de9a 100644
--- a/drivers/misc/habanalabs/Makefile
+++ b/drivers/misc/habanalabs/Makefile
@@ -3,16 +3,15 @@
# Makefile for HabanaLabs AI accelerators driver
#
-obj-m := habanalabs.o
+obj-$(CONFIG_HABANA_AI) := habanalabs.o
-habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \
- command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o memory.o \
- command_submission.o mmu.o firmware_if.o pci.o
-
-habanalabs-$(CONFIG_DEBUG_FS) += debugfs.o
+include $(src)/common/Makefile
+habanalabs-y += $(HL_COMMON_FILES)
include $(src)/goya/Makefile
habanalabs-y += $(HL_GOYA_FILES)
include $(src)/gaudi/Makefile
habanalabs-y += $(HL_GAUDI_FILES)
+
+habanalabs-$(CONFIG_DEBUG_FS) += common/debugfs.o
diff --git a/drivers/misc/habanalabs/common/Makefile b/drivers/misc/habanalabs/common/Makefile
new file mode 100644
index 000000000000..b984bfa4face
--- /dev/null
+++ b/drivers/misc/habanalabs/common/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+HL_COMMON_FILES := common/habanalabs_drv.o common/device.o common/context.o \
+ common/asid.o common/habanalabs_ioctl.o \
+ common/command_buffer.o common/hw_queue.o common/irq.o \
+ common/sysfs.o common/hwmon.o common/memory.o \
+ common/command_submission.o common/mmu.o common/firmware_if.o \
+ common/pci.o
diff --git a/drivers/misc/habanalabs/asid.c b/drivers/misc/habanalabs/common/asid.c
index a2fdf31cf27c..a2fdf31cf27c 100644
--- a/drivers/misc/habanalabs/asid.c
+++ b/drivers/misc/habanalabs/common/asid.c
diff --git a/drivers/misc/habanalabs/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c
index 02d13f71b1df..7c38c4f7f9c0 100644
--- a/drivers/misc/habanalabs/command_buffer.c
+++ b/drivers/misc/habanalabs/common/command_buffer.c
@@ -10,12 +10,18 @@
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/genalloc.h>
static void cb_fini(struct hl_device *hdev, struct hl_cb *cb)
{
- hdev->asic_funcs->asic_dma_free_coherent(hdev, cb->size,
- (void *) (uintptr_t) cb->kernel_address,
- cb->bus_address);
+ if (cb->is_internal)
+ gen_pool_free(hdev->internal_cb_pool,
+ cb->kernel_address, cb->size);
+ else
+ hdev->asic_funcs->asic_dma_free_coherent(hdev, cb->size,
+ (void *) (uintptr_t) cb->kernel_address,
+ cb->bus_address);
+
kfree(cb);
}
@@ -44,9 +50,10 @@ static void cb_release(struct kref *ref)
}
static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size,
- int ctx_id)
+ int ctx_id, bool internal_cb)
{
struct hl_cb *cb;
+ u32 cb_offset;
void *p;
/*
@@ -65,13 +72,25 @@ static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size,
if (!cb)
return NULL;
- if (ctx_id == HL_KERNEL_ASID_ID)
+ if (internal_cb) {
+ p = (void *) gen_pool_alloc(hdev->internal_cb_pool, cb_size);
+ if (!p) {
+ kfree(cb);
+ return NULL;
+ }
+
+ cb_offset = p - hdev->internal_cb_pool_virt_addr;
+ cb->is_internal = true;
+ cb->bus_address = hdev->internal_cb_va_base + cb_offset;
+ } else if (ctx_id == HL_KERNEL_ASID_ID) {
p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, cb_size,
&cb->bus_address, GFP_ATOMIC);
- else
+ } else {
p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, cb_size,
&cb->bus_address,
GFP_USER | __GFP_ZERO);
+ }
+
if (!p) {
dev_err(hdev->dev,
"failed to allocate %d of dma memory for CB\n",
@@ -87,7 +106,7 @@ static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size,
}
int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
- u32 cb_size, u64 *handle, int ctx_id)
+ u32 cb_size, u64 *handle, int ctx_id, bool internal_cb)
{
struct hl_cb *cb;
bool alloc_new_cb = true;
@@ -112,28 +131,30 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
goto out_err;
}
- /* Minimum allocation must be PAGE SIZE */
- if (cb_size < PAGE_SIZE)
- cb_size = PAGE_SIZE;
-
- if (ctx_id == HL_KERNEL_ASID_ID &&
- cb_size <= hdev->asic_prop.cb_pool_cb_size) {
-
- spin_lock(&hdev->cb_pool_lock);
- if (!list_empty(&hdev->cb_pool)) {
- cb = list_first_entry(&hdev->cb_pool, typeof(*cb),
- pool_list);
- list_del(&cb->pool_list);
- spin_unlock(&hdev->cb_pool_lock);
- alloc_new_cb = false;
- } else {
- spin_unlock(&hdev->cb_pool_lock);
- dev_dbg(hdev->dev, "CB pool is empty\n");
+ if (!internal_cb) {
+ /* Minimum allocation must be PAGE SIZE */
+ if (cb_size < PAGE_SIZE)
+ cb_size = PAGE_SIZE;
+
+ if (ctx_id == HL_KERNEL_ASID_ID &&
+ cb_size <= hdev->asic_prop.cb_pool_cb_size) {
+
+ spin_lock(&hdev->cb_pool_lock);
+ if (!list_empty(&hdev->cb_pool)) {
+ cb = list_first_entry(&hdev->cb_pool,
+ typeof(*cb), pool_list);
+ list_del(&cb->pool_list);
+ spin_unlock(&hdev->cb_pool_lock);
+ alloc_new_cb = false;
+ } else {
+ spin_unlock(&hdev->cb_pool_lock);
+ dev_dbg(hdev->dev, "CB pool is empty\n");
+ }
}
}
if (alloc_new_cb) {
- cb = hl_cb_alloc(hdev, cb_size, ctx_id);
+ cb = hl_cb_alloc(hdev, cb_size, ctx_id, internal_cb);
if (!cb) {
rc = -ENOMEM;
goto out_err;
@@ -229,8 +250,8 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
rc = -EINVAL;
} else {
rc = hl_cb_create(hdev, &hpriv->cb_mgr,
- args->in.cb_size, &handle,
- hpriv->ctx->asid);
+ args->in.cb_size, &handle,
+ hpriv->ctx->asid, false);
}
memset(args, 0, sizeof(*args));
@@ -398,14 +419,15 @@ void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr)
idr_destroy(&mgr->cb_handles);
}
-struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size)
+struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size,
+ bool internal_cb)
{
u64 cb_handle;
struct hl_cb *cb;
int rc;
rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, cb_size, &cb_handle,
- HL_KERNEL_ASID_ID);
+ HL_KERNEL_ASID_ID, internal_cb);
if (rc) {
dev_err(hdev->dev,
"Failed to allocate CB for the kernel driver %d\n", rc);
@@ -437,7 +459,7 @@ int hl_cb_pool_init(struct hl_device *hdev)
for (i = 0 ; i < hdev->asic_prop.cb_pool_cb_cnt ; i++) {
cb = hl_cb_alloc(hdev, hdev->asic_prop.cb_pool_cb_size,
- HL_KERNEL_ASID_ID);
+ HL_KERNEL_ASID_ID, false);
if (cb) {
cb->is_pool = true;
list_add(&cb->pool_list, &hdev->cb_pool);
diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index f82974a916c3..b9840e368eb5 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -62,6 +62,12 @@ static void hl_fence_release(struct dma_fence *fence)
container_of(fence, struct hl_cs_compl, base_fence);
struct hl_device *hdev = hl_cs_cmpl->hdev;
+ /* EBUSY means the CS was never submitted and hence we don't have
+ * an attached hw_sob object that we should handle here
+ */
+ if (fence->error == -EBUSY)
+ goto free;
+
if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
(hl_cs_cmpl->type == CS_TYPE_WAIT)) {
@@ -92,6 +98,7 @@ static void hl_fence_release(struct dma_fence *fence)
kref_put(&hl_cs_cmpl->hw_sob->kref, hl_sob_reset);
}
+free:
kfree_rcu(hl_cs_cmpl, base_fence.rcu);
}
@@ -239,6 +246,18 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
kfree(job);
}
+static void cs_counters_aggregate(struct hl_device *hdev, struct hl_ctx *ctx)
+{
+ hdev->aggregated_cs_counters.device_in_reset_drop_cnt +=
+ ctx->cs_counters.device_in_reset_drop_cnt;
+ hdev->aggregated_cs_counters.out_of_mem_drop_cnt +=
+ ctx->cs_counters.out_of_mem_drop_cnt;
+ hdev->aggregated_cs_counters.parsing_drop_cnt +=
+ ctx->cs_counters.parsing_drop_cnt;
+ hdev->aggregated_cs_counters.queue_full_drop_cnt +=
+ ctx->cs_counters.queue_full_drop_cnt;
+}
+
static void cs_do_release(struct kref *ref)
{
struct hl_cs *cs = container_of(ref, struct hl_cs,
@@ -328,21 +347,30 @@ static void cs_do_release(struct kref *ref)
hl_ctx_put(cs->ctx);
+ /* We need to mark an error for not submitted because in that case
+ * the dma fence release flow is different. Mainly, we don't need
+ * to handle hw_sob for signal/wait
+ */
if (cs->timedout)
dma_fence_set_error(cs->fence, -ETIMEDOUT);
else if (cs->aborted)
dma_fence_set_error(cs->fence, -EIO);
+ else if (!cs->submitted)
+ dma_fence_set_error(cs->fence, -EBUSY);
dma_fence_signal(cs->fence);
dma_fence_put(cs->fence);
+ cs_counters_aggregate(hdev, cs->ctx);
+
+ kfree(cs->jobs_in_queue_cnt);
kfree(cs);
}
static void cs_timedout(struct work_struct *work)
{
struct hl_device *hdev;
- int ctx_asid, rc;
+ int rc;
struct hl_cs *cs = container_of(work, struct hl_cs,
work_tdr.work);
rc = cs_get_unless_zero(cs);
@@ -358,11 +386,10 @@ static void cs_timedout(struct work_struct *work)
cs->timedout = true;
hdev = cs->ctx->hdev;
- ctx_asid = cs->ctx->asid;
- /* TODO: add information about last signaled seq and last emitted seq */
- dev_err(hdev->dev, "User %d command submission %llu got stuck!\n",
- ctx_asid, cs->sequence);
+ dev_err(hdev->dev,
+ "Command submission %llu has not finished in time!\n",
+ cs->sequence);
cs_put(cs);
@@ -405,21 +432,29 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
spin_lock(&ctx->cs_lock);
cs_cmpl->cs_seq = ctx->cs_sequence;
- other = ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)];
+ other = ctx->cs_pending[cs_cmpl->cs_seq &
+ (hdev->asic_prop.max_pending_cs - 1)];
if ((other) && (!dma_fence_is_signaled(other))) {
- spin_unlock(&ctx->cs_lock);
dev_dbg(hdev->dev,
"Rejecting CS because of too many in-flights CS\n");
rc = -EAGAIN;
goto free_fence;
}
+ cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
+ sizeof(*cs->jobs_in_queue_cnt), GFP_ATOMIC);
+ if (!cs->jobs_in_queue_cnt) {
+ rc = -ENOMEM;
+ goto free_fence;
+ }
+
dma_fence_init(&cs_cmpl->base_fence, &hl_fence_ops, &cs_cmpl->lock,
ctx->asid, ctx->cs_sequence);
cs->sequence = cs_cmpl->cs_seq;
- ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)] =
+ ctx->cs_pending[cs_cmpl->cs_seq &
+ (hdev->asic_prop.max_pending_cs - 1)] =
&cs_cmpl->base_fence;
ctx->cs_sequence++;
@@ -434,6 +469,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
return 0;
free_fence:
+ spin_unlock(&ctx->cs_lock);
kfree(cs_cmpl);
free_cs:
kfree(cs);
@@ -450,10 +486,12 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
void hl_cs_rollback_all(struct hl_device *hdev)
{
+ int i;
struct hl_cs *cs, *tmp;
/* flush all completions */
- flush_workqueue(hdev->cq_wq);
+ for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
+ flush_workqueue(hdev->cq_wq[i]);
/* Make sure we don't have leftovers in the H/W queues mirror list */
list_for_each_entry_safe(cs, tmp, &hdev->hw_queues_mirror_list,
@@ -486,10 +524,18 @@ static int validate_queue_index(struct hl_device *hdev,
struct asic_fixed_properties *asic = &hdev->asic_prop;
struct hw_queue_properties *hw_queue_prop;
+ /* This must be checked here to prevent out-of-bounds access to
+ * hw_queues_props array
+ */
+ if (chunk->queue_index >= asic->max_queues) {
+ dev_err(hdev->dev, "Queue index %d is invalid\n",
+ chunk->queue_index);
+ return -EINVAL;
+ }
+
hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];
- if ((chunk->queue_index >= HL_MAX_QUEUES) ||
- (hw_queue_prop->type == QUEUE_TYPE_NA)) {
+ if (hw_queue_prop->type == QUEUE_TYPE_NA) {
dev_err(hdev->dev, "Queue index %d is invalid\n",
chunk->queue_index);
return -EINVAL;
@@ -617,12 +663,15 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
rc = validate_queue_index(hdev, chunk, &queue_type,
&is_kernel_allocated_cb);
- if (rc)
+ if (rc) {
+ hpriv->ctx->cs_counters.parsing_drop_cnt++;
goto free_cs_object;
+ }
if (is_kernel_allocated_cb) {
cb = get_cb_from_cs_chunk(hdev, &hpriv->cb_mgr, chunk);
if (!cb) {
+ hpriv->ctx->cs_counters.parsing_drop_cnt++;
rc = -EINVAL;
goto free_cs_object;
}
@@ -636,6 +685,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
job = hl_cs_allocate_job(hdev, queue_type,
is_kernel_allocated_cb);
if (!job) {
+ hpriv->ctx->cs_counters.out_of_mem_drop_cnt++;
dev_err(hdev->dev, "Failed to allocate a new job\n");
rc = -ENOMEM;
if (is_kernel_allocated_cb)
@@ -668,6 +718,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
rc = cs_parser(hpriv, job);
if (rc) {
+ hpriv->ctx->cs_counters.parsing_drop_cnt++;
dev_err(hdev->dev,
"Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n",
cs->ctx->asid, cs->sequence, job->id, rc);
@@ -676,6 +727,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
}
if (int_queues_only) {
+ hpriv->ctx->cs_counters.parsing_drop_cnt++;
dev_err(hdev->dev,
"Reject CS %d.%llu because only internal queues jobs are present\n",
cs->ctx->asid, cs->sequence);
@@ -725,6 +777,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
struct hl_cs_job *job;
struct hl_cs *cs;
struct hl_cb *cb;
+ enum hl_queue_type q_type;
u64 *signal_seq_arr = NULL, signal_seq;
u32 size_to_copy, q_idx, signal_seq_arr_len, cb_size;
int rc;
@@ -757,9 +810,10 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
chunk = &cs_chunk_array[0];
q_idx = chunk->queue_index;
hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
+ q_type = hw_queue_prop->type;
- if ((q_idx >= HL_MAX_QUEUES) ||
- (hw_queue_prop->type != QUEUE_TYPE_EXT)) {
+ if ((q_idx >= hdev->asic_prop.max_queues) ||
+ (!hw_queue_prop->supports_sync_stream)) {
dev_err(hdev->dev, "Queue index %d is invalid\n", q_idx);
rc = -EINVAL;
goto free_cs_chunk_array;
@@ -856,25 +910,28 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
*cs_seq = cs->sequence;
- job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true);
+ job = hl_cs_allocate_job(hdev, q_type, true);
if (!job) {
+ ctx->cs_counters.out_of_mem_drop_cnt++;
dev_err(hdev->dev, "Failed to allocate a new job\n");
rc = -ENOMEM;
goto put_cs;
}
- cb = hl_cb_kernel_create(hdev, PAGE_SIZE);
+ if (cs->type == CS_TYPE_WAIT)
+ cb_size = hdev->asic_funcs->get_wait_cb_size(hdev);
+ else
+ cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);
+
+ cb = hl_cb_kernel_create(hdev, cb_size,
+ q_type == QUEUE_TYPE_HW && hdev->mmu_enable);
if (!cb) {
+ ctx->cs_counters.out_of_mem_drop_cnt++;
kfree(job);
rc = -EFAULT;
goto put_cs;
}
- if (cs->type == CS_TYPE_WAIT)
- cb_size = hdev->asic_funcs->get_wait_cb_size(hdev);
- else
- cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);
-
job->id = 0;
job->cs = cs;
job->user_cb = cb;
@@ -1113,7 +1170,7 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev,
rc = PTR_ERR(fence);
if (rc == -EINVAL)
dev_notice_ratelimited(hdev->dev,
- "Can't wait on seq %llu because current CS is at seq %llu\n",
+ "Can't wait on CS %llu because current CS is at seq %llu\n",
seq, ctx->cs_sequence);
} else if (fence) {
rc = dma_fence_wait_timeout(fence, true, timeout);
@@ -1146,15 +1203,21 @@ int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
memset(args, 0, sizeof(*args));
if (rc < 0) {
- dev_err_ratelimited(hdev->dev,
- "Error %ld on waiting for CS handle %llu\n",
- rc, seq);
if (rc == -ERESTARTSYS) {
+ dev_err_ratelimited(hdev->dev,
+ "user process got signal while waiting for CS handle %llu\n",
+ seq);
args->out.status = HL_WAIT_CS_STATUS_INTERRUPTED;
rc = -EINTR;
} else if (rc == -ETIMEDOUT) {
+ dev_err_ratelimited(hdev->dev,
+ "CS %llu has timed-out while user process is waiting for it\n",
+ seq);
args->out.status = HL_WAIT_CS_STATUS_TIMEDOUT;
} else if (rc == -EIO) {
+ dev_err_ratelimited(hdev->dev,
+ "CS %llu has been aborted while user process is waiting for it\n",
+ seq);
args->out.status = HL_WAIT_CS_STATUS_ABORTED;
}
return rc;
diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/common/context.c
index ec92b3506b1f..3e375958e73b 100644
--- a/drivers/misc/habanalabs/context.c
+++ b/drivers/misc/habanalabs/common/context.c
@@ -22,9 +22,11 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
* to this function unless the ref count is 0
*/
- for (i = 0 ; i < HL_MAX_PENDING_CS ; i++)
+ for (i = 0 ; i < hdev->asic_prop.max_pending_cs ; i++)
dma_fence_put(ctx->cs_pending[i]);
+ kfree(ctx->cs_pending);
+
if (ctx->asid != HL_KERNEL_ASID_ID) {
/* The engines are stopped as there is no executing CS, but the
* Coresight might be still working by accessing addresses
@@ -110,8 +112,7 @@ void hl_ctx_free(struct hl_device *hdev, struct hl_ctx *ctx)
return;
dev_warn(hdev->dev,
- "Context %d closed or terminated but its CS are executing\n",
- ctx->asid);
+ "user process released device but its command submissions are still executing\n");
}
int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
@@ -126,34 +127,49 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
spin_lock_init(&ctx->cs_lock);
atomic_set(&ctx->thread_ctx_switch_token, 1);
ctx->thread_ctx_switch_wait_token = 0;
+ ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs,
+ sizeof(struct dma_fence *),
+ GFP_KERNEL);
+ if (!ctx->cs_pending)
+ return -ENOMEM;
if (is_kernel_ctx) {
ctx->asid = HL_KERNEL_ASID_ID; /* Kernel driver gets ASID 0 */
rc = hl_mmu_ctx_init(ctx);
if (rc) {
dev_err(hdev->dev, "Failed to init mmu ctx module\n");
- goto mem_ctx_err;
+ goto err_free_cs_pending;
}
} else {
ctx->asid = hl_asid_alloc(hdev);
if (!ctx->asid) {
dev_err(hdev->dev, "No free ASID, failed to create context\n");
- return -ENOMEM;
+ rc = -ENOMEM;
+ goto err_free_cs_pending;
}
rc = hl_vm_ctx_init(ctx);
if (rc) {
dev_err(hdev->dev, "Failed to init mem ctx module\n");
rc = -ENOMEM;
- goto mem_ctx_err;
+ goto err_asid_free;
+ }
+
+ rc = hdev->asic_funcs->ctx_init(ctx);
+ if (rc) {
+ dev_err(hdev->dev, "ctx_init failed\n");
+ goto err_vm_ctx_fini;
}
}
return 0;
-mem_ctx_err:
- if (ctx->asid != HL_KERNEL_ASID_ID)
- hl_asid_free(hdev, ctx->asid);
+err_vm_ctx_fini:
+ hl_vm_ctx_fini(ctx);
+err_asid_free:
+ hl_asid_free(hdev, ctx->asid);
+err_free_cs_pending:
+ kfree(ctx->cs_pending);
return rc;
}
@@ -170,6 +186,7 @@ int hl_ctx_put(struct hl_ctx *ctx)
struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
{
+ struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop;
struct dma_fence *fence;
spin_lock(&ctx->cs_lock);
@@ -179,13 +196,13 @@ struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
return ERR_PTR(-EINVAL);
}
- if (seq + HL_MAX_PENDING_CS < ctx->cs_sequence) {
+ if (seq + asic_prop->max_pending_cs < ctx->cs_sequence) {
spin_unlock(&ctx->cs_lock);
return NULL;
}
fence = dma_fence_get(
- ctx->cs_pending[seq & (HL_MAX_PENDING_CS - 1)]);
+ ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)]);
spin_unlock(&ctx->cs_lock);
return fence;
diff --git a/drivers/misc/habanalabs/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index 3c8dcdfba20c..c50c6fc9e905 100644
--- a/drivers/misc/habanalabs/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -6,7 +6,7 @@
*/
#include "habanalabs.h"
-#include "include/hw_ip/mmu/mmu_general.h"
+#include "../include/hw_ip/mmu/mmu_general.h"
#include <linux/pci.h>
#include <linux/debugfs.h>
@@ -36,7 +36,7 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
pkt.i2c_reg = i2c_reg;
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- HL_DEVICE_TIMEOUT_USEC, (long *) val);
+ 0, (long *) val);
if (rc)
dev_err(hdev->dev, "Failed to read from I2C, error %d\n", rc);
@@ -63,7 +63,7 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
pkt.value = cpu_to_le64(val);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- HL_DEVICE_TIMEOUT_USEC, NULL);
+ 0, NULL);
if (rc)
dev_err(hdev->dev, "Failed to write to I2C, error %d\n", rc);
@@ -87,7 +87,7 @@ static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
pkt.value = cpu_to_le64(state);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- HL_DEVICE_TIMEOUT_USEC, NULL);
+ 0, NULL);
if (rc)
dev_err(hdev->dev, "Failed to set LED %d, error %d\n", led, rc);
@@ -480,7 +480,7 @@ out:
return 0;
}
-static ssize_t mmu_write(struct file *file, const char __user *buf,
+static ssize_t mmu_asid_va_write(struct file *file, const char __user *buf,
size_t count, loff_t *f_pos)
{
struct seq_file *s = file->private_data;
@@ -981,7 +981,7 @@ static ssize_t hl_clk_gate_read(struct file *f, char __user *buf,
if (*ppos)
return 0;
- sprintf(tmp_buf, "%d\n", hdev->clock_gating);
+ sprintf(tmp_buf, "0x%llx\n", hdev->clock_gating_mask);
rc = simple_read_from_buffer(buf, strlen(tmp_buf) + 1, ppos, tmp_buf,
strlen(tmp_buf) + 1);
@@ -993,7 +993,7 @@ static ssize_t hl_clk_gate_write(struct file *f, const char __user *buf,
{
struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
struct hl_device *hdev = entry->hdev;
- u32 value;
+ u64 value;
ssize_t rc;
if (atomic_read(&hdev->in_reset)) {
@@ -1002,19 +1002,12 @@ static ssize_t hl_clk_gate_write(struct file *f, const char __user *buf,
return 0;
}
- rc = kstrtouint_from_user(buf, count, 10, &value);
+ rc = kstrtoull_from_user(buf, count, 16, &value);
if (rc)
return rc;
- if (value) {
- hdev->clock_gating = 1;
- if (hdev->asic_funcs->enable_clock_gating)
- hdev->asic_funcs->enable_clock_gating(hdev);
- } else {
- if (hdev->asic_funcs->disable_clock_gating)
- hdev->asic_funcs->disable_clock_gating(hdev);
- hdev->clock_gating = 0;
- }
+ hdev->clock_gating_mask = value;
+ hdev->asic_funcs->set_clock_gating(hdev);
return count;
}
@@ -1125,7 +1118,7 @@ static const struct hl_info_list hl_debugfs_list[] = {
{"command_submission_jobs", command_submission_jobs_show, NULL},
{"userptr", userptr_show, NULL},
{"vm", vm_show, NULL},
- {"mmu", mmu_show, mmu_write},
+ {"mmu", mmu_show, mmu_asid_va_write},
{"engines", engines_show, NULL}
};
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/common/device.c
index 2b38a119704c..be16b75bdfdb 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -249,7 +249,8 @@ static void device_cdev_sysfs_del(struct hl_device *hdev)
*/
static int device_early_init(struct hl_device *hdev)
{
- int rc;
+ int i, rc;
+ char workq_name[32];
switch (hdev->asic_type) {
case ASIC_GOYA:
@@ -274,11 +275,24 @@ static int device_early_init(struct hl_device *hdev)
if (rc)
goto early_fini;
- hdev->cq_wq = alloc_workqueue("hl-free-jobs", WQ_UNBOUND, 0);
- if (hdev->cq_wq == NULL) {
- dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
- rc = -ENOMEM;
- goto asid_fini;
+ if (hdev->asic_prop.completion_queues_count) {
+ hdev->cq_wq = kcalloc(hdev->asic_prop.completion_queues_count,
+ sizeof(*hdev->cq_wq),
+ GFP_ATOMIC);
+ if (!hdev->cq_wq) {
+ rc = -ENOMEM;
+ goto asid_fini;
+ }
+ }
+
+ for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) {
+ snprintf(workq_name, 32, "hl-free-jobs-%u", i);
+ hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
+ if (hdev->cq_wq == NULL) {
+ dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
+ rc = -ENOMEM;
+ goto free_cq_wq;
+ }
}
hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0);
@@ -321,7 +335,10 @@ free_chip_info:
free_eq_wq:
destroy_workqueue(hdev->eq_wq);
free_cq_wq:
- destroy_workqueue(hdev->cq_wq);
+ for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
+ if (hdev->cq_wq[i])
+ destroy_workqueue(hdev->cq_wq[i]);
+ kfree(hdev->cq_wq);
asid_fini:
hl_asid_fini(hdev);
early_fini:
@@ -339,6 +356,8 @@ early_fini:
*/
static void device_early_fini(struct hl_device *hdev)
{
+ int i;
+
mutex_destroy(&hdev->mmu_cache_lock);
mutex_destroy(&hdev->debug_lock);
mutex_destroy(&hdev->send_cpu_message_lock);
@@ -351,7 +370,10 @@ static void device_early_fini(struct hl_device *hdev)
kfree(hdev->hl_chip_info);
destroy_workqueue(hdev->eq_wq);
- destroy_workqueue(hdev->cq_wq);
+
+ for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
+ destroy_workqueue(hdev->cq_wq[i]);
+ kfree(hdev->cq_wq);
hl_asid_fini(hdev);
@@ -608,7 +630,7 @@ int hl_device_set_debug_mode(struct hl_device *hdev, bool enable)
hdev->in_debug = 0;
if (!hdev->hard_reset_pending)
- hdev->asic_funcs->enable_clock_gating(hdev);
+ hdev->asic_funcs->set_clock_gating(hdev);
goto out;
}
@@ -838,6 +860,22 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
if (rc)
return 0;
+ if (hard_reset) {
+ /* Disable PCI access from device F/W so he won't send
+ * us additional interrupts. We disable MSI/MSI-X at
+ * the halt_engines function and we can't have the F/W
+ * sending us interrupts after that. We need to disable
+ * the access here because if the device is marked
+ * disable, the message won't be send. Also, in case
+ * of heartbeat, the device CPU is marked as disable
+ * so this message won't be sent
+ */
+ if (hl_fw_send_pci_access_msg(hdev,
+ ARMCP_PACKET_DISABLE_PCI_ACCESS))
+ dev_warn(hdev->dev,
+ "Failed to disable PCI access by F/W\n");
+ }
+
/* This also blocks future CS/VM/JOB completion operations */
hdev->disabled = true;
@@ -995,6 +1033,12 @@ again:
}
}
+ /* Device is now enabled as part of the initialization requires
+ * communication with the device firmware to get information that
+ * is required for the initialization itself
+ */
+ hdev->disabled = false;
+
rc = hdev->asic_funcs->hw_init(hdev);
if (rc) {
dev_err(hdev->dev,
@@ -1002,8 +1046,6 @@ again:
goto out_err;
}
- hdev->disabled = false;
-
/* Check that the communication with the device is working */
rc = hdev->asic_funcs->test_queues(hdev);
if (rc) {
@@ -1144,14 +1186,17 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
* because there the addresses of the completion queues are being
* passed as arguments to request_irq
*/
- hdev->completion_queue = kcalloc(cq_cnt,
- sizeof(*hdev->completion_queue),
- GFP_KERNEL);
+ if (cq_cnt) {
+ hdev->completion_queue = kcalloc(cq_cnt,
+ sizeof(*hdev->completion_queue),
+ GFP_KERNEL);
- if (!hdev->completion_queue) {
- dev_err(hdev->dev, "failed to allocate completion queues\n");
- rc = -ENOMEM;
- goto hw_queues_destroy;
+ if (!hdev->completion_queue) {
+ dev_err(hdev->dev,
+ "failed to allocate completion queues\n");
+ rc = -ENOMEM;
+ goto hw_queues_destroy;
+ }
}
for (i = 0, cq_ready_cnt = 0 ; i < cq_cnt ; i++, cq_ready_cnt++) {
@@ -1162,6 +1207,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
"failed to initialize completion queue\n");
goto cq_fini;
}
+ hdev->completion_queue[i].cq_idx = i;
}
/*
@@ -1219,6 +1265,12 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
*/
add_cdev_sysfs_on_err = true;
+ /* Device is now enabled as part of the initialization requires
+ * communication with the device firmware to get information that
+ * is required for the initialization itself
+ */
+ hdev->disabled = false;
+
rc = hdev->asic_funcs->hw_init(hdev);
if (rc) {
dev_err(hdev->dev, "failed to initialize the H/W\n");
@@ -1226,8 +1278,6 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
goto out_disabled;
}
- hdev->disabled = false;
-
/* Check that the communication with the device is working */
rc = hdev->asic_funcs->test_queues(hdev);
if (rc) {
diff --git a/drivers/misc/habanalabs/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index baf790cf4b78..f70302cdab1b 100644
--- a/drivers/misc/habanalabs/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -6,7 +6,7 @@
*/
#include "habanalabs.h"
-#include "include/hl_boot_if.h"
+#include "../include/common/hl_boot_if.h"
#include <linux/firmware.h>
#include <linux/genalloc.h>
@@ -15,7 +15,10 @@
/**
* hl_fw_load_fw_to_device() - Load F/W code to device's memory.
+ *
* @hdev: pointer to hl_device structure.
+ * @fw_name: the firmware image name
+ * @dst: IO memory mapped address space to copy firmware to
*
* Copy fw code from firmware file to device memory.
*
@@ -61,7 +64,7 @@ int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
pkt.ctl = cpu_to_le32(opcode << ARMCP_PKT_CTL_OPCODE_SHIFT);
return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt,
- sizeof(pkt), HL_DEVICE_TIMEOUT_USEC, NULL);
+ sizeof(pkt), 0, NULL);
}
int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
@@ -144,7 +147,7 @@ int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
pkt.value = cpu_to_le64(event_type);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- HL_DEVICE_TIMEOUT_USEC, &result);
+ 0, &result);
if (rc)
dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);
@@ -183,7 +186,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
ARMCP_PKT_CTL_OPCODE_SHIFT);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
- total_pkt_size, HL_DEVICE_TIMEOUT_USEC, &result);
+ total_pkt_size, 0, &result);
if (rc)
dev_err(hdev->dev, "failed to unmask IRQ array\n");
@@ -204,7 +207,7 @@ int hl_fw_test_cpu_queue(struct hl_device *hdev)
test_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &test_pkt,
- sizeof(test_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
+ sizeof(test_pkt), 0, &result);
if (!rc) {
if (result != ARMCP_PACKET_FENCE_VAL)
@@ -248,7 +251,7 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
hb_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt,
- sizeof(hb_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
+ sizeof(hb_pkt), 0, &result);
if ((rc) || (result != ARMCP_PACKET_FENCE_VAL))
rc = -EIO;
@@ -286,7 +289,7 @@ int hl_fw_armcp_info_get(struct hl_device *hdev)
HL_ARMCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
dev_err(hdev->dev,
- "Failed to send ArmCP info pkt, error %d\n", rc);
+ "Failed to handle ArmCP info pkt, error %d\n", rc);
goto out;
}
@@ -337,7 +340,7 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
if (rc) {
dev_err(hdev->dev,
- "Failed to send ArmCP EEPROM packet, error %d\n", rc);
+ "Failed to handle ArmCP EEPROM packet, error %d\n", rc);
goto out;
}
@@ -390,6 +393,53 @@ static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg)
"Device boot error - NIC F/W initialization failed\n");
}
+static void hl_detect_cpu_boot_status(struct hl_device *hdev, u32 status)
+{
+ switch (status) {
+ case CPU_BOOT_STATUS_NA:
+ dev_err(hdev->dev,
+ "Device boot error - BTL did NOT run\n");
+ break;
+ case CPU_BOOT_STATUS_IN_WFE:
+ dev_err(hdev->dev,
+ "Device boot error - Stuck inside WFE loop\n");
+ break;
+ case CPU_BOOT_STATUS_IN_BTL:
+ dev_err(hdev->dev,
+ "Device boot error - Stuck in BTL\n");
+ break;
+ case CPU_BOOT_STATUS_IN_PREBOOT:
+ dev_err(hdev->dev,
+ "Device boot error - Stuck in Preboot\n");
+ break;
+ case CPU_BOOT_STATUS_IN_SPL:
+ dev_err(hdev->dev,
+ "Device boot error - Stuck in SPL\n");
+ break;
+ case CPU_BOOT_STATUS_IN_UBOOT:
+ dev_err(hdev->dev,
+ "Device boot error - Stuck in u-boot\n");
+ break;
+ case CPU_BOOT_STATUS_DRAM_INIT_FAIL:
+ dev_err(hdev->dev,
+ "Device boot error - DRAM initialization failed\n");
+ break;
+ case CPU_BOOT_STATUS_UBOOT_NOT_READY:
+ dev_err(hdev->dev,
+ "Device boot error - u-boot stopped by user\n");
+ break;
+ case CPU_BOOT_STATUS_TS_INIT_FAIL:
+ dev_err(hdev->dev,
+ "Device boot error - Thermal Sensor initialization failed\n");
+ break;
+ default:
+ dev_err(hdev->dev,
+ "Device boot error - Invalid status code %d\n",
+ status);
+ break;
+ }
+}
+
int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
u32 msg_to_cpu_reg, u32 cpu_msg_status_reg,
u32 boot_err0_reg, bool skip_bmc,
@@ -463,50 +513,7 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
* versions but we keep them here for backward compatibility
*/
if (rc) {
- switch (status) {
- case CPU_BOOT_STATUS_NA:
- dev_err(hdev->dev,
- "Device boot error - BTL did NOT run\n");
- break;
- case CPU_BOOT_STATUS_IN_WFE:
- dev_err(hdev->dev,
- "Device boot error - Stuck inside WFE loop\n");
- break;
- case CPU_BOOT_STATUS_IN_BTL:
- dev_err(hdev->dev,
- "Device boot error - Stuck in BTL\n");
- break;
- case CPU_BOOT_STATUS_IN_PREBOOT:
- dev_err(hdev->dev,
- "Device boot error - Stuck in Preboot\n");
- break;
- case CPU_BOOT_STATUS_IN_SPL:
- dev_err(hdev->dev,
- "Device boot error - Stuck in SPL\n");
- break;
- case CPU_BOOT_STATUS_IN_UBOOT:
- dev_err(hdev->dev,
- "Device boot error - Stuck in u-boot\n");
- break;
- case CPU_BOOT_STATUS_DRAM_INIT_FAIL:
- dev_err(hdev->dev,
- "Device boot error - DRAM initialization failed\n");
- break;
- case CPU_BOOT_STATUS_UBOOT_NOT_READY:
- dev_err(hdev->dev,
- "Device boot error - u-boot stopped by user\n");
- break;
- case CPU_BOOT_STATUS_TS_INIT_FAIL:
- dev_err(hdev->dev,
- "Device boot error - Thermal Sensor initialization failed\n");
- break;
- default:
- dev_err(hdev->dev,
- "Device boot error - Invalid status code %d\n",
- status);
- break;
- }
-
+ hl_detect_cpu_boot_status(hdev, status);
rc = -EIO;
goto out;
}
@@ -566,7 +573,8 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
"Device reports FIT image is corrupted\n");
else
dev_err(hdev->dev,
- "Device failed to load, %d\n", status);
+ "Failed to load firmware to device, %d\n",
+ status);
rc = -EIO;
goto out;
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 1ecdcf8b763a..018d9d67e8e6 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -8,8 +8,9 @@
#ifndef HABANALABSP_H_
#define HABANALABSP_H_
-#include "include/armcp_if.h"
-#include "include/qman_if.h"
+#include "../include/common/armcp_if.h"
+#include "../include/common/qman_if.h"
+#include <uapi/misc/habanalabs.h>
#include <linux/cdev.h>
#include <linux/iopoll.h>
@@ -40,11 +41,6 @@
#define HL_SIM_MAX_TIMEOUT_US 10000000 /* 10s */
-#define HL_MAX_QUEUES 128
-
-/* MUST BE POWER OF 2 and larger than 1 */
-#define HL_MAX_PENDING_CS 64
-
#define HL_IDLE_BUSY_TS_ARR_SIZE 4096
/* Memory */
@@ -53,6 +49,10 @@
/* MMU */
#define MMU_HASH_TABLE_BITS 7 /* 1 << 7 buckets */
+/*
+ * HL_RSVD_SOBS 'sync stream' reserved sync objects per QMAN stream
+ * HL_RSVD_MONS 'sync stream' reserved monitors per QMAN stream
+ */
#define HL_RSVD_SOBS 4
#define HL_RSVD_MONS 2
@@ -61,6 +61,11 @@
#define HL_MAX_SOB_VAL (1 << 15)
+#define IS_POWER_OF_2(n) (n != 0 && ((n & (n - 1)) == 0))
+#define IS_MAX_PENDING_CS_VALID(n) (IS_POWER_OF_2(n) && (n > 1))
+
+#define HL_PCI_NUM_BARS 6
+
/**
* struct pgt_info - MMU hop page info.
* @node: hash linked-list node for the pgts shadow hash of pgts.
@@ -86,6 +91,16 @@ struct hl_device;
struct hl_fpriv;
/**
+ * enum hl_pci_match_mode - pci match mode per region
+ * @PCI_ADDRESS_MATCH_MODE: address match mode
+ * @PCI_BAR_MATCH_MODE: bar match mode
+ */
+enum hl_pci_match_mode {
+ PCI_ADDRESS_MATCH_MODE,
+ PCI_BAR_MATCH_MODE
+};
+
+/**
* enum hl_fw_component - F/W components to read version through registers.
* @FW_COMP_UBOOT: u-boot.
* @FW_COMP_PREBOOT: preboot.
@@ -121,6 +136,32 @@ enum hl_cs_type {
};
/*
+ * struct hl_inbound_pci_region - inbound region descriptor
+ * @mode: pci match mode for this region
+ * @addr: region target address
+ * @size: region size in bytes
+ * @offset_in_bar: offset within bar (address match mode)
+ * @bar: bar id
+ */
+struct hl_inbound_pci_region {
+ enum hl_pci_match_mode mode;
+ u64 addr;
+ u64 size;
+ u64 offset_in_bar;
+ u8 bar;
+};
+
+/*
+ * struct hl_outbound_pci_region - outbound region descriptor
+ * @addr: region target address
+ * @size: region size in bytes
+ */
+struct hl_outbound_pci_region {
+ u64 addr;
+ u64 size;
+};
+
+/*
* struct hl_hw_sob - H/W SOB info.
* @hdev: habanalabs device structure.
* @kref: refcount of this SOB. The SOB will reset once the refcount is zero.
@@ -141,11 +182,13 @@ struct hl_hw_sob {
* false otherwise.
* @requires_kernel_cb: true if a CB handle must be provided for jobs on this
* queue, false otherwise (a CB address must be provided).
+ * @supports_sync_stream: True if queue supports sync stream
*/
struct hw_queue_properties {
enum hl_queue_type type;
u8 driver_only;
u8 requires_kernel_cb;
+ u8 supports_sync_stream;
};
/**
@@ -241,14 +284,19 @@ struct hl_mmu_properties {
* @psoc_pci_pll_nf: PCI PLL NF value.
* @psoc_pci_pll_od: PCI PLL OD value.
* @psoc_pci_pll_div_factor: PCI PLL DIV FACTOR 1 value.
+ * @psoc_timestamp_frequency: frequency of the psoc timestamp clock.
* @high_pll: high PLL frequency used by the device.
* @cb_pool_cb_cnt: number of CBs in the CB pool.
* @cb_pool_cb_size: size of each CB in the CB pool.
+ * @max_pending_cs: maximum of concurrent pending command submissions
+ * @max_queues: maximum amount of queues in the system
+ * @sync_stream_first_sob: first sync object available for sync stream use
+ * @sync_stream_first_mon: first monitor available for sync stream use
* @tpc_enabled_mask: which TPCs are enabled.
* @completion_queues_count: number of completion queues.
*/
struct asic_fixed_properties {
- struct hw_queue_properties hw_queues_props[HL_MAX_QUEUES];
+ struct hw_queue_properties *hw_queues_props;
struct armcp_info armcp_info;
char uboot_ver[VERSION_MAX_LEN];
char preboot_ver[VERSION_MAX_LEN];
@@ -282,9 +330,14 @@ struct asic_fixed_properties {
u32 psoc_pci_pll_nf;
u32 psoc_pci_pll_od;
u32 psoc_pci_pll_div_factor;
+ u32 psoc_timestamp_frequency;
u32 high_pll;
u32 cb_pool_cb_cnt;
u32 cb_pool_cb_size;
+ u32 max_pending_cs;
+ u32 max_queues;
+ u16 sync_stream_first_sob;
+ u16 sync_stream_first_mon;
u8 tpc_enabled_mask;
u8 completion_queues_count;
};
@@ -339,6 +392,7 @@ struct hl_cb_mgr {
* @ctx_id: holds the ID of the owner's context.
* @mmap: true if the CB is currently mmaped to user.
* @is_pool: true if CB was acquired from the pool, false otherwise.
+ * @is_internal: internaly allocated
*/
struct hl_cb {
struct kref refcount;
@@ -355,6 +409,7 @@ struct hl_cb {
u32 ctx_id;
u8 mmap;
u8 is_pool;
+ u8 is_internal;
};
@@ -364,38 +419,19 @@ struct hl_cb {
struct hl_cs_job;
-/*
- * Currently, there are two limitations on the maximum length of a queue:
- *
- * 1. The memory footprint of the queue. The current allocated space for the
- * queue is PAGE_SIZE. Because each entry in the queue is HL_BD_SIZE,
- * the maximum length of the queue can be PAGE_SIZE / HL_BD_SIZE,
- * which currently is 4096/16 = 256 entries.
- *
- * To increase that, we need either to decrease the size of the
- * BD (difficult), or allocate more than a single page (easier).
- *
- * 2. Because the size of the JOB handle field in the BD CTL / completion queue
- * is 10-bit, we can have up to 1024 open jobs per hardware queue.
- * Therefore, each queue can hold up to 1024 entries.
- *
- * HL_QUEUE_LENGTH is in units of struct hl_bd.
- * HL_QUEUE_LENGTH * sizeof(struct hl_bd) should be <= HL_PAGE_SIZE
- */
-
-#define HL_PAGE_SIZE 4096 /* minimum page size */
-/* Must be power of 2 (HL_PAGE_SIZE / HL_BD_SIZE) */
-#define HL_QUEUE_LENGTH 256
+/* Queue length of external and HW queues */
+#define HL_QUEUE_LENGTH 4096
#define HL_QUEUE_SIZE_IN_BYTES (HL_QUEUE_LENGTH * HL_BD_SIZE)
-/*
- * HL_CQ_LENGTH is in units of struct hl_cq_entry.
- * HL_CQ_LENGTH should be <= HL_PAGE_SIZE
- */
+#if (HL_MAX_JOBS_PER_CS > HL_QUEUE_LENGTH)
+#error "HL_QUEUE_LENGTH must be greater than HL_MAX_JOBS_PER_CS"
+#endif
+
+/* HL_CQ_LENGTH is in units of struct hl_cq_entry */
#define HL_CQ_LENGTH HL_QUEUE_LENGTH
#define HL_CQ_SIZE_IN_BYTES (HL_CQ_LENGTH * HL_CQ_ENTRY_SIZE)
-/* Must be power of 2 (HL_PAGE_SIZE / HL_EQ_ENTRY_SIZE) */
+/* Must be power of 2 */
#define HL_EQ_LENGTH 64
#define HL_EQ_SIZE_IN_BYTES (HL_EQ_LENGTH * HL_EQ_ENTRY_SIZE)
@@ -422,6 +458,7 @@ struct hl_cs_job;
* exist).
* @curr_sob_offset: the id offset to the currently used SOB from the
* HL_RSVD_SOBS that are being used by this queue.
+ * @supports_sync_stream: True if queue supports sync stream
*/
struct hl_hw_queue {
struct hl_hw_sob hw_sob[HL_RSVD_SOBS];
@@ -430,7 +467,7 @@ struct hl_hw_queue {
u64 kernel_address;
dma_addr_t bus_address;
u32 pi;
- u32 ci;
+ atomic_t ci;
u32 hw_queue_id;
u32 cq_id;
u32 msi_vec;
@@ -440,6 +477,7 @@ struct hl_hw_queue {
u16 base_mon_id;
u8 valid;
u8 curr_sob_offset;
+ u8 supports_sync_stream;
};
/**
@@ -447,6 +485,7 @@ struct hl_hw_queue {
* @hdev: pointer to the device structure
* @kernel_address: holds the queue's kernel virtual address
* @bus_address: holds the queue's DMA address
+ * @cq_idx: completion queue index in array
* @hw_queue_id: the id of the matching H/W queue
* @ci: ci inside the queue
* @pi: pi inside the queue
@@ -456,6 +495,7 @@ struct hl_cq {
struct hl_device *hdev;
u64 kernel_address;
dma_addr_t bus_address;
+ u32 cq_idx;
u32 hw_queue_id;
u32 ci;
u32 pi;
@@ -519,6 +559,15 @@ enum hl_pll_frequency {
PLL_LAST
};
+#define PLL_REF_CLK 50
+
+enum div_select_defs {
+ DIV_SEL_REF_CLK = 0,
+ DIV_SEL_PLL_CLK = 1,
+ DIV_SEL_DIVIDED_REF = 2,
+ DIV_SEL_DIVIDED_PLL = 3,
+};
+
/**
* struct hl_asic_funcs - ASIC specific functions that are can be called from
* common code.
@@ -578,8 +627,9 @@ enum hl_pll_frequency {
* @mmu_invalidate_cache_range: flush specific MMU STLB cache lines with
* ASID-VA-size mask.
* @send_heartbeat: send is-alive packet to ArmCP and verify response.
- * @enable_clock_gating: enable clock gating for reducing power consumption.
- * @disable_clock_gating: disable clock for accessing registers on HBW.
+ * @set_clock_gating: enable/disable clock gating per engine according to
+ * clock gating mask in hdev
+ * @disable_clock_gating: disable clock gating completely
* @debug_coresight: perform certain actions on Coresight for debugging.
* @is_device_idle: return true if device is idle, false otherwise.
* @soft_reset_late_init: perform certain actions needed after soft reset.
@@ -587,7 +637,11 @@ enum hl_pll_frequency {
* @hw_queues_unlock: release H/W queues lock.
* @get_pci_id: retrieve PCI ID.
* @get_eeprom_data: retrieve EEPROM data from F/W.
- * @send_cpu_message: send buffer to ArmCP.
+ * @send_cpu_message: send message to F/W. If the message is timedout, the
+ * driver will eventually reset the device. The timeout can
+ * be determined by the calling function or it can be 0 and
+ * then the timeout is the default timeout for the specific
+ * ASIC
* @get_hw_state: retrieve the H/W state
* @pci_bars_map: Map PCI BARs.
* @set_dram_bar_base: Set DRAM BAR to map specific device address. Returns
@@ -596,14 +650,13 @@ enum hl_pll_frequency {
* @rreg: Read a register. Needed for simulator support.
* @wreg: Write a register. Needed for simulator support.
* @halt_coresight: stop the ETF and ETR traces.
+ * @ctx_init: context dependent initialization.
* @get_clk_rate: Retrieve the ASIC current and maximum clock rate in MHz
* @get_queue_id_for_cq: Get the H/W queue id related to the given CQ index.
* @read_device_fw_version: read the device's firmware versions that are
* contained in registers
* @load_firmware_to_device: load the firmware to the device's memory
* @load_boot_fit_to_device: load boot fit to device's memory
- * @ext_queue_init: Initialize the given external queue.
- * @ext_queue_reset: Reset the given external queue.
* @get_signal_cb_size: Get signal CB size.
* @get_wait_cb_size: Get wait CB size.
* @gen_signal_cb: Generate a signal CB.
@@ -680,7 +733,7 @@ struct hl_asic_funcs {
int (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard,
u32 asid, u64 va, u64 size);
int (*send_heartbeat)(struct hl_device *hdev);
- void (*enable_clock_gating)(struct hl_device *hdev);
+ void (*set_clock_gating)(struct hl_device *hdev);
void (*disable_clock_gating)(struct hl_device *hdev);
int (*debug_coresight)(struct hl_device *hdev, void *data);
bool (*is_device_idle)(struct hl_device *hdev, u32 *mask,
@@ -700,14 +753,13 @@ struct hl_asic_funcs {
u32 (*rreg)(struct hl_device *hdev, u32 reg);
void (*wreg)(struct hl_device *hdev, u32 reg, u32 val);
void (*halt_coresight)(struct hl_device *hdev);
+ int (*ctx_init)(struct hl_ctx *ctx);
int (*get_clk_rate)(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk);
u32 (*get_queue_id_for_cq)(struct hl_device *hdev, u32 cq_idx);
void (*read_device_fw_version)(struct hl_device *hdev,
enum hl_fw_component fwc);
int (*load_firmware_to_device)(struct hl_device *hdev);
int (*load_boot_fit_to_device)(struct hl_device *hdev);
- void (*ext_queue_init)(struct hl_device *hdev, u32 hw_queue_id);
- void (*ext_queue_reset)(struct hl_device *hdev, u32 hw_queue_id);
u32 (*get_signal_cb_size)(struct hl_device *hdev);
u32 (*get_wait_cb_size)(struct hl_device *hdev);
void (*gen_signal_cb)(struct hl_device *hdev, void *data, u16 sob_id);
@@ -743,7 +795,6 @@ struct hl_va_range {
* struct hl_ctx - user/kernel context.
* @mem_hash: holds mapping from virtual address to virtual memory area
* descriptor (hl_vm_phys_pg_list or hl_userptr).
- * @mmu_phys_hash: holds a mapping from physical address to pgt_info structure.
* @mmu_shadow_hash: holds a mapping from shadow address to pgt_info structure.
* @hpriv: pointer to the private (Kernel Driver) data of the process (fd).
* @hdev: pointer to the device structure.
@@ -777,18 +828,18 @@ struct hl_va_range {
*/
struct hl_ctx {
DECLARE_HASHTABLE(mem_hash, MEM_HASH_TABLE_BITS);
- DECLARE_HASHTABLE(mmu_phys_hash, MMU_HASH_TABLE_BITS);
DECLARE_HASHTABLE(mmu_shadow_hash, MMU_HASH_TABLE_BITS);
struct hl_fpriv *hpriv;
struct hl_device *hdev;
struct kref refcount;
- struct dma_fence *cs_pending[HL_MAX_PENDING_CS];
+ struct dma_fence **cs_pending;
struct hl_va_range *host_va_range;
struct hl_va_range *host_huge_va_range;
struct hl_va_range *dram_va_range;
struct mutex mem_hash_lock;
struct mutex mmu_lock;
struct list_head debugfs_list;
+ struct hl_cs_counters cs_counters;
u64 cs_sequence;
u64 *dram_default_hops;
spinlock_t cs_lock;
@@ -863,7 +914,7 @@ struct hl_userptr {
* @aborted: true if CS was aborted due to some device error.
*/
struct hl_cs {
- u16 jobs_in_queue_cnt[HL_MAX_QUEUES];
+ u16 *jobs_in_queue_cnt;
struct hl_ctx *ctx;
struct list_head job_list;
spinlock_t job_lock;
@@ -1347,7 +1398,9 @@ struct hl_device_idle_busy_ts {
/**
* struct hl_device - habanalabs device structure.
* @pdev: pointer to PCI device, can be NULL in case of simulator device.
- * @pcie_bar: array of available PCIe bars.
+ * @pcie_bar_phys: array of available PCIe bars physical addresses.
+ * (required only for PCI address match mode)
+ * @pcie_bar: array of available PCIe bars virtual addresses.
* @rmmio: configuration area address on SRAM.
* @cdev: related char device.
* @cdev_ctrl: char device for control operations only (INFO IOCTL)
@@ -1358,7 +1411,8 @@ struct hl_device_idle_busy_ts {
* @asic_name: ASIC specific nmae.
* @asic_type: ASIC specific type.
* @completion_queue: array of hl_cq.
- * @cq_wq: work queue of completion queues for executing work in process context
+ * @cq_wq: work queues of completion queues for executing work in process
+ * context.
* @eq_wq: work queue of event queue for executing work in process context.
* @kernel_ctx: Kernel driver context structure.
* @kernel_queues: array of hl_hw_queue.
@@ -1387,17 +1441,25 @@ struct hl_device_idle_busy_ts {
* @hl_debugfs: device's debugfs manager.
* @cb_pool: list of preallocated CBs.
* @cb_pool_lock: protects the CB pool.
+ * @internal_cb_pool_virt_addr: internal command buffer pool virtual address.
+ * @internal_cb_pool_dma_addr: internal command buffer pool dma address.
+ * @internal_cb_pool: internal command buffer memory pool.
+ * @internal_cb_va_base: internal cb pool mmu virtual address base
* @fpriv_list: list of file private data structures. Each structure is created
* when a user opens the device
* @fpriv_list_lock: protects the fpriv_list
* @compute_ctx: current compute context executing.
* @idle_busy_ts_arr: array to hold time stamps of transitions from idle to busy
* and vice-versa
+ * @aggregated_cs_counters: aggregated cs counters among all contexts
* @dram_used_mem: current DRAM memory consumption.
* @timeout_jiffies: device CS timeout value.
* @max_power: the max power of the device, as configured by the sysadmin. This
* value is saved so in case of hard-reset, the driver will restore
* this value and update the F/W after the re-initialization
+ * @clock_gating_mask: is clock gating enabled. bitmask that represents the
+ * different engines. See debugfs-driver-habanalabs for
+ * details.
* @in_reset: is device in reset flow.
* @curr_pll_profile: current PLL profile.
* @cs_active_cnt: number of active command submissions on this device (active
@@ -1425,7 +1487,6 @@ struct hl_device_idle_busy_ts {
* @init_done: is the initialization of the device done.
* @mmu_enable: is MMU enabled.
* @mmu_huge_page_opt: is MMU huge pages optimization enabled.
- * @clock_gating: is clock gating enabled.
* @device_cpu_disabled: is the device CPU disabled (due to timeouts)
* @dma_mask: the dma mask that was set for this device
* @in_debug: is device under debug. This, together with fpriv_list, enforces
@@ -1435,12 +1496,14 @@ struct hl_device_idle_busy_ts {
* @cdev_sysfs_created: were char devices and sysfs nodes created.
* @stop_on_err: true if engines should stop on error.
* @supports_sync_stream: is sync stream supported.
+ * @sync_stream_queue_idx: helper index for sync stream queues initialization.
* @supports_coresight: is CoreSight supported.
* @supports_soft_reset: is soft reset supported.
*/
struct hl_device {
struct pci_dev *pdev;
- void __iomem *pcie_bar[6];
+ u64 pcie_bar_phys[HL_PCI_NUM_BARS];
+ void __iomem *pcie_bar[HL_PCI_NUM_BARS];
void __iomem *rmmio;
struct cdev cdev;
struct cdev cdev_ctrl;
@@ -1451,7 +1514,7 @@ struct hl_device {
char asic_name[16];
enum hl_asic_type asic_type;
struct hl_cq *completion_queue;
- struct workqueue_struct *cq_wq;
+ struct workqueue_struct **cq_wq;
struct workqueue_struct *eq_wq;
struct hl_ctx *kernel_ctx;
struct hl_hw_queue *kernel_queues;
@@ -1483,6 +1546,11 @@ struct hl_device {
struct list_head cb_pool;
spinlock_t cb_pool_lock;
+ void *internal_cb_pool_virt_addr;
+ dma_addr_t internal_cb_pool_dma_addr;
+ struct gen_pool *internal_cb_pool;
+ u64 internal_cb_va_base;
+
struct list_head fpriv_list;
struct mutex fpriv_list_lock;
@@ -1490,9 +1558,12 @@ struct hl_device {
struct hl_device_idle_busy_ts *idle_busy_ts_arr;
+ struct hl_cs_counters aggregated_cs_counters;
+
atomic64_t dram_used_mem;
u64 timeout_jiffies;
u64 max_power;
+ u64 clock_gating_mask;
atomic_t in_reset;
enum hl_pll_frequency curr_pll_profile;
int cs_active_cnt;
@@ -1514,7 +1585,6 @@ struct hl_device {
u8 dram_default_page_mapping;
u8 pmmu_huge_range;
u8 init_done;
- u8 clock_gating;
u8 device_cpu_disabled;
u8 dma_mask;
u8 in_debug;
@@ -1522,6 +1592,7 @@ struct hl_device {
u8 cdev_sysfs_created;
u8 stop_on_err;
u8 supports_sync_stream;
+ u8 sync_stream_queue_idx;
u8 supports_coresight;
u8 supports_soft_reset;
@@ -1690,7 +1761,7 @@ int hl_hwmon_init(struct hl_device *hdev);
void hl_hwmon_fini(struct hl_device *hdev);
int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr, u32 cb_size,
- u64 *handle, int ctx_id);
+ u64 *handle, int ctx_id, bool internal_cb);
int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle);
int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
struct hl_cb *hl_cb_get(struct hl_device *hdev, struct hl_cb_mgr *mgr,
@@ -1698,7 +1769,8 @@ struct hl_cb *hl_cb_get(struct hl_device *hdev, struct hl_cb_mgr *mgr,
void hl_cb_put(struct hl_cb *cb);
void hl_cb_mgr_init(struct hl_cb_mgr *mgr);
void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr);
-struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size);
+struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size,
+ bool internal_cb);
int hl_cb_pool_init(struct hl_device *hdev);
int hl_cb_pool_fini(struct hl_device *hdev);
@@ -1762,9 +1834,10 @@ int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3],
int hl_pci_iatu_write(struct hl_device *hdev, u32 addr, u32 data);
int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar,
u64 addr);
-int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address,
- u64 dram_base_address, u64 host_phys_base_address,
- u64 host_phys_size);
+int hl_pci_set_inbound_region(struct hl_device *hdev, u8 region,
+ struct hl_inbound_pci_region *pci_region);
+int hl_pci_set_outbound_region(struct hl_device *hdev,
+ struct hl_outbound_pci_region *pci_region);
int hl_pci_init(struct hl_device *hdev);
void hl_pci_fini(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index 8652c7e5d7f1..c6b31e93fb5e 100644
--- a/drivers/misc/habanalabs/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -232,13 +232,12 @@ static void set_driver_behavior_per_device(struct hl_device *hdev)
hdev->fw_loading = 1;
hdev->cpu_queues_enable = 1;
hdev->heartbeat = 1;
- hdev->clock_gating = 1;
+ hdev->clock_gating_mask = ULONG_MAX;
hdev->reset_pcilink = 0;
hdev->axi_drain = 0;
hdev->sram_scrambler_enable = 1;
hdev->dram_scrambler_enable = 1;
- hdev->rl_enable = 1;
hdev->bmc_enable = 1;
hdev->hard_reset_on_fw_events = 1;
}
diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index 52eedd3a6c3a..5af1c03da473 100644
--- a/drivers/misc/habanalabs/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -276,6 +276,27 @@ static int time_sync_info(struct hl_device *hdev, struct hl_info_args *args)
min((size_t) max_size, sizeof(time_sync))) ? -EFAULT : 0;
}
+static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+ struct hl_device *hdev = hpriv->hdev;
+ struct hl_info_cs_counters cs_counters = {0};
+ u32 max_size = args->return_size;
+ void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+ if ((!max_size) || (!out))
+ return -EINVAL;
+
+ memcpy(&cs_counters.cs_counters, &hdev->aggregated_cs_counters,
+ sizeof(struct hl_cs_counters));
+
+ if (hpriv->ctx)
+ memcpy(&cs_counters.ctx_cs_counters, &hpriv->ctx->cs_counters,
+ sizeof(struct hl_cs_counters));
+
+ return copy_to_user(out, &cs_counters,
+ min((size_t) max_size, sizeof(cs_counters))) ? -EFAULT : 0;
+}
+
static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
struct device *dev)
{
@@ -336,6 +357,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_TIME_SYNC:
return time_sync_info(hdev, args);
+ case HL_INFO_CS_COUNTERS:
+ return cs_counters_info(hpriv, args);
+
default:
dev_err(dev, "Invalid request %d\n", args->op);
rc = -ENOTTY;
diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c
index f4434b39ef1b..287681646071 100644
--- a/drivers/misc/habanalabs/hw_queue.c
+++ b/drivers/misc/habanalabs/common/hw_queue.c
@@ -23,10 +23,14 @@ inline u32 hl_hw_queue_add_ptr(u32 ptr, u16 val)
ptr &= ((HL_QUEUE_LENGTH << 1) - 1);
return ptr;
}
+static inline int queue_ci_get(atomic_t *ci, u32 queue_len)
+{
+ return atomic_read(ci) & ((queue_len << 1) - 1);
+}
static inline int queue_free_slots(struct hl_hw_queue *q, u32 queue_len)
{
- int delta = (q->pi - q->ci);
+ int delta = (q->pi - queue_ci_get(&q->ci, queue_len));
if (delta >= 0)
return (queue_len - delta);
@@ -40,21 +44,14 @@ void hl_int_hw_queue_update_ci(struct hl_cs *cs)
struct hl_hw_queue *q;
int i;
- hdev->asic_funcs->hw_queues_lock(hdev);
-
if (hdev->disabled)
- goto out;
+ return;
q = &hdev->kernel_queues[0];
- for (i = 0 ; i < HL_MAX_QUEUES ; i++, q++) {
- if (q->queue_type == QUEUE_TYPE_INT) {
- q->ci += cs->jobs_in_queue_cnt[i];
- q->ci &= ((q->int_queue_len << 1) - 1);
- }
+ for (i = 0 ; i < hdev->asic_prop.max_queues ; i++, q++) {
+ if (q->queue_type == QUEUE_TYPE_INT)
+ atomic_add(cs->jobs_in_queue_cnt[i], &q->ci);
}
-
-out:
- hdev->asic_funcs->hw_queues_unlock(hdev);
}
/*
@@ -161,6 +158,13 @@ static int int_queue_sanity_checks(struct hl_device *hdev,
{
int free_slots_cnt;
+ if (num_of_entries > q->int_queue_len) {
+ dev_err(hdev->dev,
+ "Cannot populate queue %u with %u jobs\n",
+ q->hw_queue_id, num_of_entries);
+ return -ENOMEM;
+ }
+
/* Check we have enough space in the queue */
free_slots_cnt = queue_free_slots(q, q->int_queue_len);
@@ -174,38 +178,26 @@ static int int_queue_sanity_checks(struct hl_device *hdev,
}
/*
- * hw_queue_sanity_checks() - Perform some sanity checks on a H/W queue.
+ * hw_queue_sanity_checks() - Make sure we have enough space in the h/w queue
* @hdev: Pointer to hl_device structure.
* @q: Pointer to hl_hw_queue structure.
* @num_of_entries: How many entries to check for space.
*
- * Perform the following:
- * - Make sure we have enough space in the completion queue.
- * This check also ensures that there is enough space in the h/w queue, as
- * both queues are of the same size.
- * - Reserve space in the completion queue (needs to be reversed if there
- * is a failure down the road before the actual submission of work).
+ * Notice: We do not reserve queue entries so this function mustn't be called
+ * more than once per CS for the same queue
*
- * Both operations are done using the "free_slots_cnt" field of the completion
- * queue. The CI counters of the queue and the completion queue are not
- * needed/used for the H/W queue type.
*/
static int hw_queue_sanity_checks(struct hl_device *hdev, struct hl_hw_queue *q,
int num_of_entries)
{
- atomic_t *free_slots =
- &hdev->completion_queue[q->cq_id].free_slots_cnt;
+ int free_slots_cnt;
- /*
- * Check we have enough space in the completion queue.
- * Add -1 to counter (decrement) unless counter was already 0.
- * In that case, CQ is full so we can't submit a new CB.
- * atomic_add_unless will return 0 if counter was already 0.
- */
- if (atomic_add_negative(num_of_entries * -1, free_slots)) {
- dev_dbg(hdev->dev, "No space for %d entries on CQ %d\n",
- num_of_entries, q->hw_queue_id);
- atomic_add(num_of_entries, free_slots);
+ /* Check we have enough space in the queue */
+ free_slots_cnt = queue_free_slots(q, HL_QUEUE_LENGTH);
+
+ if (free_slots_cnt < num_of_entries) {
+ dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n",
+ q->hw_queue_id, num_of_entries);
return -EAGAIN;
}
@@ -366,7 +358,6 @@ static void hw_queue_schedule_job(struct hl_cs_job *job)
{
struct hl_device *hdev = job->cs->ctx->hdev;
struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
- struct hl_cq *cq;
u64 ptr;
u32 offset, ctl, len;
@@ -376,7 +367,7 @@ static void hw_queue_schedule_job(struct hl_cs_job *job)
* write address offset in the SM block (QMAN LBW message).
* The write address offset is calculated as "COMP_OFFSET << 2".
*/
- offset = job->cs->sequence & (HL_MAX_PENDING_CS - 1);
+ offset = job->cs->sequence & (hdev->asic_prop.max_pending_cs - 1);
ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) |
((q->pi << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK);
@@ -395,17 +386,6 @@ static void hw_queue_schedule_job(struct hl_cs_job *job)
else
ptr = (u64) (uintptr_t) job->user_cb;
- /*
- * No need to protect pi_offset because scheduling to the
- * H/W queues is done under the scheduler mutex
- *
- * No need to check if CQ is full because it was already
- * checked in hw_queue_sanity_checks
- */
- cq = &hdev->completion_queue[q->cq_id];
-
- cq->pi = hl_cq_inc_ptr(cq->pi);
-
ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
}
@@ -509,19 +489,23 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
struct hl_device *hdev = ctx->hdev;
struct hl_cs_job *job, *tmp;
struct hl_hw_queue *q;
+ u32 max_queues;
int rc = 0, i, cq_cnt;
hdev->asic_funcs->hw_queues_lock(hdev);
if (hl_device_disabled_or_in_reset(hdev)) {
+ ctx->cs_counters.device_in_reset_drop_cnt++;
dev_err(hdev->dev,
"device is disabled or in reset, CS rejected!\n");
rc = -EPERM;
goto out;
}
+ max_queues = hdev->asic_prop.max_queues;
+
q = &hdev->kernel_queues[0];
- for (i = 0, cq_cnt = 0 ; i < HL_MAX_QUEUES ; i++, q++) {
+ for (i = 0, cq_cnt = 0 ; i < max_queues ; i++, q++) {
if (cs->jobs_in_queue_cnt[i]) {
switch (q->queue_type) {
case QUEUE_TYPE_EXT:
@@ -543,11 +527,12 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
break;
}
- if (rc)
+ if (rc) {
+ ctx->cs_counters.queue_full_drop_cnt++;
goto unroll_cq_resv;
+ }
- if (q->queue_type == QUEUE_TYPE_EXT ||
- q->queue_type == QUEUE_TYPE_HW)
+ if (q->queue_type == QUEUE_TYPE_EXT)
cq_cnt++;
}
}
@@ -598,10 +583,9 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
unroll_cq_resv:
q = &hdev->kernel_queues[0];
- for (i = 0 ; (i < HL_MAX_QUEUES) && (cq_cnt > 0) ; i++, q++) {
- if ((q->queue_type == QUEUE_TYPE_EXT ||
- q->queue_type == QUEUE_TYPE_HW) &&
- cs->jobs_in_queue_cnt[i]) {
+ for (i = 0 ; (i < max_queues) && (cq_cnt > 0) ; i++, q++) {
+ if ((q->queue_type == QUEUE_TYPE_EXT) &&
+ (cs->jobs_in_queue_cnt[i])) {
atomic_t *free_slots =
&hdev->completion_queue[i].free_slots_cnt;
atomic_add(cs->jobs_in_queue_cnt[i], free_slots);
@@ -625,7 +609,7 @@ void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id)
{
struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id];
- q->ci = hl_queue_inc_ptr(q->ci);
+ atomic_inc(&q->ci);
}
static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
@@ -660,12 +644,9 @@ static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
}
/* Make sure read/write pointers are initialized to start of queue */
- q->ci = 0;
+ atomic_set(&q->ci, 0);
q->pi = 0;
- if (!is_cpu_queue)
- hdev->asic_funcs->ext_queue_init(hdev, q->hw_queue_id);
-
return 0;
free_queue:
@@ -697,7 +678,7 @@ static int int_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
q->kernel_address = (u64) (uintptr_t) p;
q->pi = 0;
- q->ci = 0;
+ atomic_set(&q->ci, 0);
return 0;
}
@@ -726,12 +707,48 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
q->kernel_address = (u64) (uintptr_t) p;
/* Make sure read/write pointers are initialized to start of queue */
- q->ci = 0;
+ atomic_set(&q->ci, 0);
q->pi = 0;
return 0;
}
+static void sync_stream_queue_init(struct hl_device *hdev, u32 q_idx)
+{
+ struct hl_hw_queue *hw_queue = &hdev->kernel_queues[q_idx];
+ struct asic_fixed_properties *prop = &hdev->asic_prop;
+ struct hl_hw_sob *hw_sob;
+ int sob, queue_idx = hdev->sync_stream_queue_idx++;
+
+ hw_queue->base_sob_id =
+ prop->sync_stream_first_sob + queue_idx * HL_RSVD_SOBS;
+ hw_queue->base_mon_id =
+ prop->sync_stream_first_mon + queue_idx * HL_RSVD_MONS;
+ hw_queue->next_sob_val = 1;
+ hw_queue->curr_sob_offset = 0;
+
+ for (sob = 0 ; sob < HL_RSVD_SOBS ; sob++) {
+ hw_sob = &hw_queue->hw_sob[sob];
+ hw_sob->hdev = hdev;
+ hw_sob->sob_id = hw_queue->base_sob_id + sob;
+ hw_sob->q_idx = q_idx;
+ kref_init(&hw_sob->kref);
+ }
+}
+
+static void sync_stream_queue_reset(struct hl_device *hdev, u32 q_idx)
+{
+ struct hl_hw_queue *hw_queue = &hdev->kernel_queues[q_idx];
+
+ /*
+ * In case we got here due to a stuck CS, the refcnt might be bigger
+ * than 1 and therefore we reset it.
+ */
+ kref_init(&hw_queue->hw_sob[hw_queue->curr_sob_offset].kref);
+ hw_queue->curr_sob_offset = 0;
+ hw_queue->next_sob_val = 1;
+}
+
/*
* queue_init - main initialization function for H/W queue object
*
@@ -747,8 +764,6 @@ static int queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
{
int rc;
- BUILD_BUG_ON(HL_QUEUE_SIZE_IN_BYTES > HL_PAGE_SIZE);
-
q->hw_queue_id = hw_queue_id;
switch (q->queue_type) {
@@ -774,6 +789,9 @@ static int queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
break;
}
+ if (q->supports_sync_stream)
+ sync_stream_queue_init(hdev, q->hw_queue_id);
+
if (rc)
return rc;
@@ -835,7 +853,7 @@ int hl_hw_queues_create(struct hl_device *hdev)
struct hl_hw_queue *q;
int i, rc, q_ready_cnt;
- hdev->kernel_queues = kcalloc(HL_MAX_QUEUES,
+ hdev->kernel_queues = kcalloc(asic->max_queues,
sizeof(*hdev->kernel_queues), GFP_KERNEL);
if (!hdev->kernel_queues) {
@@ -845,9 +863,11 @@ int hl_hw_queues_create(struct hl_device *hdev)
/* Initialize the H/W queues */
for (i = 0, q_ready_cnt = 0, q = hdev->kernel_queues;
- i < HL_MAX_QUEUES ; i++, q_ready_cnt++, q++) {
+ i < asic->max_queues ; i++, q_ready_cnt++, q++) {
q->queue_type = asic->hw_queues_props[i].type;
+ q->supports_sync_stream =
+ asic->hw_queues_props[i].supports_sync_stream;
rc = queue_init(hdev, q, i);
if (rc) {
dev_err(hdev->dev,
@@ -870,9 +890,10 @@ release_queues:
void hl_hw_queues_destroy(struct hl_device *hdev)
{
struct hl_hw_queue *q;
+ u32 max_queues = hdev->asic_prop.max_queues;
int i;
- for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++)
+ for (i = 0, q = hdev->kernel_queues ; i < max_queues ; i++, q++)
queue_fini(hdev, q);
kfree(hdev->kernel_queues);
@@ -881,15 +902,17 @@ void hl_hw_queues_destroy(struct hl_device *hdev)
void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset)
{
struct hl_hw_queue *q;
+ u32 max_queues = hdev->asic_prop.max_queues;
int i;
- for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++) {
+ for (i = 0, q = hdev->kernel_queues ; i < max_queues ; i++, q++) {
if ((!q->valid) ||
((!hard_reset) && (q->queue_type == QUEUE_TYPE_CPU)))
continue;
- q->pi = q->ci = 0;
+ q->pi = 0;
+ atomic_set(&q->ci, 0);
- if (q->queue_type == QUEUE_TYPE_EXT)
- hdev->asic_funcs->ext_queue_reset(hdev, q->hw_queue_id);
+ if (q->supports_sync_stream)
+ sync_stream_queue_reset(hdev, q->hw_queue_id);
}
}
diff --git a/drivers/misc/habanalabs/hwmon.c b/drivers/misc/habanalabs/common/hwmon.c
index 8c6cd77e6af6..b997336fa75f 100644
--- a/drivers/misc/habanalabs/hwmon.c
+++ b/drivers/misc/habanalabs/common/hwmon.c
@@ -10,7 +10,6 @@
#include <linux/pci.h>
#include <linux/hwmon.h>
-#define SENSORS_PKT_TIMEOUT 1000000 /* 1s */
#define HWMON_NR_SENSOR_TYPES (hwmon_pwm + 1)
int hl_build_hwmon_channel_info(struct hl_device *hdev,
@@ -323,7 +322,7 @@ int hl_get_temperature(struct hl_device *hdev,
pkt.type = __cpu_to_le16(attr);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- SENSORS_PKT_TIMEOUT, value);
+ 0, value);
if (rc) {
dev_err(hdev->dev,
@@ -350,7 +349,7 @@ int hl_set_temperature(struct hl_device *hdev,
pkt.value = __cpu_to_le64(value);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- SENSORS_PKT_TIMEOUT, NULL);
+ 0, NULL);
if (rc)
dev_err(hdev->dev,
@@ -374,7 +373,7 @@ int hl_get_voltage(struct hl_device *hdev,
pkt.type = __cpu_to_le16(attr);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- SENSORS_PKT_TIMEOUT, value);
+ 0, value);
if (rc) {
dev_err(hdev->dev,
@@ -400,7 +399,7 @@ int hl_get_current(struct hl_device *hdev,
pkt.type = __cpu_to_le16(attr);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- SENSORS_PKT_TIMEOUT, value);
+ 0, value);
if (rc) {
dev_err(hdev->dev,
@@ -426,7 +425,7 @@ int hl_get_fan_speed(struct hl_device *hdev,
pkt.type = __cpu_to_le16(attr);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- SENSORS_PKT_TIMEOUT, value);
+ 0, value);
if (rc) {
dev_err(hdev->dev,
@@ -452,7 +451,7 @@ int hl_get_pwm_info(struct hl_device *hdev,
pkt.type = __cpu_to_le16(attr);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- SENSORS_PKT_TIMEOUT, value);
+ 0, value);
if (rc) {
dev_err(hdev->dev,
@@ -479,7 +478,7 @@ void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
pkt.value = cpu_to_le64(value);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- SENSORS_PKT_TIMEOUT, NULL);
+ 0, NULL);
if (rc)
dev_err(hdev->dev,
@@ -502,7 +501,7 @@ int hl_set_voltage(struct hl_device *hdev,
pkt.value = __cpu_to_le64(value);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- SENSORS_PKT_TIMEOUT, NULL);
+ 0, NULL);
if (rc)
dev_err(hdev->dev,
@@ -527,7 +526,7 @@ int hl_set_current(struct hl_device *hdev,
pkt.value = __cpu_to_le64(value);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- SENSORS_PKT_TIMEOUT, NULL);
+ 0, NULL);
if (rc)
dev_err(hdev->dev,
diff --git a/drivers/misc/habanalabs/irq.c b/drivers/misc/habanalabs/common/irq.c
index fac65fbd70e8..c8db717023f5 100644
--- a/drivers/misc/habanalabs/irq.c
+++ b/drivers/misc/habanalabs/common/irq.c
@@ -10,11 +10,12 @@
#include <linux/slab.h>
/**
- * This structure is used to schedule work of EQ entry and armcp_reset event
+ * struct hl_eqe_work - This structure is used to schedule work of EQ
+ * entry and armcp_reset event
*
- * @eq_work - workqueue object to run when EQ entry is received
- * @hdev - pointer to device structure
- * @eq_entry - copy of the EQ entry
+ * @eq_work: workqueue object to run when EQ entry is received
+ * @hdev: pointer to device structure
+ * @eq_entry: copy of the EQ entry
*/
struct hl_eqe_work {
struct work_struct eq_work;
@@ -22,7 +23,7 @@ struct hl_eqe_work {
struct hl_eq_entry eq_entry;
};
-/*
+/**
* hl_cq_inc_ptr - increment ci or pi of cq
*
* @ptr: the current ci or pi value of the completion queue
@@ -38,7 +39,7 @@ inline u32 hl_cq_inc_ptr(u32 ptr)
return ptr;
}
-/*
+/**
* hl_eq_inc_ptr - increment ci of eq
*
* @ptr: the current ci value of the event queue
@@ -65,7 +66,7 @@ static void irq_handle_eqe(struct work_struct *work)
kfree(eqe_work);
}
-/*
+/**
* hl_irq_handler_cq - irq handler for completion queue
*
* @irq: irq number
@@ -118,15 +119,10 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg)
if ((shadow_index_valid) && (!hdev->disabled)) {
job = queue->shadow_queue[hl_pi_2_offset(shadow_index)];
- queue_work(hdev->cq_wq, &job->finish_work);
+ queue_work(hdev->cq_wq[cq->cq_idx], &job->finish_work);
}
- /* Update ci of the context's queue. There is no
- * need to protect it with spinlock because this update is
- * done only inside IRQ and there is a different IRQ per
- * queue
- */
- queue->ci = hl_queue_inc_ptr(queue->ci);
+ atomic_inc(&queue->ci);
/* Clear CQ entry ready bit */
cq_entry->data = cpu_to_le32(le32_to_cpu(cq_entry->data) &
@@ -141,7 +137,7 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg)
return IRQ_HANDLED;
}
-/*
+/**
* hl_irq_handler_eq - irq handler for event queue
*
* @irq: irq number
@@ -205,7 +201,7 @@ skip_irq:
return IRQ_HANDLED;
}
-/*
+/**
* hl_cq_init - main initialization function for an cq object
*
* @hdev: pointer to device structure
@@ -219,8 +215,6 @@ int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id)
{
void *p;
- BUILD_BUG_ON(HL_CQ_SIZE_IN_BYTES > HL_PAGE_SIZE);
-
p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, HL_CQ_SIZE_IN_BYTES,
&q->bus_address, GFP_KERNEL | __GFP_ZERO);
if (!p)
@@ -237,7 +231,7 @@ int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id)
return 0;
}
-/*
+/**
* hl_cq_fini - destroy completion queue
*
* @hdev: pointer to device structure
@@ -268,7 +262,7 @@ void hl_cq_reset(struct hl_device *hdev, struct hl_cq *q)
memset((void *) (uintptr_t) q->kernel_address, 0, HL_CQ_SIZE_IN_BYTES);
}
-/*
+/**
* hl_eq_init - main initialization function for an event queue object
*
* @hdev: pointer to device structure
@@ -281,8 +275,6 @@ int hl_eq_init(struct hl_device *hdev, struct hl_eq *q)
{
void *p;
- BUILD_BUG_ON(HL_EQ_SIZE_IN_BYTES > HL_PAGE_SIZE);
-
p = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev,
HL_EQ_SIZE_IN_BYTES,
&q->bus_address);
@@ -296,7 +288,7 @@ int hl_eq_init(struct hl_device *hdev, struct hl_eq *q)
return 0;
}
-/*
+/**
* hl_eq_fini - destroy event queue
*
* @hdev: pointer to device structure
diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/common/memory.c
index 47da84a17719..dce9273e557a 100644
--- a/drivers/misc/habanalabs/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -7,7 +7,7 @@
#include <uapi/misc/habanalabs.h>
#include "habanalabs.h"
-#include "include/hw_ip/mmu/mmu_general.h"
+#include "../include/hw_ip/mmu/mmu_general.h"
#include <linux/uaccess.h>
#include <linux/slab.h>
@@ -1730,8 +1730,7 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx)
*/
if (!hdev->hard_reset_pending && !hash_empty(ctx->mem_hash))
dev_notice(hdev->dev,
- "ctx %d is freed while it has va in use\n",
- ctx->asid);
+ "user released device without removing its memory mappings\n");
hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) {
dev_dbg(hdev->dev,
diff --git a/drivers/misc/habanalabs/mmu.c b/drivers/misc/habanalabs/common/mmu.c
index a290d6b49d78..edcc11d5eaf1 100644
--- a/drivers/misc/habanalabs/mmu.c
+++ b/drivers/misc/habanalabs/common/mmu.c
@@ -6,7 +6,7 @@
*/
#include "habanalabs.h"
-#include "include/hw_ip/mmu/mmu_general.h"
+#include "../include/hw_ip/mmu/mmu_general.h"
#include <linux/genalloc.h>
#include <linux/slab.h>
@@ -502,7 +502,6 @@ int hl_mmu_ctx_init(struct hl_ctx *ctx)
return 0;
mutex_init(&ctx->mmu_lock);
- hash_init(ctx->mmu_phys_hash);
hash_init(ctx->mmu_shadow_hash);
return dram_default_mapping_init(ctx);
diff --git a/drivers/misc/habanalabs/pci.c b/drivers/misc/habanalabs/common/pci.c
index 9f634ef6f5b3..7bd3737571f3 100644
--- a/drivers/misc/habanalabs/pci.c
+++ b/drivers/misc/habanalabs/common/pci.c
@@ -6,16 +6,22 @@
*/
#include "habanalabs.h"
-#include "include/hw_ip/pci/pci_general.h"
+#include "../include/hw_ip/pci/pci_general.h"
#include <linux/pci.h>
+#include <linux/bitfield.h>
#define HL_PLDM_PCI_ELBI_TIMEOUT_MSEC (HL_PCI_ELBI_TIMEOUT_MSEC * 10)
+#define IATU_REGION_CTRL_REGION_EN_MASK BIT(31)
+#define IATU_REGION_CTRL_MATCH_MODE_MASK BIT(30)
+#define IATU_REGION_CTRL_NUM_MATCH_EN_MASK BIT(19)
+#define IATU_REGION_CTRL_BAR_NUM_MASK GENMASK(10, 8)
+
/**
* hl_pci_bars_map() - Map PCI BARs.
* @hdev: Pointer to hl_device structure.
- * @bar_name: Array of BAR names.
+ * @name: Array of BAR names.
* @is_wc: Array with flag per BAR whether a write-combined mapping is needed.
*
* Request PCI regions and map them to kernel virtual addresses.
@@ -61,7 +67,7 @@ err:
return rc;
}
-/*
+/**
* hl_pci_bars_unmap() - Unmap PCI BARS.
* @hdev: Pointer to hl_device structure.
*
@@ -80,9 +86,11 @@ static void hl_pci_bars_unmap(struct hl_device *hdev)
pci_release_regions(pdev);
}
-/*
+/**
* hl_pci_elbi_write() - Write through the ELBI interface.
* @hdev: Pointer to hl_device structure.
+ * @addr: Address to write to
+ * @data: Data to write
*
* Return: 0 on success, negative value for failure.
*/
@@ -140,6 +148,8 @@ static int hl_pci_elbi_write(struct hl_device *hdev, u64 addr, u32 data)
/**
* hl_pci_iatu_write() - iatu write routine.
* @hdev: Pointer to hl_device structure.
+ * @addr: Address to write to
+ * @data: Data to write
*
* Return: 0 on success, negative value for failure.
*/
@@ -161,7 +171,7 @@ int hl_pci_iatu_write(struct hl_device *hdev, u32 addr, u32 data)
return 0;
}
-/*
+/**
* hl_pci_reset_link_through_bridge() - Reset PCI link.
* @hdev: Pointer to hl_device structure.
*/
@@ -183,110 +193,94 @@ static void hl_pci_reset_link_through_bridge(struct hl_device *hdev)
}
/**
- * hl_pci_set_dram_bar_base() - Set DDR BAR to map specific device address.
+ * hl_pci_set_inbound_region() - Configure inbound region
* @hdev: Pointer to hl_device structure.
- * @inbound_region: Inbound region number.
- * @bar: PCI BAR number.
- * @addr: Address in DRAM. Must be aligned to DRAM bar size.
+ * @region: Inbound region number.
+ * @pci_region: Inbound region parameters.
*
- * Configure the iATU so that the DRAM bar will start at the specified address.
+ * Configure the iATU inbound region.
*
* Return: 0 on success, negative value for failure.
*/
-int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar,
- u64 addr)
+int hl_pci_set_inbound_region(struct hl_device *hdev, u8 region,
+ struct hl_inbound_pci_region *pci_region)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
- u32 offset;
- int rc;
+ u64 bar_phys_base, region_base, region_end_address;
+ u32 offset, ctrl_reg_val;
+ int rc = 0;
- switch (inbound_region) {
- case 0:
- offset = 0x100;
- break;
- case 1:
- offset = 0x300;
- break;
- case 2:
- offset = 0x500;
- break;
- default:
- dev_err(hdev->dev, "Invalid inbound region %d\n",
- inbound_region);
- return -EINVAL;
- }
+ /* region offset */
+ offset = (0x200 * region) + 0x100;
+
+ if (pci_region->mode == PCI_ADDRESS_MATCH_MODE) {
+ bar_phys_base = hdev->pcie_bar_phys[pci_region->bar];
+ region_base = bar_phys_base + pci_region->offset_in_bar;
+ region_end_address = region_base + pci_region->size - 1;
- if (bar != 0 && bar != 2 && bar != 4) {
- dev_err(hdev->dev, "Invalid PCI BAR %d\n", bar);
- return -EINVAL;
+ rc |= hl_pci_iatu_write(hdev, offset + 0x8,
+ lower_32_bits(region_base));
+ rc |= hl_pci_iatu_write(hdev, offset + 0xC,
+ upper_32_bits(region_base));
+ rc |= hl_pci_iatu_write(hdev, offset + 0x10,
+ lower_32_bits(region_end_address));
}
/* Point to the specified address */
- rc = hl_pci_iatu_write(hdev, offset + 0x14, lower_32_bits(addr));
- rc |= hl_pci_iatu_write(hdev, offset + 0x18, upper_32_bits(addr));
+ rc = hl_pci_iatu_write(hdev, offset + 0x14,
+ lower_32_bits(pci_region->addr));
+ rc |= hl_pci_iatu_write(hdev, offset + 0x18,
+ upper_32_bits(pci_region->addr));
rc |= hl_pci_iatu_write(hdev, offset + 0x0, 0);
- /* Enable + BAR match + match enable + BAR number */
- rc |= hl_pci_iatu_write(hdev, offset + 0x4, 0xC0080000 | (bar << 8));
+
+ /* Enable + bar/address match + match enable + bar number */
+ ctrl_reg_val = FIELD_PREP(IATU_REGION_CTRL_REGION_EN_MASK, 1);
+ ctrl_reg_val |= FIELD_PREP(IATU_REGION_CTRL_MATCH_MODE_MASK,
+ pci_region->mode);
+ ctrl_reg_val |= FIELD_PREP(IATU_REGION_CTRL_NUM_MATCH_EN_MASK, 1);
+
+ if (pci_region->mode == PCI_BAR_MATCH_MODE)
+ ctrl_reg_val |= FIELD_PREP(IATU_REGION_CTRL_BAR_NUM_MASK,
+ pci_region->bar);
+
+ rc |= hl_pci_iatu_write(hdev, offset + 0x4, ctrl_reg_val);
/* Return the DBI window to the default location */
rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr, 0);
rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr + 4, 0);
if (rc)
- dev_err(hdev->dev, "failed to map DRAM bar to 0x%08llx\n",
- addr);
+ dev_err(hdev->dev, "failed to map bar %u to 0x%08llx\n",
+ pci_region->bar, pci_region->addr);
return rc;
}
/**
- * hl_pci_init_iatu() - Initialize the iATU unit inside the PCI controller.
+ * hl_pci_set_outbound_region() - Configure outbound region 0
* @hdev: Pointer to hl_device structure.
- * @sram_base_address: SRAM base address.
- * @dram_base_address: DRAM base address.
- * @host_phys_base_address: Base physical address of host memory for device
- * transactions.
- * @host_phys_size: Size of host memory for device transactions.
+ * @pci_region: Outbound region parameters.
*
- * This is needed in case the firmware doesn't initialize the iATU.
+ * Configure the iATU outbound region 0.
*
* Return: 0 on success, negative value for failure.
*/
-int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address,
- u64 dram_base_address, u64 host_phys_base_address,
- u64 host_phys_size)
+int hl_pci_set_outbound_region(struct hl_device *hdev,
+ struct hl_outbound_pci_region *pci_region)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
- u64 host_phys_end_addr;
+ u64 outbound_region_end_address;
int rc = 0;
- /* Inbound Region 0 - Bar 0 - Point to SRAM base address */
- rc = hl_pci_iatu_write(hdev, 0x114, lower_32_bits(sram_base_address));
- rc |= hl_pci_iatu_write(hdev, 0x118, upper_32_bits(sram_base_address));
- rc |= hl_pci_iatu_write(hdev, 0x100, 0);
- /* Enable + Bar match + match enable */
- rc |= hl_pci_iatu_write(hdev, 0x104, 0xC0080000);
-
- /* Return the DBI window to the default location */
- rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr, 0);
- rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr + 4, 0);
-
- hdev->asic_funcs->set_dma_mask_from_fw(hdev);
-
- /* Point to DRAM */
- if (!hdev->asic_funcs->set_dram_bar_base)
- return -EINVAL;
- if (hdev->asic_funcs->set_dram_bar_base(hdev, dram_base_address) ==
- U64_MAX)
- return -EIO;
-
- /* Outbound Region 0 - Point to Host */
- host_phys_end_addr = host_phys_base_address + host_phys_size - 1;
+ /* Outbound Region 0 */
+ outbound_region_end_address =
+ pci_region->addr + pci_region->size - 1;
rc |= hl_pci_iatu_write(hdev, 0x008,
- lower_32_bits(host_phys_base_address));
+ lower_32_bits(pci_region->addr));
rc |= hl_pci_iatu_write(hdev, 0x00C,
- upper_32_bits(host_phys_base_address));
- rc |= hl_pci_iatu_write(hdev, 0x010, lower_32_bits(host_phys_end_addr));
+ upper_32_bits(pci_region->addr));
+ rc |= hl_pci_iatu_write(hdev, 0x010,
+ lower_32_bits(outbound_region_end_address));
rc |= hl_pci_iatu_write(hdev, 0x014, 0);
if ((hdev->power9_64bit_dma_enable) && (hdev->dma_mask == 64))
@@ -294,7 +288,8 @@ int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address,
else
rc |= hl_pci_iatu_write(hdev, 0x018, 0);
- rc |= hl_pci_iatu_write(hdev, 0x020, upper_32_bits(host_phys_end_addr));
+ rc |= hl_pci_iatu_write(hdev, 0x020,
+ upper_32_bits(outbound_region_end_address));
/* Increase region size */
rc |= hl_pci_iatu_write(hdev, 0x000, 0x00002000);
/* Enable */
@@ -304,16 +299,12 @@ int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address,
rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr, 0);
rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr + 4, 0);
- if (rc)
- return -EIO;
-
- return 0;
+ return rc;
}
/**
* hl_pci_set_dma_mask() - Set DMA masks for the device.
* @hdev: Pointer to hl_device structure.
- * @dma_mask: number of bits for the requested dma mask.
*
* This function sets the DMA masks (regular and consistent) for a specified
* value. If it doesn't succeed, it tries to set it to a fall-back value
diff --git a/drivers/misc/habanalabs/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c
index 5d78d5e1c782..b3cb0ac4721c 100644
--- a/drivers/misc/habanalabs/sysfs.c
+++ b/drivers/misc/habanalabs/common/sysfs.c
@@ -9,9 +9,6 @@
#include <linux/pci.h>
-#define SET_CLK_PKT_TIMEOUT 1000000 /* 1s */
-#define SET_PWR_PKT_TIMEOUT 1000000 /* 1s */
-
long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
{
struct armcp_packet pkt;
@@ -29,7 +26,7 @@ long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
pkt.pll_index = cpu_to_le32(pll_index);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- SET_CLK_PKT_TIMEOUT, &result);
+ 0, &result);
if (rc) {
dev_err(hdev->dev,
@@ -54,7 +51,7 @@ void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
pkt.value = cpu_to_le64(freq);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- SET_CLK_PKT_TIMEOUT, NULL);
+ 0, NULL);
if (rc)
dev_err(hdev->dev,
@@ -74,7 +71,7 @@ u64 hl_get_max_power(struct hl_device *hdev)
ARMCP_PKT_CTL_OPCODE_SHIFT);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- SET_PWR_PKT_TIMEOUT, &result);
+ 0, &result);
if (rc) {
dev_err(hdev->dev, "Failed to get max power, error %d\n", rc);
@@ -96,7 +93,7 @@ void hl_set_max_power(struct hl_device *hdev, u64 value)
pkt.value = cpu_to_le64(value);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- SET_PWR_PKT_TIMEOUT, NULL);
+ 0, NULL);
if (rc)
dev_err(hdev->dev, "Failed to set max power, error %d\n", rc);
@@ -334,6 +331,9 @@ static ssize_t eeprom_read_handler(struct file *filp, struct kobject *kobj,
char *data;
int rc;
+ if (hl_device_disabled_or_in_reset(hdev))
+ return -ENODEV;
+
if (!max_size)
return -EINVAL;
diff --git a/drivers/misc/habanalabs/gaudi/Makefile b/drivers/misc/habanalabs/gaudi/Makefile
index f802cdc980ca..c9f4703cff24 100644
--- a/drivers/misc/habanalabs/gaudi/Makefile
+++ b/drivers/misc/habanalabs/gaudi/Makefile
@@ -1,5 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
-subdir-ccflags-y += -I$(src)
-
HL_GAUDI_FILES := gaudi/gaudi.o gaudi/gaudi_hwmgr.o gaudi/gaudi_security.o \
gaudi/gaudi_coresight.o
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 61f88e9884ce..00a0a7238d81 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -6,12 +6,12 @@
*/
#include "gaudiP.h"
-#include "include/hw_ip/mmu/mmu_general.h"
-#include "include/hw_ip/mmu/mmu_v1_1.h"
-#include "include/gaudi/gaudi_masks.h"
-#include "include/gaudi/gaudi_fw_if.h"
-#include "include/gaudi/gaudi_reg_map.h"
-#include "include/gaudi/gaudi_async_ids_map_extended.h"
+#include "../include/hw_ip/mmu/mmu_general.h"
+#include "../include/hw_ip/mmu/mmu_v1_1.h"
+#include "../include/gaudi/gaudi_masks.h"
+#include "../include/gaudi/gaudi_fw_if.h"
+#include "../include/gaudi/gaudi_reg_map.h"
+#include "../include/gaudi/gaudi_async_ids_map_extended.h"
#include <linux/module.h>
#include <linux/pci.h>
@@ -21,6 +21,7 @@
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/iommu.h>
#include <linux/seq_file.h>
+#include <linux/bitfield.h>
/*
* Gaudi security scheme:
@@ -74,12 +75,12 @@
#define GAUDI_PLDM_RESET_WAIT_MSEC 1000 /* 1s */
#define GAUDI_PLDM_HRESET_TIMEOUT_MSEC 20000 /* 20s */
-#define GAUDI_PLDM_SRESET_TIMEOUT_MSEC 14000 /* 14s */
#define GAUDI_PLDM_TEST_QUEUE_WAIT_USEC 1000000 /* 1s */
#define GAUDI_PLDM_MMU_TIMEOUT_USEC (MMU_CONFIG_TIMEOUT_USEC * 100)
#define GAUDI_PLDM_QMAN0_TIMEOUT_USEC (HL_DEVICE_TIMEOUT_USEC * 30)
#define GAUDI_PLDM_TPC_KERNEL_WAIT_USEC (HL_DEVICE_TIMEOUT_USEC * 30)
#define GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC 1000000 /* 1s */
+#define GAUDI_MSG_TO_CPU_TIMEOUT_USEC 4000000 /* 4s */
#define GAUDI_QMAN0_FENCE_VAL 0x72E91AB9
@@ -96,7 +97,12 @@
#define GAUDI_NUM_OF_QM_ARB_ERR_CAUSE 3
-#define GAUDI_ARB_WDT_TIMEOUT 0x400000
+#define GAUDI_ARB_WDT_TIMEOUT 0x1000000
+
+#define GAUDI_CLK_GATE_DEBUGFS_MASK (\
+ BIT(GAUDI_ENGINE_ID_MME_0) |\
+ BIT(GAUDI_ENGINE_ID_MME_2) |\
+ GENMASK_ULL(GAUDI_ENGINE_ID_TPC_7, GAUDI_ENGINE_ID_TPC_0))
static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = {
"gaudi cq 0_0", "gaudi cq 0_1", "gaudi cq 0_2", "gaudi cq 0_3",
@@ -106,14 +112,14 @@ static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = {
};
static const u8 gaudi_dma_assignment[GAUDI_DMA_MAX] = {
- [GAUDI_PCI_DMA_1] = 0,
- [GAUDI_PCI_DMA_2] = 1,
- [GAUDI_PCI_DMA_3] = 5,
- [GAUDI_HBM_DMA_1] = 2,
- [GAUDI_HBM_DMA_2] = 3,
- [GAUDI_HBM_DMA_3] = 4,
- [GAUDI_HBM_DMA_4] = 6,
- [GAUDI_HBM_DMA_5] = 7
+ [GAUDI_PCI_DMA_1] = GAUDI_ENGINE_ID_DMA_0,
+ [GAUDI_PCI_DMA_2] = GAUDI_ENGINE_ID_DMA_1,
+ [GAUDI_PCI_DMA_3] = GAUDI_ENGINE_ID_DMA_5,
+ [GAUDI_HBM_DMA_1] = GAUDI_ENGINE_ID_DMA_2,
+ [GAUDI_HBM_DMA_2] = GAUDI_ENGINE_ID_DMA_3,
+ [GAUDI_HBM_DMA_3] = GAUDI_ENGINE_ID_DMA_4,
+ [GAUDI_HBM_DMA_4] = GAUDI_ENGINE_ID_DMA_6,
+ [GAUDI_HBM_DMA_5] = GAUDI_ENGINE_ID_DMA_7
};
static const u8 gaudi_cq_assignment[NUMBER_OF_CMPLT_QUEUES] = {
@@ -315,6 +321,13 @@ static enum hl_queue_type gaudi_queue_type[GAUDI_QUEUE_ID_SIZE] = {
QUEUE_TYPE_NA, /* GAUDI_QUEUE_ID_NIC_9_3 */
};
+struct ecc_info_extract_params {
+ u64 block_address;
+ u32 num_memories;
+ bool derr;
+ bool disable_clock_gating;
+};
+
static int gaudi_mmu_update_asid_hop0_addr(struct hl_device *hdev, u32 asid,
u64 phys_addr);
static int gaudi_send_job_on_qman0(struct hl_device *hdev,
@@ -333,22 +346,25 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
struct asic_fixed_properties *prop = &hdev->asic_prop;
int i;
- if (GAUDI_QUEUE_ID_SIZE >= HL_MAX_QUEUES) {
- dev_err(hdev->dev,
- "Number of H/W queues must be smaller than %d\n",
- HL_MAX_QUEUES);
- return -EFAULT;
- }
+ prop->max_queues = GAUDI_QUEUE_ID_SIZE;
+ prop->hw_queues_props = kcalloc(prop->max_queues,
+ sizeof(struct hw_queue_properties),
+ GFP_KERNEL);
- for (i = 0 ; i < GAUDI_QUEUE_ID_SIZE ; i++) {
+ if (!prop->hw_queues_props)
+ return -ENOMEM;
+
+ for (i = 0 ; i < prop->max_queues ; i++) {
if (gaudi_queue_type[i] == QUEUE_TYPE_EXT) {
prop->hw_queues_props[i].type = QUEUE_TYPE_EXT;
prop->hw_queues_props[i].driver_only = 0;
prop->hw_queues_props[i].requires_kernel_cb = 1;
+ prop->hw_queues_props[i].supports_sync_stream = 1;
} else if (gaudi_queue_type[i] == QUEUE_TYPE_CPU) {
prop->hw_queues_props[i].type = QUEUE_TYPE_CPU;
prop->hw_queues_props[i].driver_only = 1;
prop->hw_queues_props[i].requires_kernel_cb = 0;
+ prop->hw_queues_props[i].supports_sync_stream = 0;
} else if (gaudi_queue_type[i] == QUEUE_TYPE_INT) {
prop->hw_queues_props[i].type = QUEUE_TYPE_INT;
prop->hw_queues_props[i].driver_only = 0;
@@ -357,14 +373,13 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
prop->hw_queues_props[i].type = QUEUE_TYPE_NA;
prop->hw_queues_props[i].driver_only = 0;
prop->hw_queues_props[i].requires_kernel_cb = 0;
+ prop->hw_queues_props[i].supports_sync_stream = 0;
}
}
- for (; i < HL_MAX_QUEUES; i++)
- prop->hw_queues_props[i].type = QUEUE_TYPE_NA;
-
prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES;
-
+ prop->sync_stream_first_sob = 0;
+ prop->sync_stream_first_mon = 0;
prop->dram_base_address = DRAM_PHYS_BASE;
prop->dram_size = GAUDI_HBM_SIZE_32GB;
prop->dram_end_address = prop->dram_base_address +
@@ -429,6 +444,8 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
strncpy(prop->armcp_info.card_name, GAUDI_DEFAULT_CARD_NAME,
CARD_NAME_MAX_LEN);
+ prop->max_pending_cs = GAUDI_MAX_PENDING_CS;
+
return 0;
}
@@ -451,6 +468,7 @@ static int gaudi_pci_bars_map(struct hl_device *hdev)
static u64 gaudi_set_hbm_bar_base(struct hl_device *hdev, u64 addr)
{
struct gaudi_device *gaudi = hdev->asic_specific;
+ struct hl_inbound_pci_region pci_region;
u64 old_addr = addr;
int rc;
@@ -458,7 +476,10 @@ static u64 gaudi_set_hbm_bar_base(struct hl_device *hdev, u64 addr)
return old_addr;
/* Inbound Region 2 - Bar 4 - Point to HBM */
- rc = hl_pci_set_dram_bar_base(hdev, 2, 4, addr);
+ pci_region.mode = PCI_BAR_MATCH_MODE;
+ pci_region.bar = HBM_BAR_ID;
+ pci_region.addr = addr;
+ rc = hl_pci_set_inbound_region(hdev, 2, &pci_region);
if (rc)
return U64_MAX;
@@ -472,22 +493,43 @@ static u64 gaudi_set_hbm_bar_base(struct hl_device *hdev, u64 addr)
static int gaudi_init_iatu(struct hl_device *hdev)
{
- int rc = 0;
+ struct hl_inbound_pci_region inbound_region;
+ struct hl_outbound_pci_region outbound_region;
+ int rc;
+
+ /* Inbound Region 0 - Bar 0 - Point to SRAM + CFG */
+ inbound_region.mode = PCI_BAR_MATCH_MODE;
+ inbound_region.bar = SRAM_BAR_ID;
+ inbound_region.addr = SRAM_BASE_ADDR;
+ rc = hl_pci_set_inbound_region(hdev, 0, &inbound_region);
+ if (rc)
+ goto done;
/* Inbound Region 1 - Bar 2 - Point to SPI FLASH */
- rc = hl_pci_iatu_write(hdev, 0x314,
- lower_32_bits(SPI_FLASH_BASE_ADDR));
- rc |= hl_pci_iatu_write(hdev, 0x318,
- upper_32_bits(SPI_FLASH_BASE_ADDR));
- rc |= hl_pci_iatu_write(hdev, 0x300, 0);
- /* Enable + Bar match + match enable */
- rc |= hl_pci_iatu_write(hdev, 0x304, 0xC0080200);
+ inbound_region.mode = PCI_BAR_MATCH_MODE;
+ inbound_region.bar = CFG_BAR_ID;
+ inbound_region.addr = SPI_FLASH_BASE_ADDR;
+ rc = hl_pci_set_inbound_region(hdev, 1, &inbound_region);
+ if (rc)
+ goto done;
+ /* Inbound Region 2 - Bar 4 - Point to HBM */
+ inbound_region.mode = PCI_BAR_MATCH_MODE;
+ inbound_region.bar = HBM_BAR_ID;
+ inbound_region.addr = DRAM_PHYS_BASE;
+ rc = hl_pci_set_inbound_region(hdev, 2, &inbound_region);
if (rc)
- return -EIO;
+ goto done;
+
+ hdev->asic_funcs->set_dma_mask_from_fw(hdev);
- return hl_pci_init_iatu(hdev, SRAM_BASE_ADDR, DRAM_PHYS_BASE,
- HOST_PHYS_BASE, HOST_PHYS_SIZE);
+ /* Outbound Region 0 - Point to Host */
+ outbound_region.addr = HOST_PHYS_BASE;
+ outbound_region.size = HOST_PHYS_SIZE;
+ rc = hl_pci_set_outbound_region(hdev, &outbound_region);
+
+done:
+ return rc;
}
static int gaudi_early_init(struct hl_device *hdev)
@@ -510,7 +552,8 @@ static int gaudi_early_init(struct hl_device *hdev)
(unsigned long long) pci_resource_len(pdev,
SRAM_BAR_ID),
SRAM_BAR_SIZE);
- return -ENODEV;
+ rc = -ENODEV;
+ goto free_queue_props;
}
if (pci_resource_len(pdev, CFG_BAR_ID) != CFG_BAR_SIZE) {
@@ -520,20 +563,26 @@ static int gaudi_early_init(struct hl_device *hdev)
(unsigned long long) pci_resource_len(pdev,
CFG_BAR_ID),
CFG_BAR_SIZE);
- return -ENODEV;
+ rc = -ENODEV;
+ goto free_queue_props;
}
prop->dram_pci_bar_size = pci_resource_len(pdev, HBM_BAR_ID);
rc = hl_pci_init(hdev);
if (rc)
- return rc;
+ goto free_queue_props;
return 0;
+
+free_queue_props:
+ kfree(hdev->asic_prop.hw_queues_props);
+ return rc;
}
static int gaudi_early_fini(struct hl_device *hdev)
{
+ kfree(hdev->asic_prop.hw_queues_props);
hl_pci_fini(hdev);
return 0;
@@ -548,11 +597,36 @@ static int gaudi_early_fini(struct hl_device *hdev)
static void gaudi_fetch_psoc_frequency(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
+ u32 trace_freq = 0;
+ u32 pll_clk = 0;
+ u32 div_fctr = RREG32(mmPSOC_CPU_PLL_DIV_FACTOR_2);
+ u32 div_sel = RREG32(mmPSOC_CPU_PLL_DIV_SEL_2);
+ u32 nr = RREG32(mmPSOC_CPU_PLL_NR);
+ u32 nf = RREG32(mmPSOC_CPU_PLL_NF);
+ u32 od = RREG32(mmPSOC_CPU_PLL_OD);
+
+ if (div_sel == DIV_SEL_REF_CLK || div_sel == DIV_SEL_DIVIDED_REF) {
+ if (div_sel == DIV_SEL_REF_CLK)
+ trace_freq = PLL_REF_CLK;
+ else
+ trace_freq = PLL_REF_CLK / (div_fctr + 1);
+ } else if (div_sel == DIV_SEL_PLL_CLK ||
+ div_sel == DIV_SEL_DIVIDED_PLL) {
+ pll_clk = PLL_REF_CLK * (nf + 1) / ((nr + 1) * (od + 1));
+ if (div_sel == DIV_SEL_PLL_CLK)
+ trace_freq = pll_clk;
+ else
+ trace_freq = pll_clk / (div_fctr + 1);
+ } else {
+ dev_warn(hdev->dev,
+ "Received invalid div select value: %d", div_sel);
+ }
- prop->psoc_pci_pll_nr = RREG32(mmPSOC_PCI_PLL_NR);
- prop->psoc_pci_pll_nf = RREG32(mmPSOC_PCI_PLL_NF);
- prop->psoc_pci_pll_od = RREG32(mmPSOC_PCI_PLL_OD);
- prop->psoc_pci_pll_div_factor = RREG32(mmPSOC_PCI_PLL_DIV_FACTOR_1);
+ prop->psoc_timestamp_frequency = trace_freq;
+ prop->psoc_pci_pll_nr = nr;
+ prop->psoc_pci_pll_nf = nf;
+ prop->psoc_pci_pll_od = od;
+ prop->psoc_pci_pll_div_factor = div_fctr;
}
static int _gaudi_init_tpc_mem(struct hl_device *hdev,
@@ -567,7 +641,7 @@ static int _gaudi_init_tpc_mem(struct hl_device *hdev,
u8 tpc_id;
int rc;
- cb = hl_cb_kernel_create(hdev, PAGE_SIZE);
+ cb = hl_cb_kernel_create(hdev, PAGE_SIZE, false);
if (!cb)
return -EFAULT;
@@ -1638,8 +1712,8 @@ static void gaudi_init_hbm_cred(struct hl_device *hdev)
uint32_t hbm0_wr, hbm1_wr, hbm0_rd, hbm1_rd;
hbm0_wr = 0x33333333;
- hbm1_wr = 0x33333333;
hbm0_rd = 0x77777777;
+ hbm1_wr = 0x55555555;
hbm1_rd = 0xDDDDDDDD;
WREG32(mmDMA_IF_E_N_HBM0_WR_CRED_CNT, hbm0_wr);
@@ -1689,125 +1763,6 @@ static void gaudi_init_hbm_cred(struct hl_device *hdev)
(1 << DMA_IF_HBM_CRED_EN_WRITE_CREDIT_EN_SHIFT));
}
-static void gaudi_init_rate_limiter(struct hl_device *hdev)
-{
- u32 nr, nf, od, sat, rst, timeout;
- u64 freq;
-
- nr = RREG32(mmPSOC_HBM_PLL_NR);
- nf = RREG32(mmPSOC_HBM_PLL_NF);
- od = RREG32(mmPSOC_HBM_PLL_OD);
- freq = (50 * (nf + 1)) / ((nr + 1) * (od + 1));
-
- dev_dbg(hdev->dev, "HBM frequency is %lluMHz\n", freq);
-
- /* Configuration is for five (5) DDMA channels */
- if (freq == 800) {
- sat = 4;
- rst = 11;
- timeout = 15;
- } else if (freq == 900) {
- sat = 4;
- rst = 15;
- timeout = 16;
- } else if (freq == 950) {
- sat = 4;
- rst = 15;
- timeout = 15;
- } else {
- dev_warn(hdev->dev,
- "unsupported HBM frequency %lluMHz, no rate-limiters\n",
- freq);
- return;
- }
-
- WREG32(mmDMA_IF_W_S_DOWN_RSP_MID_WGHT_0, 0x111);
- WREG32(mmDMA_IF_W_S_DOWN_RSP_MID_WGHT_1, 0x111);
- WREG32(mmDMA_IF_E_S_DOWN_RSP_MID_WGHT_0, 0x111);
- WREG32(mmDMA_IF_E_S_DOWN_RSP_MID_WGHT_1, 0x111);
- WREG32(mmDMA_IF_W_N_DOWN_RSP_MID_WGHT_0, 0x111);
- WREG32(mmDMA_IF_W_N_DOWN_RSP_MID_WGHT_1, 0x111);
- WREG32(mmDMA_IF_E_N_DOWN_RSP_MID_WGHT_0, 0x111);
- WREG32(mmDMA_IF_E_N_DOWN_RSP_MID_WGHT_1, 0x111);
-
- if (!hdev->rl_enable) {
- dev_info(hdev->dev, "Rate limiters disabled\n");
- return;
- }
-
- WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_HBM_SAT, sat);
- WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_HBM_SAT, sat);
- WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_HBM_SAT, sat);
- WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_HBM_SAT, sat);
- WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_HBM_SAT, sat);
- WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_HBM_SAT, sat);
- WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_HBM_SAT, sat);
- WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_HBM_SAT, sat);
-
- WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_HBM_RST, rst);
- WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_HBM_RST, rst);
- WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_HBM_RST, rst);
- WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_HBM_RST, rst);
- WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_HBM_RST, rst);
- WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_HBM_RST, rst);
- WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_HBM_RST, rst);
- WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_HBM_RST, rst);
-
- WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_HBM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_HBM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_HBM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_HBM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_HBM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_HBM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_HBM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_HBM_TIMEOUT, timeout);
-
- WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_HBM_EN, 1);
- WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_HBM_EN, 1);
- WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_HBM_EN, 1);
- WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_HBM_EN, 1);
- WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_HBM_EN, 1);
- WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_HBM_EN, 1);
- WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_HBM_EN, 1);
- WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_HBM_EN, 1);
-
- WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_SRAM_SAT, sat);
- WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_SRAM_SAT, sat);
- WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_SRAM_SAT, sat);
- WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_SRAM_SAT, sat);
- WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_SRAM_SAT, sat);
- WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_SRAM_SAT, sat);
- WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_SRAM_SAT, sat);
- WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_SRAM_SAT, sat);
-
- WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_SRAM_RST, rst);
- WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_SRAM_RST, rst);
- WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_SRAM_RST, rst);
- WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_SRAM_RST, rst);
- WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_SRAM_RST, rst);
- WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_SRAM_RST, rst);
- WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_SRAM_RST, rst);
- WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_SRAM_RST, rst);
-
- WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_SRAM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_SRAM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_SRAM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_SRAM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_SRAM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_SRAM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_SRAM_TIMEOUT, timeout);
- WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_SRAM_TIMEOUT, timeout);
-
- WREG32(mmDMA_IF_W_S_DOWN_CH0_RL_SRAM_EN, 1);
- WREG32(mmDMA_IF_W_S_DOWN_CH1_RL_SRAM_EN, 1);
- WREG32(mmDMA_IF_E_S_DOWN_CH0_RL_SRAM_EN, 1);
- WREG32(mmDMA_IF_E_S_DOWN_CH1_RL_SRAM_EN, 1);
- WREG32(mmDMA_IF_W_N_DOWN_CH0_RL_SRAM_EN, 1);
- WREG32(mmDMA_IF_W_N_DOWN_CH1_RL_SRAM_EN, 1);
- WREG32(mmDMA_IF_E_N_DOWN_CH0_RL_SRAM_EN, 1);
- WREG32(mmDMA_IF_E_N_DOWN_CH1_RL_SRAM_EN, 1);
-}
-
static void gaudi_init_golden_registers(struct hl_device *hdev)
{
u32 tpc_offset;
@@ -1817,9 +1772,7 @@ static void gaudi_init_golden_registers(struct hl_device *hdev)
gaudi_init_hbm_cred(hdev);
- gaudi_init_rate_limiter(hdev);
-
- gaudi_disable_clock_gating(hdev);
+ hdev->asic_funcs->disable_clock_gating(hdev);
for (tpc_id = 0, tpc_offset = 0;
tpc_id < TPC_NUMBER_OF_ENGINES;
@@ -1839,9 +1792,6 @@ static void gaudi_init_golden_registers(struct hl_device *hdev)
WREG32(mmMME1_CTRL_EUS_ROLLUP_CNT_ADD, 3);
WREG32(mmMME2_CTRL_EUS_ROLLUP_CNT_ADD, 3);
WREG32(mmMME3_CTRL_EUS_ROLLUP_CNT_ADD, 3);
-
- /* WA for H3-2081 */
- WREG32(mmPCIE_WRAP_MAX_OUTSTAND, 0x10ff);
}
static void gaudi_init_pci_dma_qman(struct hl_device *hdev, int dma_id,
@@ -1893,6 +1843,8 @@ static void gaudi_init_pci_dma_qman(struct hl_device *hdev, int dma_id,
WREG32(mmDMA0_QM_CP_MSG_BASE3_ADDR_LO_0 + q_off, so_base_ws_lo);
WREG32(mmDMA0_QM_CP_MSG_BASE3_ADDR_HI_0 + q_off, so_base_ws_hi);
+ WREG32(mmDMA0_QM_CP_BARRIER_CFG_0 + q_off, 0x100);
+
/* The following configuration is needed only once per QMAN */
if (qman_id == 0) {
/* Configure RAZWI IRQ */
@@ -2529,46 +2481,55 @@ static void gaudi_tpc_stall(struct hl_device *hdev)
WREG32(mmTPC7_CFG_TPC_STALL, 1 << TPC0_CFG_TPC_STALL_V_SHIFT);
}
-static void gaudi_enable_clock_gating(struct hl_device *hdev)
+static void gaudi_set_clock_gating(struct hl_device *hdev)
{
struct gaudi_device *gaudi = hdev->asic_specific;
u32 qman_offset;
int i;
- if (!hdev->clock_gating)
- return;
-
- if (gaudi->hw_cap_initialized & HW_CAP_CLK_GATE)
- return;
-
/* In case we are during debug session, don't enable the clock gate
* as it may interfere
*/
if (hdev->in_debug)
return;
- for (i = 0, qman_offset = 0 ; i < PCI_DMA_NUMBER_OF_CHNLS ; i++) {
+ for (i = GAUDI_PCI_DMA_1, qman_offset = 0 ; i < GAUDI_HBM_DMA_1 ; i++) {
+ if (!(hdev->clock_gating_mask &
+ (BIT_ULL(gaudi_dma_assignment[i]))))
+ continue;
+
qman_offset = gaudi_dma_assignment[i] * DMA_QMAN_OFFSET;
WREG32(mmDMA0_QM_CGM_CFG1 + qman_offset, QMAN_CGM1_PWR_GATE_EN);
WREG32(mmDMA0_QM_CGM_CFG + qman_offset,
QMAN_UPPER_CP_CGM_PWR_GATE_EN);
}
- for (; i < HBM_DMA_NUMBER_OF_CHNLS ; i++) {
+ for (i = GAUDI_HBM_DMA_1 ; i < GAUDI_DMA_MAX ; i++) {
+ if (!(hdev->clock_gating_mask &
+ (BIT_ULL(gaudi_dma_assignment[i]))))
+ continue;
+
qman_offset = gaudi_dma_assignment[i] * DMA_QMAN_OFFSET;
WREG32(mmDMA0_QM_CGM_CFG1 + qman_offset, QMAN_CGM1_PWR_GATE_EN);
WREG32(mmDMA0_QM_CGM_CFG + qman_offset,
QMAN_COMMON_CP_CGM_PWR_GATE_EN);
}
- WREG32(mmMME0_QM_CGM_CFG1, QMAN_CGM1_PWR_GATE_EN);
- WREG32(mmMME0_QM_CGM_CFG,
- QMAN_COMMON_CP_CGM_PWR_GATE_EN);
- WREG32(mmMME2_QM_CGM_CFG1, QMAN_CGM1_PWR_GATE_EN);
- WREG32(mmMME2_QM_CGM_CFG,
- QMAN_COMMON_CP_CGM_PWR_GATE_EN);
+ if (hdev->clock_gating_mask & (BIT_ULL(GAUDI_ENGINE_ID_MME_0))) {
+ WREG32(mmMME0_QM_CGM_CFG1, QMAN_CGM1_PWR_GATE_EN);
+ WREG32(mmMME0_QM_CGM_CFG, QMAN_COMMON_CP_CGM_PWR_GATE_EN);
+ }
+
+ if (hdev->clock_gating_mask & (BIT_ULL(GAUDI_ENGINE_ID_MME_2))) {
+ WREG32(mmMME2_QM_CGM_CFG1, QMAN_CGM1_PWR_GATE_EN);
+ WREG32(mmMME2_QM_CGM_CFG, QMAN_COMMON_CP_CGM_PWR_GATE_EN);
+ }
for (i = 0, qman_offset = 0 ; i < TPC_NUMBER_OF_ENGINES ; i++) {
+ if (!(hdev->clock_gating_mask &
+ (BIT_ULL(GAUDI_ENGINE_ID_TPC_0 + i))))
+ continue;
+
WREG32(mmTPC0_QM_CGM_CFG1 + qman_offset,
QMAN_CGM1_PWR_GATE_EN);
WREG32(mmTPC0_QM_CGM_CFG + qman_offset,
@@ -2632,36 +2593,23 @@ static void gaudi_disable_timestamp(struct hl_device *hdev)
static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset)
{
- u32 wait_timeout_ms, cpu_timeout_ms;
+ u32 wait_timeout_ms;
dev_info(hdev->dev,
"Halting compute engines and disabling interrupts\n");
- if (hdev->pldm) {
+ if (hdev->pldm)
wait_timeout_ms = GAUDI_PLDM_RESET_WAIT_MSEC;
- cpu_timeout_ms = GAUDI_PLDM_RESET_WAIT_MSEC;
- } else {
+ else
wait_timeout_ms = GAUDI_RESET_WAIT_MSEC;
- cpu_timeout_ms = GAUDI_CPU_RESET_WAIT_MSEC;
- }
- if (hard_reset) {
- /*
- * I don't know what is the state of the CPU so make sure it is
- * stopped in any means necessary
- */
- WREG32(mmPSOC_GLOBAL_CONF_KMD_MSG_TO_CPU, KMD_MSG_GOTO_WFE);
- WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
- GAUDI_EVENT_HALT_MACHINE);
- msleep(cpu_timeout_ms);
- }
gaudi_stop_mme_qmans(hdev);
gaudi_stop_tpc_qmans(hdev);
gaudi_stop_hbm_dma_qmans(hdev);
gaudi_stop_pci_dma_qmans(hdev);
- gaudi_disable_clock_gating(hdev);
+ hdev->asic_funcs->disable_clock_gating(hdev);
msleep(wait_timeout_ms);
@@ -2679,10 +2627,7 @@ static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset)
gaudi_disable_timestamp(hdev);
- if (hard_reset)
- gaudi_disable_msi(hdev);
- else
- gaudi_sync_irqs(hdev);
+ gaudi_disable_msi(hdev);
}
static int gaudi_mmu_init(struct hl_device *hdev)
@@ -2716,8 +2661,7 @@ static int gaudi_mmu_init(struct hl_device *hdev)
WREG32(mmSTLB_CACHE_INV_BASE_39_8, MMU_CACHE_MNG_ADDR >> 8);
WREG32(mmSTLB_CACHE_INV_BASE_49_40, MMU_CACHE_MNG_ADDR >> 40);
- hdev->asic_funcs->mmu_invalidate_cache(hdev, true,
- VM_TYPE_USERPTR | VM_TYPE_PHYS_PACK);
+ hdev->asic_funcs->mmu_invalidate_cache(hdev, true, 0);
WREG32(mmMMU_UP_MMU_ENABLE, 1);
WREG32(mmMMU_UP_SPI_MASK, 0xF);
@@ -2725,6 +2669,12 @@ static int gaudi_mmu_init(struct hl_device *hdev)
WREG32(mmSTLB_HOP_CONFIGURATION,
hdev->mmu_huge_page_opt ? 0x30440 : 0x40440);
+ /*
+ * The H/W expects the first PI after init to be 1. After wraparound
+ * we'll write 0.
+ */
+ gaudi->mmu_cache_inv_pi = 1;
+
gaudi->hw_cap_initialized |= HW_CAP_MMU;
return 0;
@@ -2961,16 +2911,6 @@ static int gaudi_hw_init(struct hl_device *hdev)
gaudi_init_hbm_dma_qmans(hdev);
- /*
- * Before pushing u-boot/linux to device, need to set the hbm bar to
- * base address of dram
- */
- if (gaudi_set_hbm_bar_base(hdev, DRAM_PHYS_BASE) == U64_MAX) {
- dev_err(hdev->dev,
- "failed to map HBM bar to DRAM base address\n");
- return -EIO;
- }
-
rc = gaudi_init_cpu(hdev);
if (rc) {
dev_err(hdev->dev, "failed to initialize CPU\n");
@@ -2995,7 +2935,7 @@ static int gaudi_hw_init(struct hl_device *hdev)
gaudi_init_tpc_qmans(hdev);
- gaudi_enable_clock_gating(hdev);
+ hdev->asic_funcs->set_clock_gating(hdev);
gaudi_enable_timestamp(hdev);
@@ -3029,48 +2969,56 @@ disable_queues:
static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
{
struct gaudi_device *gaudi = hdev->asic_specific;
- u32 status, reset_timeout_ms, boot_strap = 0;
+ u32 status, reset_timeout_ms, cpu_timeout_ms, boot_strap = 0;
+
+ if (!hard_reset) {
+ dev_err(hdev->dev, "GAUDI doesn't support soft-reset\n");
+ return;
+ }
if (hdev->pldm) {
- if (hard_reset)
- reset_timeout_ms = GAUDI_PLDM_HRESET_TIMEOUT_MSEC;
- else
- reset_timeout_ms = GAUDI_PLDM_SRESET_TIMEOUT_MSEC;
+ reset_timeout_ms = GAUDI_PLDM_HRESET_TIMEOUT_MSEC;
+ cpu_timeout_ms = GAUDI_PLDM_RESET_WAIT_MSEC;
} else {
reset_timeout_ms = GAUDI_RESET_TIMEOUT_MSEC;
+ cpu_timeout_ms = GAUDI_CPU_RESET_WAIT_MSEC;
}
- if (hard_reset) {
- /* Tell ASIC not to re-initialize PCIe */
- WREG32(mmPREBOOT_PCIE_EN, LKD_HARD_RESET_MAGIC);
+ /* Set device to handle FLR by H/W as we will put the device CPU to
+ * halt mode
+ */
+ WREG32(mmPCIE_AUX_FLR_CTRL, (PCIE_AUX_FLR_CTRL_HW_CTRL_MASK |
+ PCIE_AUX_FLR_CTRL_INT_MASK_MASK));
- boot_strap = RREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS);
- /* H/W bug WA:
- * rdata[31:0] = strap_read_val;
- * wdata[31:0] = rdata[30:21],1'b0,rdata[20:0]
- */
- boot_strap = (((boot_strap & 0x7FE00000) << 1) |
- (boot_strap & 0x001FFFFF));
- WREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS, boot_strap & ~0x2);
-
- /* Restart BTL/BLR upon hard-reset */
- WREG32(mmPSOC_GLOBAL_CONF_BOOT_SEQ_RE_START, 1);
-
- WREG32(mmPSOC_GLOBAL_CONF_SW_ALL_RST,
- 1 << PSOC_GLOBAL_CONF_SW_ALL_RST_IND_SHIFT);
- dev_info(hdev->dev,
- "Issued HARD reset command, going to wait %dms\n",
- reset_timeout_ms);
- } else {
- /* Don't restart BTL/BLR upon soft-reset */
- WREG32(mmPSOC_GLOBAL_CONF_BOOT_SEQ_RE_START, 0);
+ /* I don't know what is the state of the CPU so make sure it is
+ * stopped in any means necessary
+ */
+ WREG32(mmPSOC_GLOBAL_CONF_KMD_MSG_TO_CPU, KMD_MSG_GOTO_WFE);
+ WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, GAUDI_EVENT_HALT_MACHINE);
- WREG32(mmPSOC_GLOBAL_CONF_SOFT_RST,
- 1 << PSOC_GLOBAL_CONF_SOFT_RST_IND_SHIFT);
- dev_info(hdev->dev,
- "Issued SOFT reset command, going to wait %dms\n",
- reset_timeout_ms);
- }
+ msleep(cpu_timeout_ms);
+
+ /* Tell ASIC not to re-initialize PCIe */
+ WREG32(mmPREBOOT_PCIE_EN, LKD_HARD_RESET_MAGIC);
+
+ boot_strap = RREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS);
+
+ /* H/W bug WA:
+ * rdata[31:0] = strap_read_val;
+ * wdata[31:0] = rdata[30:21],1'b0,rdata[20:0]
+ */
+ boot_strap = (((boot_strap & 0x7FE00000) << 1) |
+ (boot_strap & 0x001FFFFF));
+ WREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS, boot_strap & ~0x2);
+
+ /* Restart BTL/BLR upon hard-reset */
+ WREG32(mmPSOC_GLOBAL_CONF_BOOT_SEQ_RE_START, 1);
+
+ WREG32(mmPSOC_GLOBAL_CONF_SW_ALL_RST,
+ 1 << PSOC_GLOBAL_CONF_SW_ALL_RST_IND_SHIFT);
+ dev_info(hdev->dev,
+ "Issued HARD reset command, going to wait %dms\n",
+ reset_timeout_ms);
/*
* After hard reset, we can't poll the BTM_FSM register because the PSOC
@@ -3084,18 +3032,6 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
"Timeout while waiting for device to reset 0x%x\n",
status);
- if (!hard_reset) {
- gaudi->hw_cap_initialized &= ~(HW_CAP_PCI_DMA | HW_CAP_MME |
- HW_CAP_TPC_MASK |
- HW_CAP_HBM_DMA);
-
- WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
- GAUDI_EVENT_SOFT_RESET);
- return;
- }
-
- /* We continue here only for hard-reset */
-
WREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS, boot_strap);
gaudi->hw_cap_initialized &= ~(HW_CAP_CPU | HW_CAP_CPU_Q |
@@ -3104,7 +3040,9 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
HW_CAP_HBM_DMA | HW_CAP_PLL |
HW_CAP_MMU |
HW_CAP_SRAM_SCRAMBLER |
- HW_CAP_HBM_SCRAMBLER);
+ HW_CAP_HBM_SCRAMBLER |
+ HW_CAP_CLK_GATE);
+
memset(gaudi->events_stat, 0, sizeof(gaudi->events_stat));
}
@@ -3455,6 +3393,9 @@ static int gaudi_send_cpu_message(struct hl_device *hdev, u32 *msg,
return 0;
}
+ if (!timeout)
+ timeout = GAUDI_MSG_TO_CPU_TIMEOUT_USEC;
+
return hl_fw_send_cpu_message(hdev, GAUDI_QUEUE_ID_CPU_PQ, msg, len,
timeout, result);
}
@@ -3550,7 +3491,7 @@ static int gaudi_test_queues(struct hl_device *hdev)
{
int i, rc, ret_val = 0;
- for (i = 0 ; i < HL_MAX_QUEUES ; i++) {
+ for (i = 0 ; i < hdev->asic_prop.max_queues ; i++) {
if (hdev->asic_prop.hw_queues_props[i].type == QUEUE_TYPE_EXT) {
rc = gaudi_test_queue(hdev, i);
if (rc)
@@ -3790,6 +3731,25 @@ static int gaudi_validate_dma_pkt_no_mmu(struct hl_device *hdev,
src_in_host);
}
+static int gaudi_validate_load_and_exe_pkt(struct hl_device *hdev,
+ struct hl_cs_parser *parser,
+ struct packet_load_and_exe *user_pkt)
+{
+ u32 cfg;
+
+ cfg = le32_to_cpu(user_pkt->cfg);
+
+ if (cfg & GAUDI_PKT_LOAD_AND_EXE_CFG_DST_MASK) {
+ dev_err(hdev->dev,
+ "User not allowed to use Load and Execute\n");
+ return -EPERM;
+ }
+
+ parser->patched_cb_size += sizeof(struct packet_load_and_exe);
+
+ return 0;
+}
+
static int gaudi_validate_cb(struct hl_device *hdev,
struct hl_cs_parser *parser, bool is_mmu)
{
@@ -3838,6 +3798,17 @@ static int gaudi_validate_cb(struct hl_device *hdev,
rc = -EPERM;
break;
+ case PACKET_WREG_BULK:
+ dev_err(hdev->dev,
+ "User not allowed to use WREG_BULK\n");
+ rc = -EPERM;
+ break;
+
+ case PACKET_LOAD_AND_EXE:
+ rc = gaudi_validate_load_and_exe_pkt(hdev, parser,
+ (struct packet_load_and_exe *) user_pkt);
+ break;
+
case PACKET_LIN_DMA:
parser->contains_dma_pkt = true;
if (is_mmu)
@@ -3848,14 +3819,12 @@ static int gaudi_validate_cb(struct hl_device *hdev,
break;
case PACKET_WREG_32:
- case PACKET_WREG_BULK:
case PACKET_MSG_LONG:
case PACKET_MSG_SHORT:
case PACKET_REPEAT:
case PACKET_FENCE:
case PACKET_NOP:
case PACKET_ARB_POINT:
- case PACKET_LOAD_AND_EXE:
parser->patched_cb_size += pkt_size;
break;
@@ -4103,9 +4072,8 @@ static int gaudi_parse_cb_mmu(struct hl_device *hdev,
parser->patched_cb_size = parser->user_cb_size +
sizeof(struct packet_msg_prot) * 2;
- rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr,
- parser->patched_cb_size,
- &patched_cb_handle, HL_KERNEL_ASID_ID);
+ rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size,
+ &patched_cb_handle, HL_KERNEL_ASID_ID, false);
if (rc) {
dev_err(hdev->dev,
@@ -4177,9 +4145,8 @@ static int gaudi_parse_cb_no_mmu(struct hl_device *hdev,
if (rc)
goto free_userptr;
- rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr,
- parser->patched_cb_size,
- &patched_cb_handle, HL_KERNEL_ASID_ID);
+ rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size,
+ &patched_cb_handle, HL_KERNEL_ASID_ID, false);
if (rc) {
dev_err(hdev->dev,
"Failed to allocate patched CB for DMA CS %d\n", rc);
@@ -4308,11 +4275,11 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr,
{
struct packet_lin_dma *lin_dma_pkt;
struct hl_cs_job *job;
- u32 cb_size, ctl;
+ u32 cb_size, ctl, err_cause;
struct hl_cb *cb;
int rc;
- cb = hl_cb_kernel_create(hdev, PAGE_SIZE);
+ cb = hl_cb_kernel_create(hdev, PAGE_SIZE, false);
if (!cb)
return -EFAULT;
@@ -4337,6 +4304,15 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr,
goto release_cb;
}
+ /* Verify DMA is OK */
+ err_cause = RREG32(mmDMA0_CORE_ERR_CAUSE);
+ if (err_cause && !hdev->init_done) {
+ dev_dbg(hdev->dev,
+ "Clearing DMA0 engine from errors (cause 0x%x)\n",
+ err_cause);
+ WREG32(mmDMA0_CORE_ERR_CAUSE, err_cause);
+ }
+
job->id = 0;
job->user_cb = cb;
job->user_cb->cs_cnt++;
@@ -4348,11 +4324,23 @@ static int gaudi_memset_device_memory(struct hl_device *hdev, u64 addr,
hl_debugfs_add_job(hdev, job);
rc = gaudi_send_job_on_qman0(hdev, job);
-
hl_debugfs_remove_job(hdev, job);
kfree(job);
cb->cs_cnt--;
+ /* Verify DMA is OK */
+ err_cause = RREG32(mmDMA0_CORE_ERR_CAUSE);
+ if (err_cause) {
+ dev_err(hdev->dev, "DMA Failed, cause 0x%x\n", err_cause);
+ rc = -EIO;
+ if (!hdev->init_done) {
+ dev_dbg(hdev->dev,
+ "Clearing DMA0 engine from errors (cause 0x%x)\n",
+ err_cause);
+ WREG32(mmDMA0_CORE_ERR_CAUSE, err_cause);
+ }
+ }
+
release_cb:
hl_cb_put(cb);
hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT);
@@ -4490,13 +4478,18 @@ static int gaudi_debugfs_read32(struct hl_device *hdev, u64 addr, u32 *val)
int rc = 0;
if ((addr >= CFG_BASE) && (addr < CFG_BASE + CFG_SIZE)) {
- if (gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) {
+
+ if ((gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) &&
+ (hdev->clock_gating_mask &
+ GAUDI_CLK_GATE_DEBUGFS_MASK)) {
+
dev_err_ratelimited(hdev->dev,
"Can't read register - clock gating is enabled!\n");
rc = -EFAULT;
} else {
*val = RREG32(addr - CFG_BASE);
}
+
} else if ((addr >= SRAM_BASE_ADDR) &&
(addr < SRAM_BASE_ADDR + SRAM_BAR_SIZE)) {
*val = readl(hdev->pcie_bar[SRAM_BAR_ID] +
@@ -4532,13 +4525,18 @@ static int gaudi_debugfs_write32(struct hl_device *hdev, u64 addr, u32 val)
int rc = 0;
if ((addr >= CFG_BASE) && (addr < CFG_BASE + CFG_SIZE)) {
- if (gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) {
+
+ if ((gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) &&
+ (hdev->clock_gating_mask &
+ GAUDI_CLK_GATE_DEBUGFS_MASK)) {
+
dev_err_ratelimited(hdev->dev,
"Can't write register - clock gating is enabled!\n");
rc = -EFAULT;
} else {
WREG32(addr - CFG_BASE, val);
}
+
} else if ((addr >= SRAM_BASE_ADDR) &&
(addr < SRAM_BASE_ADDR + SRAM_BAR_SIZE)) {
writel(val, hdev->pcie_bar[SRAM_BAR_ID] +
@@ -4574,7 +4572,11 @@ static int gaudi_debugfs_read64(struct hl_device *hdev, u64 addr, u64 *val)
int rc = 0;
if ((addr >= CFG_BASE) && (addr <= CFG_BASE + CFG_SIZE - sizeof(u64))) {
- if (gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) {
+
+ if ((gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) &&
+ (hdev->clock_gating_mask &
+ GAUDI_CLK_GATE_DEBUGFS_MASK)) {
+
dev_err_ratelimited(hdev->dev,
"Can't read register - clock gating is enabled!\n");
rc = -EFAULT;
@@ -4584,6 +4586,7 @@ static int gaudi_debugfs_read64(struct hl_device *hdev, u64 addr, u64 *val)
*val = (((u64) val_h) << 32) | val_l;
}
+
} else if ((addr >= SRAM_BASE_ADDR) &&
(addr <= SRAM_BASE_ADDR + SRAM_BAR_SIZE - sizeof(u64))) {
*val = readq(hdev->pcie_bar[SRAM_BAR_ID] +
@@ -4620,7 +4623,11 @@ static int gaudi_debugfs_write64(struct hl_device *hdev, u64 addr, u64 val)
int rc = 0;
if ((addr >= CFG_BASE) && (addr <= CFG_BASE + CFG_SIZE - sizeof(u64))) {
- if (gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) {
+
+ if ((gaudi->hw_cap_initialized & HW_CAP_CLK_GATE) &&
+ (hdev->clock_gating_mask &
+ GAUDI_CLK_GATE_DEBUGFS_MASK)) {
+
dev_err_ratelimited(hdev->dev,
"Can't write register - clock gating is enabled!\n");
rc = -EFAULT;
@@ -4629,6 +4636,7 @@ static int gaudi_debugfs_write64(struct hl_device *hdev, u64 addr, u64 val)
WREG32(addr + sizeof(u32) - CFG_BASE,
upper_32_bits(val));
}
+
} else if ((addr >= SRAM_BASE_ADDR) &&
(addr <= SRAM_BASE_ADDR + SRAM_BAR_SIZE - sizeof(u64))) {
writeq(val, hdev->pcie_bar[SRAM_BAR_ID] +
@@ -4850,7 +4858,7 @@ static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid)
gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_ARUSER, asid);
gaudi_mmu_prepare_reg(hdev, mmPSOC_GLOBAL_CONF_TRACE_AWUSER, asid);
- hdev->asic_funcs->enable_clock_gating(hdev);
+ hdev->asic_funcs->set_clock_gating(hdev);
mutex_unlock(&gaudi->clk_gate_mutex);
}
@@ -5178,62 +5186,76 @@ static void gaudi_print_mmu_error_info(struct hl_device *hdev)
* | |0xF4C memory wrappers 127:96 |
* +-------------------+------------------------------------------------------+
*/
-static void gaudi_print_ecc_info_generic(struct hl_device *hdev,
- const char *block_name,
- u64 block_address, int num_memories,
- bool derr, bool disable_clock_gating)
+static int gaudi_extract_ecc_info(struct hl_device *hdev,
+ struct ecc_info_extract_params *params, u64 *ecc_address,
+ u64 *ecc_syndrom, u8 *memory_wrapper_idx)
{
struct gaudi_device *gaudi = hdev->asic_specific;
- int num_mem_regs = num_memories / 32 + ((num_memories % 32) ? 1 : 0);
+ u32 i, num_mem_regs, reg, err_bit;
+ u64 err_addr, err_word = 0;
+ int rc = 0;
- if (block_address >= CFG_BASE)
- block_address -= CFG_BASE;
+ num_mem_regs = params->num_memories / 32 +
+ ((params->num_memories % 32) ? 1 : 0);
- if (derr)
- block_address += GAUDI_ECC_DERR0_OFFSET;
+ if (params->block_address >= CFG_BASE)
+ params->block_address -= CFG_BASE;
+
+ if (params->derr)
+ err_addr = params->block_address + GAUDI_ECC_DERR0_OFFSET;
else
- block_address += GAUDI_ECC_SERR0_OFFSET;
+ err_addr = params->block_address + GAUDI_ECC_SERR0_OFFSET;
- if (disable_clock_gating) {
+ if (params->disable_clock_gating) {
mutex_lock(&gaudi->clk_gate_mutex);
hdev->asic_funcs->disable_clock_gating(hdev);
}
- switch (num_mem_regs) {
- case 1:
- dev_err(hdev->dev,
- "%s ECC indication: 0x%08x\n",
- block_name, RREG32(block_address));
- break;
- case 2:
- dev_err(hdev->dev,
- "%s ECC indication: 0x%08x 0x%08x\n",
- block_name,
- RREG32(block_address), RREG32(block_address + 4));
- break;
- case 3:
- dev_err(hdev->dev,
- "%s ECC indication: 0x%08x 0x%08x 0x%08x\n",
- block_name,
- RREG32(block_address), RREG32(block_address + 4),
- RREG32(block_address + 8));
- break;
- case 4:
- dev_err(hdev->dev,
- "%s ECC indication: 0x%08x 0x%08x 0x%08x 0x%08x\n",
- block_name,
- RREG32(block_address), RREG32(block_address + 4),
- RREG32(block_address + 8), RREG32(block_address + 0xc));
- break;
- default:
- break;
+ /* Set invalid wrapper index */
+ *memory_wrapper_idx = 0xFF;
+ /* Iterate through memory wrappers, a single bit must be set */
+ for (i = 0 ; i > num_mem_regs ; i++) {
+ err_addr += i * 4;
+ err_word = RREG32(err_addr);
+ if (err_word) {
+ err_bit = __ffs(err_word);
+ *memory_wrapper_idx = err_bit + (32 * i);
+ break;
+ }
}
- if (disable_clock_gating) {
- hdev->asic_funcs->enable_clock_gating(hdev);
+ if (*memory_wrapper_idx == 0xFF) {
+ dev_err(hdev->dev, "ECC error information cannot be found\n");
+ rc = -EINVAL;
+ goto enable_clk_gate;
+ }
+
+ WREG32(params->block_address + GAUDI_ECC_MEM_SEL_OFFSET,
+ *memory_wrapper_idx);
+
+ *ecc_address =
+ RREG32(params->block_address + GAUDI_ECC_ADDRESS_OFFSET);
+ *ecc_syndrom =
+ RREG32(params->block_address + GAUDI_ECC_SYNDROME_OFFSET);
+
+ /* Clear error indication */
+ reg = RREG32(params->block_address + GAUDI_ECC_MEM_INFO_CLR_OFFSET);
+ if (params->derr)
+ reg |= FIELD_PREP(GAUDI_ECC_MEM_INFO_CLR_DERR_MASK, 1);
+ else
+ reg |= FIELD_PREP(GAUDI_ECC_MEM_INFO_CLR_SERR_MASK, 1);
+
+ WREG32(params->block_address + GAUDI_ECC_MEM_INFO_CLR_OFFSET, reg);
+
+enable_clk_gate:
+ if (params->disable_clock_gating) {
+ hdev->asic_funcs->set_clock_gating(hdev);
+
mutex_unlock(&gaudi->clk_gate_mutex);
}
+
+ return rc;
}
static void gaudi_handle_qman_err_generic(struct hl_device *hdev,
@@ -5286,239 +5308,99 @@ static void gaudi_handle_qman_err_generic(struct hl_device *hdev,
}
}
-static void gaudi_print_ecc_info(struct hl_device *hdev, u16 event_type)
+static void gaudi_handle_ecc_event(struct hl_device *hdev, u16 event_type,
+ struct hl_eq_ecc_data *ecc_data)
{
- u64 block_address;
- u8 index;
- int num_memories;
- char desc[32];
- bool derr;
- bool disable_clock_gating;
+ struct ecc_info_extract_params params;
+ u64 ecc_address = 0, ecc_syndrom = 0;
+ u8 index, memory_wrapper_idx = 0;
+ bool extract_info_from_fw;
+ int rc;
switch (event_type) {
- case GAUDI_EVENT_PCIE_CORE_SERR:
- snprintf(desc, ARRAY_SIZE(desc), "%s", "PCIE_CORE");
- block_address = mmPCIE_CORE_BASE;
- num_memories = 51;
- derr = false;
- disable_clock_gating = false;
- break;
- case GAUDI_EVENT_PCIE_CORE_DERR:
- snprintf(desc, ARRAY_SIZE(desc), "%s", "PCIE_CORE");
- block_address = mmPCIE_CORE_BASE;
- num_memories = 51;
- derr = true;
- disable_clock_gating = false;
- break;
- case GAUDI_EVENT_PCIE_IF_SERR:
- snprintf(desc, ARRAY_SIZE(desc), "%s", "PCIE_WRAP");
- block_address = mmPCIE_WRAP_BASE;
- num_memories = 11;
- derr = false;
- disable_clock_gating = false;
- break;
- case GAUDI_EVENT_PCIE_IF_DERR:
- snprintf(desc, ARRAY_SIZE(desc), "%s", "PCIE_WRAP");
- block_address = mmPCIE_WRAP_BASE;
- num_memories = 11;
- derr = true;
- disable_clock_gating = false;
- break;
- case GAUDI_EVENT_PCIE_PHY_SERR:
- snprintf(desc, ARRAY_SIZE(desc), "%s", "PCIE_PHY");
- block_address = mmPCIE_PHY_BASE;
- num_memories = 4;
- derr = false;
- disable_clock_gating = false;
- break;
- case GAUDI_EVENT_PCIE_PHY_DERR:
- snprintf(desc, ARRAY_SIZE(desc), "%s", "PCIE_PHY");
- block_address = mmPCIE_PHY_BASE;
- num_memories = 4;
- derr = true;
- disable_clock_gating = false;
+ case GAUDI_EVENT_PCIE_CORE_SERR ... GAUDI_EVENT_PCIE_PHY_DERR:
+ case GAUDI_EVENT_DMA0_SERR_ECC ... GAUDI_EVENT_MMU_DERR:
+ extract_info_from_fw = true;
break;
case GAUDI_EVENT_TPC0_SERR ... GAUDI_EVENT_TPC7_SERR:
index = event_type - GAUDI_EVENT_TPC0_SERR;
- block_address = mmTPC0_CFG_BASE + index * TPC_CFG_OFFSET;
- snprintf(desc, ARRAY_SIZE(desc), "%s%d", "TPC", index);
- num_memories = 90;
- derr = false;
- disable_clock_gating = true;
+ params.block_address = mmTPC0_CFG_BASE + index * TPC_CFG_OFFSET;
+ params.num_memories = 90;
+ params.derr = false;
+ params.disable_clock_gating = true;
+ extract_info_from_fw = false;
break;
case GAUDI_EVENT_TPC0_DERR ... GAUDI_EVENT_TPC7_DERR:
index = event_type - GAUDI_EVENT_TPC0_DERR;
- block_address =
+ params.block_address =
mmTPC0_CFG_BASE + index * TPC_CFG_OFFSET;
- snprintf(desc, ARRAY_SIZE(desc), "%s%d", "TPC", index);
- num_memories = 90;
- derr = true;
- disable_clock_gating = true;
+ params.num_memories = 90;
+ params.derr = true;
+ params.disable_clock_gating = true;
+ extract_info_from_fw = false;
break;
case GAUDI_EVENT_MME0_ACC_SERR:
case GAUDI_EVENT_MME1_ACC_SERR:
case GAUDI_EVENT_MME2_ACC_SERR:
case GAUDI_EVENT_MME3_ACC_SERR:
index = (event_type - GAUDI_EVENT_MME0_ACC_SERR) / 4;
- block_address = mmMME0_ACC_BASE + index * MME_ACC_OFFSET;
- snprintf(desc, ARRAY_SIZE(desc), "MME%d_ACC", index);
- num_memories = 128;
- derr = false;
- disable_clock_gating = true;
+ params.block_address = mmMME0_ACC_BASE + index * MME_ACC_OFFSET;
+ params.num_memories = 128;
+ params.derr = false;
+ params.disable_clock_gating = true;
+ extract_info_from_fw = false;
break;
case GAUDI_EVENT_MME0_ACC_DERR:
case GAUDI_EVENT_MME1_ACC_DERR:
case GAUDI_EVENT_MME2_ACC_DERR:
case GAUDI_EVENT_MME3_ACC_DERR:
index = (event_type - GAUDI_EVENT_MME0_ACC_DERR) / 4;
- block_address = mmMME0_ACC_BASE + index * MME_ACC_OFFSET;
- snprintf(desc, ARRAY_SIZE(desc), "MME%d_ACC", index);
- num_memories = 128;
- derr = true;
- disable_clock_gating = true;
+ params.block_address = mmMME0_ACC_BASE + index * MME_ACC_OFFSET;
+ params.num_memories = 128;
+ params.derr = true;
+ params.disable_clock_gating = true;
+ extract_info_from_fw = false;
break;
case GAUDI_EVENT_MME0_SBAB_SERR:
case GAUDI_EVENT_MME1_SBAB_SERR:
case GAUDI_EVENT_MME2_SBAB_SERR:
case GAUDI_EVENT_MME3_SBAB_SERR:
index = (event_type - GAUDI_EVENT_MME0_SBAB_SERR) / 4;
- block_address = mmMME0_SBAB_BASE + index * MME_ACC_OFFSET;
- snprintf(desc, ARRAY_SIZE(desc), "MME%d_SBAB", index);
- num_memories = 33;
- derr = false;
- disable_clock_gating = true;
+ params.block_address =
+ mmMME0_SBAB_BASE + index * MME_ACC_OFFSET;
+ params.num_memories = 33;
+ params.derr = false;
+ params.disable_clock_gating = true;
+ extract_info_from_fw = false;
break;
case GAUDI_EVENT_MME0_SBAB_DERR:
case GAUDI_EVENT_MME1_SBAB_DERR:
case GAUDI_EVENT_MME2_SBAB_DERR:
case GAUDI_EVENT_MME3_SBAB_DERR:
index = (event_type - GAUDI_EVENT_MME0_SBAB_DERR) / 4;
- block_address = mmMME0_SBAB_BASE + index * MME_ACC_OFFSET;
- snprintf(desc, ARRAY_SIZE(desc), "MME%d_SBAB", index);
- num_memories = 33;
- derr = true;
- disable_clock_gating = true;
- break;
- case GAUDI_EVENT_DMA0_SERR_ECC ... GAUDI_EVENT_DMA7_SERR_ECC:
- index = event_type - GAUDI_EVENT_DMA0_SERR_ECC;
- block_address = mmDMA0_CORE_BASE + index * DMA_CORE_OFFSET;
- snprintf(desc, ARRAY_SIZE(desc), "DMA%d_CORE", index);
- num_memories = 16;
- derr = false;
- disable_clock_gating = false;
- break;
- case GAUDI_EVENT_DMA0_DERR_ECC ... GAUDI_EVENT_DMA7_DERR_ECC:
- index = event_type - GAUDI_EVENT_DMA0_DERR_ECC;
- block_address = mmDMA0_CORE_BASE + index * DMA_CORE_OFFSET;
- snprintf(desc, ARRAY_SIZE(desc), "DMA%d_CORE", index);
- num_memories = 16;
- derr = true;
- disable_clock_gating = false;
- break;
- case GAUDI_EVENT_CPU_IF_ECC_SERR:
- block_address = mmCPU_IF_BASE;
- snprintf(desc, ARRAY_SIZE(desc), "%s", "CPU");
- num_memories = 4;
- derr = false;
- disable_clock_gating = false;
- break;
- case GAUDI_EVENT_CPU_IF_ECC_DERR:
- block_address = mmCPU_IF_BASE;
- snprintf(desc, ARRAY_SIZE(desc), "%s", "CPU");
- num_memories = 4;
- derr = true;
- disable_clock_gating = false;
- break;
- case GAUDI_EVENT_PSOC_MEM_SERR:
- block_address = mmPSOC_GLOBAL_CONF_BASE;
- snprintf(desc, ARRAY_SIZE(desc), "%s", "CPU");
- num_memories = 4;
- derr = false;
- disable_clock_gating = false;
- break;
- case GAUDI_EVENT_PSOC_MEM_DERR:
- block_address = mmPSOC_GLOBAL_CONF_BASE;
- snprintf(desc, ARRAY_SIZE(desc), "%s", "CPU");
- num_memories = 4;
- derr = true;
- disable_clock_gating = false;
- break;
- case GAUDI_EVENT_PSOC_CORESIGHT_SERR:
- block_address = mmPSOC_CS_TRACE_BASE;
- snprintf(desc, ARRAY_SIZE(desc), "%s", "CPU");
- num_memories = 2;
- derr = false;
- disable_clock_gating = false;
- break;
- case GAUDI_EVENT_PSOC_CORESIGHT_DERR:
- block_address = mmPSOC_CS_TRACE_BASE;
- snprintf(desc, ARRAY_SIZE(desc), "%s", "CPU");
- num_memories = 2;
- derr = true;
- disable_clock_gating = false;
- break;
- case GAUDI_EVENT_SRAM0_SERR ... GAUDI_EVENT_SRAM28_SERR:
- index = event_type - GAUDI_EVENT_SRAM0_SERR;
- block_address =
- mmSRAM_Y0_X0_BANK_BASE + index * SRAM_BANK_OFFSET;
- snprintf(desc, ARRAY_SIZE(desc), "SRAM%d", index);
- num_memories = 2;
- derr = false;
- disable_clock_gating = false;
- break;
- case GAUDI_EVENT_SRAM0_DERR ... GAUDI_EVENT_SRAM28_DERR:
- index = event_type - GAUDI_EVENT_SRAM0_DERR;
- block_address =
- mmSRAM_Y0_X0_BANK_BASE + index * SRAM_BANK_OFFSET;
- snprintf(desc, ARRAY_SIZE(desc), "SRAM%d", index);
- num_memories = 2;
- derr = true;
- disable_clock_gating = false;
- break;
- case GAUDI_EVENT_DMA_IF0_SERR ... GAUDI_EVENT_DMA_IF3_SERR:
- index = event_type - GAUDI_EVENT_DMA_IF0_SERR;
- block_address = mmDMA_IF_W_S_BASE +
- index * (mmDMA_IF_E_S_BASE - mmDMA_IF_W_S_BASE);
- snprintf(desc, ARRAY_SIZE(desc), "DMA_IF%d", index);
- num_memories = 60;
- derr = false;
- disable_clock_gating = false;
- break;
- case GAUDI_EVENT_DMA_IF0_DERR ... GAUDI_EVENT_DMA_IF3_DERR:
- index = event_type - GAUDI_EVENT_DMA_IF0_DERR;
- block_address = mmDMA_IF_W_S_BASE +
- index * (mmDMA_IF_E_S_BASE - mmDMA_IF_W_S_BASE);
- snprintf(desc, ARRAY_SIZE(desc), "DMA_IF%d", index);
- derr = true;
- num_memories = 60;
- disable_clock_gating = false;
- break;
- case GAUDI_EVENT_HBM_0_SERR ... GAUDI_EVENT_HBM_3_SERR:
- index = event_type - GAUDI_EVENT_HBM_0_SERR;
- /* HBM Registers are at different offsets */
- block_address = mmHBM0_BASE + 0x8000 +
- index * (mmHBM1_BASE - mmHBM0_BASE);
- snprintf(desc, ARRAY_SIZE(desc), "HBM%d", index);
- derr = false;
- num_memories = 64;
- disable_clock_gating = false;
- break;
- case GAUDI_EVENT_HBM_0_DERR ... GAUDI_EVENT_HBM_3_DERR:
- index = event_type - GAUDI_EVENT_HBM_0_SERR;
- /* HBM Registers are at different offsets */
- block_address = mmHBM0_BASE + 0x8000 +
- index * (mmHBM1_BASE - mmHBM0_BASE);
- snprintf(desc, ARRAY_SIZE(desc), "HBM%d", index);
- derr = true;
- num_memories = 64;
- disable_clock_gating = false;
- break;
+ params.block_address =
+ mmMME0_SBAB_BASE + index * MME_ACC_OFFSET;
+ params.num_memories = 33;
+ params.derr = true;
+ params.disable_clock_gating = true;
default:
return;
}
- gaudi_print_ecc_info_generic(hdev, desc, block_address, num_memories,
- derr, disable_clock_gating);
+ if (extract_info_from_fw) {
+ ecc_address = le64_to_cpu(ecc_data->ecc_address);
+ ecc_syndrom = le64_to_cpu(ecc_data->ecc_syndrom);
+ memory_wrapper_idx = ecc_data->memory_wrapper_idx;
+ } else {
+ rc = gaudi_extract_ecc_info(hdev, &params, &ecc_address,
+ &ecc_syndrom, &memory_wrapper_idx);
+ if (rc)
+ return;
+ }
+
+ dev_err(hdev->dev,
+ "ECC error detected. address: %#llx. Syndrom: %#llx. block id %u\n",
+ ecc_address, ecc_syndrom, memory_wrapper_idx);
}
static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type)
@@ -5568,8 +5450,6 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n",
event_type, desc);
- gaudi_print_ecc_info(hdev, event_type);
-
if (razwi) {
gaudi_print_razwi_info(hdev);
gaudi_print_mmu_error_info(hdev);
@@ -5718,7 +5598,7 @@ static bool gaudi_tpc_read_interrupts(struct hl_device *hdev, u8 tpc_id,
/* Clear interrupts */
WREG32(mmTPC0_CFG_TPC_INTR_CAUSE + tpc_offset, 0);
- hdev->asic_funcs->enable_clock_gating(hdev);
+ hdev->asic_funcs->set_clock_gating(hdev);
mutex_unlock(&gaudi->clk_gate_mutex);
@@ -5799,10 +5679,15 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
case GAUDI_EVENT_PSOC_CORESIGHT_DERR:
case GAUDI_EVENT_SRAM0_DERR ... GAUDI_EVENT_SRAM28_DERR:
case GAUDI_EVENT_DMA_IF0_DERR ... GAUDI_EVENT_DMA_IF3_DERR:
- fallthrough;
- case GAUDI_EVENT_GIC500:
case GAUDI_EVENT_HBM_0_DERR ... GAUDI_EVENT_HBM_3_DERR:
case GAUDI_EVENT_MMU_DERR:
+ gaudi_print_irq_info(hdev, event_type, true);
+ gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
+ if (hdev->hard_reset_on_fw_events)
+ hl_device_reset(hdev, true, false);
+ break;
+
+ case GAUDI_EVENT_GIC500:
case GAUDI_EVENT_AXI_ECC:
case GAUDI_EVENT_L2_RAM_ECC:
case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17:
@@ -5898,6 +5783,11 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
case GAUDI_EVENT_HBM_0_SERR ... GAUDI_EVENT_HBM_3_SERR:
fallthrough;
case GAUDI_EVENT_MMU_SERR:
+ gaudi_print_irq_info(hdev, event_type, true);
+ gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
+ hl_fw_unmask_irq(hdev, event_type);
+ break;
+
case GAUDI_EVENT_PCIE_DEC:
case GAUDI_EVENT_MME0_WBC_RSP:
case GAUDI_EVENT_MME0_SBAB0_RSP:
@@ -5994,6 +5884,8 @@ static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
mutex_lock(&hdev->mmu_cache_lock);
/* L0 & L1 invalidation */
+ WREG32(mmSTLB_INV_PS, 3);
+ WREG32(mmSTLB_CACHE_INV, gaudi->mmu_cache_inv_pi++);
WREG32(mmSTLB_INV_PS, 2);
rc = hl_poll_timeout(
@@ -6232,7 +6124,7 @@ static bool gaudi_is_device_idle(struct hl_device *hdev, u32 *mask,
if (s)
seq_puts(s, "\n");
- hdev->asic_funcs->enable_clock_gating(hdev);
+ hdev->asic_funcs->set_clock_gating(hdev);
mutex_unlock(&gaudi->clk_gate_mutex);
@@ -6333,7 +6225,7 @@ static int gaudi_run_tpc_kernel(struct hl_device *hdev, u64 tpc_kernel,
dev_err(hdev->dev,
"Timeout while waiting for TPC%d icache prefetch\n",
tpc_id);
- hdev->asic_funcs->enable_clock_gating(hdev);
+ hdev->asic_funcs->set_clock_gating(hdev);
mutex_unlock(&gaudi->clk_gate_mutex);
return -EIO;
}
@@ -6362,7 +6254,7 @@ static int gaudi_run_tpc_kernel(struct hl_device *hdev, u64 tpc_kernel,
1000,
kernel_timeout);
- hdev->asic_funcs->enable_clock_gating(hdev);
+ hdev->asic_funcs->set_clock_gating(hdev);
mutex_unlock(&gaudi->clk_gate_mutex);
if (rc) {
@@ -6380,47 +6272,14 @@ static enum hl_device_hw_state gaudi_get_hw_state(struct hl_device *hdev)
return RREG32(mmHW_STATE);
}
-static u32 gaudi_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx)
-{
- return gaudi_cq_assignment[cq_idx];
-}
-
-static void gaudi_ext_queue_init(struct hl_device *hdev, u32 q_idx)
+static int gaudi_ctx_init(struct hl_ctx *ctx)
{
- struct gaudi_device *gaudi = hdev->asic_specific;
- struct hl_hw_queue *hw_queue = &hdev->kernel_queues[q_idx];
- struct hl_hw_sob *hw_sob;
- int sob, ext_idx = gaudi->ext_queue_idx++;
-
- /*
- * The external queues might not sit sequentially, hence use the
- * real external queue index for the SOB/MON base id.
- */
- hw_queue->base_sob_id = ext_idx * HL_RSVD_SOBS;
- hw_queue->base_mon_id = ext_idx * HL_RSVD_MONS;
- hw_queue->next_sob_val = 1;
- hw_queue->curr_sob_offset = 0;
-
- for (sob = 0 ; sob < HL_RSVD_SOBS ; sob++) {
- hw_sob = &hw_queue->hw_sob[sob];
- hw_sob->hdev = hdev;
- hw_sob->sob_id = hw_queue->base_sob_id + sob;
- hw_sob->q_idx = q_idx;
- kref_init(&hw_sob->kref);
- }
+ return 0;
}
-static void gaudi_ext_queue_reset(struct hl_device *hdev, u32 q_idx)
+static u32 gaudi_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx)
{
- struct hl_hw_queue *hw_queue = &hdev->kernel_queues[q_idx];
-
- /*
- * In case we got here due to a stuck CS, the refcnt might be bigger
- * than 1 and therefore we reset it.
- */
- kref_init(&hw_queue->hw_sob[hw_queue->curr_sob_offset].kref);
- hw_queue->curr_sob_offset = 0;
- hw_queue->next_sob_val = 1;
+ return gaudi_cq_assignment[cq_idx];
}
static u32 gaudi_get_signal_cb_size(struct hl_device *hdev)
@@ -6445,16 +6304,17 @@ static void gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id)
pkt = (struct packet_msg_short *) (uintptr_t) cb->kernel_address;
memset(pkt, 0, sizeof(*pkt));
- value = 1 << GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_SHIFT; /* inc by 1 */
- value |= 1 << GAUDI_PKT_SHORT_VAL_SOB_MOD_SHIFT; /* add mode */
+ /* Inc by 1, Mode ADD */
+ value = FIELD_PREP(GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_MASK, 1);
+ value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_SOB_MOD_MASK, 1);
- ctl = (sob_id * 4) << GAUDI_PKT_SHORT_CTL_ADDR_SHIFT; /* SOB id */
- ctl |= 0 << GAUDI_PKT_SHORT_CTL_OP_SHIFT; /* write the value */
- ctl |= 3 << GAUDI_PKT_SHORT_CTL_BASE_SHIFT; /* W_S SOB base */
- ctl |= PACKET_MSG_SHORT << GAUDI_PKT_SHORT_CTL_OPCODE_SHIFT;
- ctl |= 1 << GAUDI_PKT_SHORT_CTL_EB_SHIFT;
- ctl |= 1 << GAUDI_PKT_SHORT_CTL_RB_SHIFT;
- ctl |= 1 << GAUDI_PKT_SHORT_CTL_MB_SHIFT;
+ ctl = FIELD_PREP(GAUDI_PKT_SHORT_CTL_ADDR_MASK, sob_id * 4);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OP_MASK, 0); /* write the value */
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_BASE_MASK, 3); /* W_S SOB base */
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OPCODE_MASK, PACKET_MSG_SHORT);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_EB_MASK, 1);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_RB_MASK, 1);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_MB_MASK, 1);
pkt->value = cpu_to_le32(value);
pkt->ctl = cpu_to_le32(ctl);
@@ -6467,12 +6327,12 @@ static u32 gaudi_add_mon_msg_short(struct packet_msg_short *pkt, u32 value,
memset(pkt, 0, pkt_size);
- ctl = addr << GAUDI_PKT_SHORT_CTL_ADDR_SHIFT;
- ctl |= 2 << GAUDI_PKT_SHORT_CTL_BASE_SHIFT; /* W_S MON base */
- ctl |= PACKET_MSG_SHORT << GAUDI_PKT_SHORT_CTL_OPCODE_SHIFT;
- ctl |= 0 << GAUDI_PKT_SHORT_CTL_EB_SHIFT;
- ctl |= 1 << GAUDI_PKT_SHORT_CTL_RB_SHIFT;
- ctl |= 0 << GAUDI_PKT_SHORT_CTL_MB_SHIFT; /* only last pkt needs MB */
+ ctl = FIELD_PREP(GAUDI_PKT_SHORT_CTL_ADDR_MASK, addr);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_BASE_MASK, 2); /* W_S MON base */
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OPCODE_MASK, PACKET_MSG_SHORT);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_EB_MASK, 0);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_RB_MASK, 1);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_MB_MASK, 0); /* last pkt MB */
pkt->value = cpu_to_le32(value);
pkt->ctl = cpu_to_le32(ctl);
@@ -6488,18 +6348,19 @@ static u32 gaudi_add_arm_monitor_pkt(struct packet_msg_short *pkt, u16 sob_id,
memset(pkt, 0, pkt_size);
- value = (sob_id / 8) << GAUDI_PKT_SHORT_VAL_MON_SYNC_GID_SHIFT;
- value |= sob_val << GAUDI_PKT_SHORT_VAL_MON_SYNC_VAL_SHIFT;
- value |= 0 << GAUDI_PKT_SHORT_VAL_MON_MODE_SHIFT; /* GREATER_OR_EQUAL */
- value |= mask << GAUDI_PKT_SHORT_VAL_MON_MASK_SHIFT;
+ value = FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_SYNC_GID_MASK, sob_id / 8);
+ value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_SYNC_VAL_MASK, sob_val);
+ value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_MODE_MASK,
+ 0); /* GREATER OR EQUAL*/
+ value |= FIELD_PREP(GAUDI_PKT_SHORT_VAL_MON_MASK_MASK, mask);
- ctl = addr << GAUDI_PKT_SHORT_CTL_ADDR_SHIFT;
- ctl |= 0 << GAUDI_PKT_SHORT_CTL_OP_SHIFT; /* write the value */
- ctl |= 2 << GAUDI_PKT_SHORT_CTL_BASE_SHIFT; /* W_S MON base */
- ctl |= PACKET_MSG_SHORT << GAUDI_PKT_SHORT_CTL_OPCODE_SHIFT;
- ctl |= 0 << GAUDI_PKT_SHORT_CTL_EB_SHIFT;
- ctl |= 1 << GAUDI_PKT_SHORT_CTL_RB_SHIFT;
- ctl |= 1 << GAUDI_PKT_SHORT_CTL_MB_SHIFT;
+ ctl = FIELD_PREP(GAUDI_PKT_SHORT_CTL_ADDR_MASK, addr);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OP_MASK, 0); /* write the value */
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_BASE_MASK, 2); /* W_S MON base */
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OPCODE_MASK, PACKET_MSG_SHORT);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_EB_MASK, 0);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_RB_MASK, 1);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_MB_MASK, 1);
pkt->value = cpu_to_le32(value);
pkt->ctl = cpu_to_le32(ctl);
@@ -6513,15 +6374,14 @@ static u32 gaudi_add_fence_pkt(struct packet_fence *pkt)
memset(pkt, 0, pkt_size);
- cfg = 1 << GAUDI_PKT_FENCE_CFG_DEC_VAL_SHIFT;
- cfg |= 1 << GAUDI_PKT_FENCE_CFG_TARGET_VAL_SHIFT;
- cfg |= 2 << GAUDI_PKT_FENCE_CFG_ID_SHIFT;
+ cfg = FIELD_PREP(GAUDI_PKT_FENCE_CFG_DEC_VAL_MASK, 1);
+ cfg |= FIELD_PREP(GAUDI_PKT_FENCE_CFG_TARGET_VAL_MASK, 1);
+ cfg |= FIELD_PREP(GAUDI_PKT_FENCE_CFG_ID_MASK, 2);
- ctl = 0 << GAUDI_PKT_FENCE_CTL_PRED_SHIFT;
- ctl |= PACKET_FENCE << GAUDI_PKT_FENCE_CTL_OPCODE_SHIFT;
- ctl |= 0 << GAUDI_PKT_FENCE_CTL_EB_SHIFT;
- ctl |= 1 << GAUDI_PKT_FENCE_CTL_RB_SHIFT;
- ctl |= 1 << GAUDI_PKT_FENCE_CTL_MB_SHIFT;
+ ctl = FIELD_PREP(GAUDI_PKT_FENCE_CTL_OPCODE_MASK, PACKET_FENCE);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_EB_MASK, 0);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_RB_MASK, 1);
+ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_MB_MASK, 1);
pkt->cfg = cpu_to_le32(cfg);
pkt->ctl = cpu_to_le32(ctl);
@@ -6703,7 +6563,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
.mmu_invalidate_cache = gaudi_mmu_invalidate_cache,
.mmu_invalidate_cache_range = gaudi_mmu_invalidate_cache_range,
.send_heartbeat = gaudi_send_heartbeat,
- .enable_clock_gating = gaudi_enable_clock_gating,
+ .set_clock_gating = gaudi_set_clock_gating,
.disable_clock_gating = gaudi_disable_clock_gating,
.debug_coresight = gaudi_debug_coresight,
.is_device_idle = gaudi_is_device_idle,
@@ -6720,13 +6580,12 @@ static const struct hl_asic_funcs gaudi_funcs = {
.rreg = hl_rreg,
.wreg = hl_wreg,
.halt_coresight = gaudi_halt_coresight,
+ .ctx_init = gaudi_ctx_init,
.get_clk_rate = gaudi_get_clk_rate,
.get_queue_id_for_cq = gaudi_get_queue_id_for_cq,
.read_device_fw_version = gaudi_read_device_fw_version,
.load_firmware_to_device = gaudi_load_firmware_to_device,
.load_boot_fit_to_device = gaudi_load_boot_fit_to_device,
- .ext_queue_init = gaudi_ext_queue_init,
- .ext_queue_reset = gaudi_ext_queue_reset,
.get_signal_cb_size = gaudi_get_signal_cb_size,
.get_wait_cb_size = gaudi_get_wait_cb_size,
.gen_signal_cb = gaudi_gen_signal_cb,
@@ -6739,7 +6598,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
/**
* gaudi_set_asic_funcs - set GAUDI function pointers
*
- * @*hdev: pointer to hl_device structure
+ * @hdev: pointer to hl_device structure
*
*/
void gaudi_set_asic_funcs(struct hl_device *hdev)
diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h
index a46530d375fa..5dc99f6f0296 100644
--- a/drivers/misc/habanalabs/gaudi/gaudiP.h
+++ b/drivers/misc/habanalabs/gaudi/gaudiP.h
@@ -9,11 +9,11 @@
#define GAUDIP_H_
#include <uapi/misc/habanalabs.h>
-#include "habanalabs.h"
-#include "include/hl_boot_if.h"
-#include "include/gaudi/gaudi_packets.h"
-#include "include/gaudi/gaudi.h"
-#include "include/gaudi/gaudi_async_events.h"
+#include "../common/habanalabs.h"
+#include "../include/common/hl_boot_if.h"
+#include "../include/gaudi/gaudi_packets.h"
+#include "../include/gaudi/gaudi.h"
+#include "../include/gaudi/gaudi_async_events.h"
#define NUMBER_OF_EXT_HW_QUEUES 12
#define NUMBER_OF_CMPLT_QUEUES NUMBER_OF_EXT_HW_QUEUES
@@ -57,6 +57,12 @@
#define GAUDI_DEFAULT_CARD_NAME "HL2000"
+#define GAUDI_MAX_PENDING_CS 1024
+
+#if !IS_MAX_PENDING_CS_VALID(GAUDI_MAX_PENDING_CS)
+#error "GAUDI_MAX_PENDING_CS must be power of 2 and greater than 1"
+#endif
+
#define PCI_DMA_NUMBER_OF_CHNLS 3
#define HBM_DMA_NUMBER_OF_CHNLS 5
#define DMA_NUMBER_OF_CHNLS (PCI_DMA_NUMBER_OF_CHNLS + \
@@ -117,14 +123,14 @@
/* Internal QMANs PQ sizes */
-#define MME_QMAN_LENGTH 64
+#define MME_QMAN_LENGTH 1024
#define MME_QMAN_SIZE_IN_BYTES (MME_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE)
-#define HBM_DMA_QMAN_LENGTH 64
+#define HBM_DMA_QMAN_LENGTH 1024
#define HBM_DMA_QMAN_SIZE_IN_BYTES \
(HBM_DMA_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE)
-#define TPC_QMAN_LENGTH 64
+#define TPC_QMAN_LENGTH 1024
#define TPC_QMAN_SIZE_IN_BYTES (TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE)
#define SRAM_USER_BASE_OFFSET GAUDI_DRIVER_SRAM_RESERVED_SIZE_FROM_START
@@ -228,7 +234,8 @@ struct gaudi_internal_qman_info {
* engine.
* @multi_msi_mode: whether we are working in multi MSI single MSI mode.
* Multi MSI is possible only with IOMMU enabled.
- * @ext_queue_idx: helper index for external queues initialization.
+ * @mmu_cache_inv_pi: PI for MMU cache invalidation flow. The H/W expects an
+ * 8-bit value so use u8.
*/
struct gaudi_device {
int (*armcp_info_get)(struct hl_device *hdev);
@@ -247,7 +254,7 @@ struct gaudi_device {
u32 events_stat_aggregate[GAUDI_EVENT_SIZE];
u32 hw_cap_initialized;
u8 multi_msi_mode;
- u8 ext_queue_idx;
+ u8 mmu_cache_inv_pi;
};
void gaudi_init_security(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c
index bf0e062d7b87..5673ee49819e 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c
@@ -6,9 +6,9 @@
*/
#include "gaudiP.h"
-#include "include/gaudi/gaudi_coresight.h"
-#include "include/gaudi/asic_reg/gaudi_regs.h"
-#include "include/gaudi/gaudi_masks.h"
+#include "../include/gaudi/gaudi_coresight.h"
+#include "../include/gaudi/asic_reg/gaudi_regs.h"
+#include "../include/gaudi/gaudi_masks.h"
#include <uapi/misc/habanalabs.h>
#include <linux/coresight.h>
@@ -392,6 +392,7 @@ static int gaudi_config_stm(struct hl_device *hdev,
{
struct hl_debug_params_stm *input;
u64 base_reg;
+ u32 frequency;
int rc;
if (params->reg_idx >= ARRAY_SIZE(debug_stm_regs)) {
@@ -420,7 +421,10 @@ static int gaudi_config_stm(struct hl_device *hdev,
WREG32(base_reg + 0xE00, lower_32_bits(input->sp_mask));
WREG32(base_reg + 0xEF4, input->id);
WREG32(base_reg + 0xDF4, 0x80);
- WREG32(base_reg + 0xE8C, input->frequency);
+ frequency = hdev->asic_prop.psoc_timestamp_frequency;
+ if (frequency == 0)
+ frequency = input->frequency;
+ WREG32(base_reg + 0xE8C, frequency);
WREG32(base_reg + 0xE90, 0x7FF);
/* SW-2176 - SW WA for HW bug */
diff --git a/drivers/misc/habanalabs/gaudi/gaudi_hwmgr.c b/drivers/misc/habanalabs/gaudi/gaudi_hwmgr.c
index 6dd2c2a1cd70..1076b4932ce2 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi_hwmgr.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi_hwmgr.c
@@ -6,7 +6,7 @@
*/
#include "gaudiP.h"
-#include "include/gaudi/gaudi_fw_if.h"
+#include "../include/gaudi/gaudi_fw_if.h"
void gaudi_set_pll_profile(struct hl_device *hdev, enum hl_pll_frequency freq)
{
diff --git a/drivers/misc/habanalabs/gaudi/gaudi_security.c b/drivers/misc/habanalabs/gaudi/gaudi_security.c
index 6a351e31fa6a..8d5d6ddee6ed 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi_security.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi_security.c
@@ -6,7 +6,7 @@
*/
#include "gaudiP.h"
-#include "include/gaudi/asic_reg/gaudi_regs.h"
+#include "../include/gaudi/asic_reg/gaudi_regs.h"
#define GAUDI_NUMBER_OF_RR_REGS 24
#define GAUDI_NUMBER_OF_LBW_RANGES 12
@@ -447,8 +447,7 @@ static u64 gaudi_rr_hbw_mask_high_ar_regs[GAUDI_NUMBER_OF_RR_REGS] = {
* gaudi_set_block_as_protected - set the given block as protected
*
* @hdev: pointer to hl_device structure
- * @block: block base address
- *
+ * @base: block base address
*/
static void gaudi_pb_set_block(struct hl_device *hdev, u64 base)
{
diff --git a/drivers/misc/habanalabs/goya/Makefile b/drivers/misc/habanalabs/goya/Makefile
index bd769083628e..b3f3b7b96683 100644
--- a/drivers/misc/habanalabs/goya/Makefile
+++ b/drivers/misc/habanalabs/goya/Makefile
@@ -1,5 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
-subdir-ccflags-y += -I$(src)
-
HL_GOYA_FILES := goya/goya.o goya/goya_security.o goya/goya_hwmgr.o \
goya/goya_coresight.o
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 0d2952bb58df..85030759b2af 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -6,10 +6,10 @@
*/
#include "goyaP.h"
-#include "include/hw_ip/mmu/mmu_general.h"
-#include "include/hw_ip/mmu/mmu_v1_0.h"
-#include "include/goya/asic_reg/goya_masks.h"
-#include "include/goya/goya_reg_map.h"
+#include "../include/hw_ip/mmu/mmu_general.h"
+#include "../include/hw_ip/mmu/mmu_v1_0.h"
+#include "../include/goya/asic_reg/goya_masks.h"
+#include "../include/goya/goya_reg_map.h"
#include <linux/pci.h>
#include <linux/genalloc.h>
@@ -88,6 +88,7 @@
#define GOYA_PLDM_MMU_TIMEOUT_USEC (MMU_CONFIG_TIMEOUT_USEC * 100)
#define GOYA_PLDM_QMAN0_TIMEOUT_USEC (HL_DEVICE_TIMEOUT_USEC * 30)
#define GOYA_BOOT_FIT_REQ_TIMEOUT_USEC 1000000 /* 1s */
+#define GOYA_MSG_TO_CPU_TIMEOUT_USEC 4000000 /* 4s */
#define GOYA_QMAN0_FENCE_VAL 0xD169B243
@@ -337,11 +338,19 @@ static int goya_mmu_set_dram_default_page(struct hl_device *hdev);
static int goya_mmu_add_mappings_for_device_cpu(struct hl_device *hdev);
static void goya_mmu_prepare(struct hl_device *hdev, u32 asid);
-void goya_get_fixed_properties(struct hl_device *hdev)
+int goya_get_fixed_properties(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
int i;
+ prop->max_queues = GOYA_QUEUE_ID_SIZE;
+ prop->hw_queues_props = kcalloc(prop->max_queues,
+ sizeof(struct hw_queue_properties),
+ GFP_KERNEL);
+
+ if (!prop->hw_queues_props)
+ return -ENOMEM;
+
for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) {
prop->hw_queues_props[i].type = QUEUE_TYPE_EXT;
prop->hw_queues_props[i].driver_only = 0;
@@ -361,9 +370,6 @@ void goya_get_fixed_properties(struct hl_device *hdev)
prop->hw_queues_props[i].requires_kernel_cb = 0;
}
- for (; i < HL_MAX_QUEUES; i++)
- prop->hw_queues_props[i].type = QUEUE_TYPE_NA;
-
prop->completion_queues_count = NUMBER_OF_CMPLT_QUEUES;
prop->dram_base_address = DRAM_PHYS_BASE;
@@ -426,6 +432,10 @@ void goya_get_fixed_properties(struct hl_device *hdev)
strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME,
CARD_NAME_MAX_LEN);
+
+ prop->max_pending_cs = GOYA_MAX_PENDING_CS;
+
+ return 0;
}
/*
@@ -456,6 +466,7 @@ static int goya_pci_bars_map(struct hl_device *hdev)
static u64 goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr)
{
struct goya_device *goya = hdev->asic_specific;
+ struct hl_inbound_pci_region pci_region;
u64 old_addr = addr;
int rc;
@@ -463,7 +474,10 @@ static u64 goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr)
return old_addr;
/* Inbound Region 1 - Bar 4 - Point to DDR */
- rc = hl_pci_set_dram_bar_base(hdev, 1, 4, addr);
+ pci_region.mode = PCI_BAR_MATCH_MODE;
+ pci_region.bar = DDR_BAR_ID;
+ pci_region.addr = addr;
+ rc = hl_pci_set_inbound_region(hdev, 1, &pci_region);
if (rc)
return U64_MAX;
@@ -485,8 +499,35 @@ static u64 goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr)
*/
static int goya_init_iatu(struct hl_device *hdev)
{
- return hl_pci_init_iatu(hdev, SRAM_BASE_ADDR, DRAM_PHYS_BASE,
- HOST_PHYS_BASE, HOST_PHYS_SIZE);
+ struct hl_inbound_pci_region inbound_region;
+ struct hl_outbound_pci_region outbound_region;
+ int rc;
+
+ /* Inbound Region 0 - Bar 0 - Point to SRAM and CFG */
+ inbound_region.mode = PCI_BAR_MATCH_MODE;
+ inbound_region.bar = SRAM_CFG_BAR_ID;
+ inbound_region.addr = SRAM_BASE_ADDR;
+ rc = hl_pci_set_inbound_region(hdev, 0, &inbound_region);
+ if (rc)
+ goto done;
+
+ /* Inbound Region 1 - Bar 4 - Point to DDR */
+ inbound_region.mode = PCI_BAR_MATCH_MODE;
+ inbound_region.bar = DDR_BAR_ID;
+ inbound_region.addr = DRAM_PHYS_BASE;
+ rc = hl_pci_set_inbound_region(hdev, 1, &inbound_region);
+ if (rc)
+ goto done;
+
+ hdev->asic_funcs->set_dma_mask_from_fw(hdev);
+
+ /* Outbound Region 0 - Point to Host */
+ outbound_region.addr = HOST_PHYS_BASE;
+ outbound_region.size = HOST_PHYS_SIZE;
+ rc = hl_pci_set_outbound_region(hdev, &outbound_region);
+
+done:
+ return rc;
}
/*
@@ -507,7 +548,11 @@ static int goya_early_init(struct hl_device *hdev)
u32 val;
int rc;
- goya_get_fixed_properties(hdev);
+ rc = goya_get_fixed_properties(hdev);
+ if (rc) {
+ dev_err(hdev->dev, "Failed to get fixed properties\n");
+ return rc;
+ }
/* Check BAR sizes */
if (pci_resource_len(pdev, SRAM_CFG_BAR_ID) != CFG_BAR_SIZE) {
@@ -517,7 +562,8 @@ static int goya_early_init(struct hl_device *hdev)
(unsigned long long) pci_resource_len(pdev,
SRAM_CFG_BAR_ID),
CFG_BAR_SIZE);
- return -ENODEV;
+ rc = -ENODEV;
+ goto free_queue_props;
}
if (pci_resource_len(pdev, MSIX_BAR_ID) != MSIX_BAR_SIZE) {
@@ -527,14 +573,15 @@ static int goya_early_init(struct hl_device *hdev)
(unsigned long long) pci_resource_len(pdev,
MSIX_BAR_ID),
MSIX_BAR_SIZE);
- return -ENODEV;
+ rc = -ENODEV;
+ goto free_queue_props;
}
prop->dram_pci_bar_size = pci_resource_len(pdev, DDR_BAR_ID);
rc = hl_pci_init(hdev);
if (rc)
- return rc;
+ goto free_queue_props;
if (!hdev->pldm) {
val = RREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS);
@@ -544,6 +591,10 @@ static int goya_early_init(struct hl_device *hdev)
}
return 0;
+
+free_queue_props:
+ kfree(hdev->asic_prop.hw_queues_props);
+ return rc;
}
/*
@@ -556,6 +607,7 @@ static int goya_early_init(struct hl_device *hdev)
*/
static int goya_early_fini(struct hl_device *hdev)
{
+ kfree(hdev->asic_prop.hw_queues_props);
hl_pci_fini(hdev);
return 0;
@@ -592,11 +644,36 @@ static void goya_qman0_set_security(struct hl_device *hdev, bool secure)
static void goya_fetch_psoc_frequency(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
+ u32 trace_freq = 0;
+ u32 pll_clk = 0;
+ u32 div_fctr = RREG32(mmPSOC_PCI_PLL_DIV_FACTOR_1);
+ u32 div_sel = RREG32(mmPSOC_PCI_PLL_DIV_SEL_1);
+ u32 nr = RREG32(mmPSOC_PCI_PLL_NR);
+ u32 nf = RREG32(mmPSOC_PCI_PLL_NF);
+ u32 od = RREG32(mmPSOC_PCI_PLL_OD);
+
+ if (div_sel == DIV_SEL_REF_CLK || div_sel == DIV_SEL_DIVIDED_REF) {
+ if (div_sel == DIV_SEL_REF_CLK)
+ trace_freq = PLL_REF_CLK;
+ else
+ trace_freq = PLL_REF_CLK / (div_fctr + 1);
+ } else if (div_sel == DIV_SEL_PLL_CLK ||
+ div_sel == DIV_SEL_DIVIDED_PLL) {
+ pll_clk = PLL_REF_CLK * (nf + 1) / ((nr + 1) * (od + 1));
+ if (div_sel == DIV_SEL_PLL_CLK)
+ trace_freq = pll_clk;
+ else
+ trace_freq = pll_clk / (div_fctr + 1);
+ } else {
+ dev_warn(hdev->dev,
+ "Received invalid div select value: %d", div_sel);
+ }
- prop->psoc_pci_pll_nr = RREG32(mmPSOC_PCI_PLL_NR);
- prop->psoc_pci_pll_nf = RREG32(mmPSOC_PCI_PLL_NF);
- prop->psoc_pci_pll_od = RREG32(mmPSOC_PCI_PLL_OD);
- prop->psoc_pci_pll_div_factor = RREG32(mmPSOC_PCI_PLL_DIV_FACTOR_1);
+ prop->psoc_timestamp_frequency = trace_freq;
+ prop->psoc_pci_pll_nr = nr;
+ prop->psoc_pci_pll_nf = nf;
+ prop->psoc_pci_pll_od = od;
+ prop->psoc_pci_pll_div_factor = div_fctr;
}
int goya_late_init(struct hl_device *hdev)
@@ -2164,29 +2241,15 @@ static void goya_disable_timestamp(struct hl_device *hdev)
static void goya_halt_engines(struct hl_device *hdev, bool hard_reset)
{
- u32 wait_timeout_ms, cpu_timeout_ms;
+ u32 wait_timeout_ms;
dev_info(hdev->dev,
"Halting compute engines and disabling interrupts\n");
- if (hdev->pldm) {
+ if (hdev->pldm)
wait_timeout_ms = GOYA_PLDM_RESET_WAIT_MSEC;
- cpu_timeout_ms = GOYA_PLDM_RESET_WAIT_MSEC;
- } else {
+ else
wait_timeout_ms = GOYA_RESET_WAIT_MSEC;
- cpu_timeout_ms = GOYA_CPU_RESET_WAIT_MSEC;
- }
-
- if (hard_reset) {
- /*
- * I don't know what is the state of the CPU so make sure it is
- * stopped in any means necessary
- */
- WREG32(mmPSOC_GLOBAL_CONF_UBOOT_MAGIC, KMD_MSG_GOTO_WFE);
- WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
- GOYA_ASYNC_EVENT_ID_HALT_MACHINE);
- msleep(cpu_timeout_ms);
- }
goya_stop_external_queues(hdev);
goya_stop_internal_queues(hdev);
@@ -2491,14 +2554,26 @@ disable_queues:
static void goya_hw_fini(struct hl_device *hdev, bool hard_reset)
{
struct goya_device *goya = hdev->asic_specific;
- u32 reset_timeout_ms, status;
+ u32 reset_timeout_ms, cpu_timeout_ms, status;
- if (hdev->pldm)
+ if (hdev->pldm) {
reset_timeout_ms = GOYA_PLDM_RESET_TIMEOUT_MSEC;
- else
+ cpu_timeout_ms = GOYA_PLDM_RESET_WAIT_MSEC;
+ } else {
reset_timeout_ms = GOYA_RESET_TIMEOUT_MSEC;
+ cpu_timeout_ms = GOYA_CPU_RESET_WAIT_MSEC;
+ }
if (hard_reset) {
+ /* I don't know what is the state of the CPU so make sure it is
+ * stopped in any means necessary
+ */
+ WREG32(mmPSOC_GLOBAL_CONF_UBOOT_MAGIC, KMD_MSG_GOTO_WFE);
+ WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
+ GOYA_ASYNC_EVENT_ID_HALT_MACHINE);
+
+ msleep(cpu_timeout_ms);
+
goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE);
goya_disable_clk_rlx(hdev);
goya_set_pll_refclk(hdev);
@@ -2830,6 +2905,9 @@ int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
return 0;
}
+ if (!timeout)
+ timeout = GOYA_MSG_TO_CPU_TIMEOUT_USEC;
+
return hl_fw_send_cpu_message(hdev, GOYA_QUEUE_ID_CPU_PQ, msg, len,
timeout, result);
}
@@ -3697,9 +3775,8 @@ static int goya_parse_cb_mmu(struct hl_device *hdev,
parser->patched_cb_size = parser->user_cb_size +
sizeof(struct packet_msg_prot) * 2;
- rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr,
- parser->patched_cb_size,
- &patched_cb_handle, HL_KERNEL_ASID_ID);
+ rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size,
+ &patched_cb_handle, HL_KERNEL_ASID_ID, false);
if (rc) {
dev_err(hdev->dev,
@@ -3771,9 +3848,8 @@ static int goya_parse_cb_no_mmu(struct hl_device *hdev,
if (rc)
goto free_userptr;
- rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr,
- parser->patched_cb_size,
- &patched_cb_handle, HL_KERNEL_ASID_ID);
+ rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, parser->patched_cb_size,
+ &patched_cb_handle, HL_KERNEL_ASID_ID, false);
if (rc) {
dev_err(hdev->dev,
"Failed to allocate patched CB for DMA CS %d\n", rc);
@@ -3942,8 +4018,7 @@ static int goya_debugfs_read32(struct hl_device *hdev, u64 addr, u32 *val)
*val = readl(hdev->pcie_bar[SRAM_CFG_BAR_ID] +
(addr - SRAM_BASE_ADDR));
- } else if ((addr >= DRAM_PHYS_BASE) &&
- (addr < DRAM_PHYS_BASE + hdev->asic_prop.dram_size)) {
+ } else if (addr < DRAM_PHYS_BASE + hdev->asic_prop.dram_size) {
u64 bar_base_addr = DRAM_PHYS_BASE +
(addr & ~(prop->dram_pci_bar_size - 0x1ull));
@@ -3999,8 +4074,7 @@ static int goya_debugfs_write32(struct hl_device *hdev, u64 addr, u32 val)
writel(val, hdev->pcie_bar[SRAM_CFG_BAR_ID] +
(addr - SRAM_BASE_ADDR));
- } else if ((addr >= DRAM_PHYS_BASE) &&
- (addr < DRAM_PHYS_BASE + hdev->asic_prop.dram_size)) {
+ } else if (addr < DRAM_PHYS_BASE + hdev->asic_prop.dram_size) {
u64 bar_base_addr = DRAM_PHYS_BASE +
(addr & ~(prop->dram_pci_bar_size - 0x1ull));
@@ -4044,9 +4118,8 @@ static int goya_debugfs_read64(struct hl_device *hdev, u64 addr, u64 *val)
*val = readq(hdev->pcie_bar[SRAM_CFG_BAR_ID] +
(addr - SRAM_BASE_ADDR));
- } else if ((addr >= DRAM_PHYS_BASE) &&
- (addr <=
- DRAM_PHYS_BASE + hdev->asic_prop.dram_size - sizeof(u64))) {
+ } else if (addr <=
+ DRAM_PHYS_BASE + hdev->asic_prop.dram_size - sizeof(u64)) {
u64 bar_base_addr = DRAM_PHYS_BASE +
(addr & ~(prop->dram_pci_bar_size - 0x1ull));
@@ -4088,9 +4161,8 @@ static int goya_debugfs_write64(struct hl_device *hdev, u64 addr, u64 val)
writeq(val, hdev->pcie_bar[SRAM_CFG_BAR_ID] +
(addr - SRAM_BASE_ADDR));
- } else if ((addr >= DRAM_PHYS_BASE) &&
- (addr <=
- DRAM_PHYS_BASE + hdev->asic_prop.dram_size - sizeof(u64))) {
+ } else if (addr <=
+ DRAM_PHYS_BASE + hdev->asic_prop.dram_size - sizeof(u64)) {
u64 bar_base_addr = DRAM_PHYS_BASE +
(addr & ~(prop->dram_pci_bar_size - 0x1ull));
@@ -4431,8 +4503,8 @@ static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr,
pkt->armcp_pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
- rc = goya_send_cpu_message(hdev, (u32 *) pkt, total_pkt_size,
- HL_DEVICE_TIMEOUT_USEC, &result);
+ rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
+ total_pkt_size, 0, &result);
if (rc)
dev_err(hdev->dev, "failed to unmask IRQ array\n");
@@ -4464,8 +4536,8 @@ static int goya_unmask_irq(struct hl_device *hdev, u16 event_type)
ARMCP_PKT_CTL_OPCODE_SHIFT);
pkt.value = cpu_to_le64(event_type);
- rc = goya_send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- HL_DEVICE_TIMEOUT_USEC, &result);
+ rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
+ 0, &result);
if (rc)
dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);
@@ -4623,7 +4695,7 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size,
lin_dma_pkts_cnt = DIV_ROUND_UP_ULL(size, SZ_2G);
cb_size = lin_dma_pkts_cnt * sizeof(struct packet_lin_dma) +
sizeof(struct packet_msg_prot);
- cb = hl_cb_kernel_create(hdev, cb_size);
+ cb = hl_cb_kernel_create(hdev, cb_size, false);
if (!cb)
return -ENOMEM;
@@ -5028,14 +5100,14 @@ int goya_armcp_info_get(struct hl_device *hdev)
return 0;
}
-static void goya_enable_clock_gating(struct hl_device *hdev)
+static void goya_set_clock_gating(struct hl_device *hdev)
{
-
+ /* clock gating not supported in Goya */
}
static void goya_disable_clock_gating(struct hl_device *hdev)
{
-
+ /* clock gating not supported in Goya */
}
static bool goya_is_device_idle(struct hl_device *hdev, u32 *mask,
@@ -5153,19 +5225,14 @@ static enum hl_device_hw_state goya_get_hw_state(struct hl_device *hdev)
return RREG32(mmHW_STATE);
}
-u32 goya_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx)
-{
- return cq_idx;
-}
-
-static void goya_ext_queue_init(struct hl_device *hdev, u32 q_idx)
+static int goya_ctx_init(struct hl_ctx *ctx)
{
-
+ return 0;
}
-static void goya_ext_queue_reset(struct hl_device *hdev, u32 q_idx)
+u32 goya_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx)
{
-
+ return cq_idx;
}
static u32 goya_get_signal_cb_size(struct hl_device *hdev)
@@ -5259,7 +5326,7 @@ static const struct hl_asic_funcs goya_funcs = {
.mmu_invalidate_cache = goya_mmu_invalidate_cache,
.mmu_invalidate_cache_range = goya_mmu_invalidate_cache_range,
.send_heartbeat = goya_send_heartbeat,
- .enable_clock_gating = goya_enable_clock_gating,
+ .set_clock_gating = goya_set_clock_gating,
.disable_clock_gating = goya_disable_clock_gating,
.debug_coresight = goya_debug_coresight,
.is_device_idle = goya_is_device_idle,
@@ -5276,13 +5343,12 @@ static const struct hl_asic_funcs goya_funcs = {
.rreg = hl_rreg,
.wreg = hl_wreg,
.halt_coresight = goya_halt_coresight,
+ .ctx_init = goya_ctx_init,
.get_clk_rate = goya_get_clk_rate,
.get_queue_id_for_cq = goya_get_queue_id_for_cq,
.read_device_fw_version = goya_read_device_fw_version,
.load_firmware_to_device = goya_load_firmware_to_device,
.load_boot_fit_to_device = goya_load_boot_fit_to_device,
- .ext_queue_init = goya_ext_queue_init,
- .ext_queue_reset = goya_ext_queue_reset,
.get_signal_cb_size = goya_get_signal_cb_size,
.get_wait_cb_size = goya_get_wait_cb_size,
.gen_signal_cb = goya_gen_signal_cb,
diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h
index d36f8d90c9c9..bb7474ee9784 100644
--- a/drivers/misc/habanalabs/goya/goyaP.h
+++ b/drivers/misc/habanalabs/goya/goyaP.h
@@ -9,12 +9,12 @@
#define GOYAP_H_
#include <uapi/misc/habanalabs.h>
-#include "habanalabs.h"
-#include "include/hl_boot_if.h"
-#include "include/goya/goya_packets.h"
-#include "include/goya/goya.h"
-#include "include/goya/goya_async_events.h"
-#include "include/goya/goya_fw_if.h"
+#include "../common/habanalabs.h"
+#include "../include/common/hl_boot_if.h"
+#include "../include/goya/goya_packets.h"
+#include "../include/goya/goya.h"
+#include "../include/goya/goya_async_events.h"
+#include "../include/goya/goya_fw_if.h"
#define NUMBER_OF_CMPLT_QUEUES 5
#define NUMBER_OF_EXT_HW_QUEUES 5
@@ -31,10 +31,6 @@
*/
#define NUMBER_OF_INTERRUPTS (NUMBER_OF_CMPLT_QUEUES + 1)
-#if (NUMBER_OF_HW_QUEUES >= HL_MAX_QUEUES)
-#error "Number of H/W queues must be smaller than HL_MAX_QUEUES"
-#endif
-
#if (NUMBER_OF_INTERRUPTS > GOYA_MSIX_ENTRIES)
#error "Number of MSIX interrupts must be smaller or equal to GOYA_MSIX_ENTRIES"
#endif
@@ -57,6 +53,12 @@
#define GOYA_DEFAULT_CARD_NAME "HL1000"
+#define GOYA_MAX_PENDING_CS 64
+
+#if !IS_MAX_PENDING_CS_VALID(GOYA_MAX_PENDING_CS)
+#error "GOYA_MAX_PENDING_CS must be power of 2 and greater than 1"
+#endif
+
/* DRAM Memory Map */
#define CPU_FW_IMAGE_SIZE 0x10000000 /* 256MB */
@@ -164,7 +166,7 @@ struct goya_device {
u8 device_cpu_mmu_mappings_done;
};
-void goya_get_fixed_properties(struct hl_device *hdev);
+int goya_get_fixed_properties(struct hl_device *hdev);
int goya_mmu_init(struct hl_device *hdev);
void goya_init_dma_qmans(struct hl_device *hdev);
void goya_init_mme_qmans(struct hl_device *hdev);
diff --git a/drivers/misc/habanalabs/goya/goya_coresight.c b/drivers/misc/habanalabs/goya/goya_coresight.c
index 1258724ea510..b03912483de0 100644
--- a/drivers/misc/habanalabs/goya/goya_coresight.c
+++ b/drivers/misc/habanalabs/goya/goya_coresight.c
@@ -6,9 +6,9 @@
*/
#include "goyaP.h"
-#include "include/goya/goya_coresight.h"
-#include "include/goya/asic_reg/goya_regs.h"
-#include "include/goya/asic_reg/goya_masks.h"
+#include "../include/goya/goya_coresight.h"
+#include "../include/goya/asic_reg/goya_regs.h"
+#include "../include/goya/asic_reg/goya_masks.h"
#include <uapi/misc/habanalabs.h>
@@ -232,6 +232,7 @@ static int goya_config_stm(struct hl_device *hdev,
{
struct hl_debug_params_stm *input;
u64 base_reg;
+ u32 frequency;
int rc;
if (params->reg_idx >= ARRAY_SIZE(debug_stm_regs)) {
@@ -264,7 +265,10 @@ static int goya_config_stm(struct hl_device *hdev,
WREG32(base_reg + 0xE20, 0xFFFFFFFF);
WREG32(base_reg + 0xEF4, input->id);
WREG32(base_reg + 0xDF4, 0x80);
- WREG32(base_reg + 0xE8C, input->frequency);
+ frequency = hdev->asic_prop.psoc_timestamp_frequency;
+ if (frequency == 0)
+ frequency = input->frequency;
+ WREG32(base_reg + 0xE8C, frequency);
WREG32(base_reg + 0xE90, 0x7FF);
WREG32(base_reg + 0xE80, 0x27 | (input->id << 16));
} else {
@@ -640,7 +644,6 @@ static int goya_config_spmu(struct hl_device *hdev,
int goya_debug_coresight(struct hl_device *hdev, void *data)
{
struct hl_debug_params *params = data;
- u32 val;
int rc = 0;
switch (params->op) {
@@ -672,7 +675,7 @@ int goya_debug_coresight(struct hl_device *hdev, void *data)
}
/* Perform read from the device to flush all configuration */
- val = RREG32(mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG);
+ RREG32(mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG);
return rc;
}
diff --git a/drivers/misc/habanalabs/goya/goya_security.c b/drivers/misc/habanalabs/goya/goya_security.c
index de8297001fea..14701836f92b 100644
--- a/drivers/misc/habanalabs/goya/goya_security.c
+++ b/drivers/misc/habanalabs/goya/goya_security.c
@@ -6,7 +6,7 @@
*/
#include "goyaP.h"
-#include "include/goya/asic_reg/goya_regs.h"
+#include "../include/goya/asic_reg/goya_regs.h"
/*
* goya_set_block_as_protected - set the given block as protected
diff --git a/drivers/misc/habanalabs/include/armcp_if.h b/drivers/misc/habanalabs/include/common/armcp_if.h
index a34fc39ad87e..07f9972db28d 100644
--- a/drivers/misc/habanalabs/include/armcp_if.h
+++ b/drivers/misc/habanalabs/include/common/armcp_if.h
@@ -19,9 +19,19 @@ struct hl_eq_header {
__le32 ctl;
};
+struct hl_eq_ecc_data {
+ __le64 ecc_address;
+ __le64 ecc_syndrom;
+ __u8 memory_wrapper_idx;
+ __u8 pad[7];
+};
+
struct hl_eq_entry {
struct hl_eq_header hdr;
- __le64 data[7];
+ union {
+ struct hl_eq_ecc_data ecc_data;
+ __le64 data[7];
+ };
};
#define HL_EQ_ENTRY_SIZE sizeof(struct hl_eq_entry)
@@ -276,6 +286,8 @@ struct armcp_packet {
/* For get Armcp info/EEPROM data */
__le32 data_max_size;
};
+
+ __le32 reserved;
};
struct armcp_unmask_irq_arr_packet {
diff --git a/drivers/misc/habanalabs/include/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h
index c22d134e73af..bb67cafc6e00 100644
--- a/drivers/misc/habanalabs/include/hl_boot_if.h
+++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h
@@ -44,6 +44,15 @@
* The NIC FW loading and initialization
* failed. This means NICs are not usable.
*
+ * CPU_BOOT_ERR0_SECURITY_NOT_RDY Chip security initialization has been
+ * started, but is not ready yet - chip
+ * cannot be accessed.
+ *
+ * CPU_BOOT_ERR0_SECURITY_FAIL Security related tasks have failed.
+ * The tasks are security init (root of
+ * trust), boot authentication (chain of
+ * trust), data packets authentication.
+ *
* CPU_BOOT_ERR0_ENABLED Error registers enabled.
* This is a main indication that the
* running FW populates the error
@@ -57,6 +66,8 @@
#define CPU_BOOT_ERR0_BMC_WAIT_SKIPPED (1 << 4)
#define CPU_BOOT_ERR0_NIC_DATA_NOT_RDY (1 << 5)
#define CPU_BOOT_ERR0_NIC_FW_FAIL (1 << 6)
+#define CPU_BOOT_ERR0_SECURITY_NOT_RDY (1 << 7)
+#define CPU_BOOT_ERR0_SECURITY_FAIL (1 << 8)
#define CPU_BOOT_ERR0_ENABLED (1 << 31)
enum cpu_boot_status {
@@ -79,7 +90,10 @@ enum cpu_boot_status {
CPU_BOOT_STATUS_BMC_WAITING_SKIPPED, /* deprecated - will be removed */
/* Last boot loader progress status, ready to receive commands */
CPU_BOOT_STATUS_READY_TO_BOOT = 15,
+ /* Internal Boot finished, ready for boot-fit */
CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT = 16,
+ /* Internal Security has been initialized, device can be accessed */
+ CPU_BOOT_STATUS_SECURITY_READY = 17,
};
enum kmd_msg {
diff --git a/drivers/misc/habanalabs/include/qman_if.h b/drivers/misc/habanalabs/include/common/qman_if.h
index 0fdb49188ed7..0fdb49188ed7 100644
--- a/drivers/misc/habanalabs/include/qman_if.h
+++ b/drivers/misc/habanalabs/include/common/qman_if.h
diff --git a/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h b/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h
index 85e3b5148595..f92dc53af074 100644
--- a/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h
+++ b/drivers/misc/habanalabs/include/gaudi/asic_reg/gaudi_regs.h
@@ -91,18 +91,16 @@
#include "psoc_pci_pll_regs.h"
#include "psoc_hbm_pll_regs.h"
+#include "psoc_cpu_pll_regs.h"
-#define GAUDI_ECC_MEM_SEL_OFFSET 0xF18
-#define GAUDI_ECC_ADDRESS_OFFSET 0xF1C
-#define GAUDI_ECC_SYNDROME_OFFSET 0xF20
-#define GAUDI_ECC_SERR0_OFFSET 0xF30
-#define GAUDI_ECC_SERR1_OFFSET 0xF34
-#define GAUDI_ECC_SERR2_OFFSET 0xF38
-#define GAUDI_ECC_SERR3_OFFSET 0xF3C
-#define GAUDI_ECC_DERR0_OFFSET 0xF40
-#define GAUDI_ECC_DERR1_OFFSET 0xF44
-#define GAUDI_ECC_DERR2_OFFSET 0xF48
-#define GAUDI_ECC_DERR3_OFFSET 0xF4C
+#define GAUDI_ECC_MEM_SEL_OFFSET 0xF18
+#define GAUDI_ECC_ADDRESS_OFFSET 0xF1C
+#define GAUDI_ECC_SYNDROME_OFFSET 0xF20
+#define GAUDI_ECC_MEM_INFO_CLR_OFFSET 0xF28
+#define GAUDI_ECC_MEM_INFO_CLR_SERR_MASK BIT(8)
+#define GAUDI_ECC_MEM_INFO_CLR_DERR_MASK BIT(9)
+#define GAUDI_ECC_SERR0_OFFSET 0xF30
+#define GAUDI_ECC_DERR0_OFFSET 0xF40
#define mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_SOB_OBJ_0 0x492000
#define mmSYNC_MNGR_W_S_SYNC_MNGR_OBJS_MON_PAY_ADDRL_0 0x494000
@@ -294,6 +292,7 @@
#define mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG 0xC02000
+#define mmPCIE_AUX_FLR_CTRL 0xC07394
#define mmPCIE_AUX_DBI 0xC07490
#endif /* ASIC_REG_GAUDI_REGS_H_ */
diff --git a/drivers/misc/habanalabs/include/gaudi/asic_reg/psoc_cpu_pll_regs.h b/drivers/misc/habanalabs/include/gaudi/asic_reg/psoc_cpu_pll_regs.h
new file mode 100644
index 000000000000..2585c70f59ef
--- /dev/null
+++ b/drivers/misc/habanalabs/include/gaudi/asic_reg/psoc_cpu_pll_regs.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright 2016-2018 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ *
+ */
+
+/************************************
+ ** This is an auto-generated file **
+ ** DO NOT EDIT BELOW **
+ ************************************/
+
+#ifndef ASIC_REG_PSOC_CPU_PLL_REGS_H_
+#define ASIC_REG_PSOC_CPU_PLL_REGS_H_
+
+/*
+ *****************************************
+ * PSOC_CPU_PLL (Prototype: PLL)
+ *****************************************
+ */
+
+#define mmPSOC_CPU_PLL_NR 0xC70100
+
+#define mmPSOC_CPU_PLL_NF 0xC70104
+
+#define mmPSOC_CPU_PLL_OD 0xC70108
+
+#define mmPSOC_CPU_PLL_NB 0xC7010C
+
+#define mmPSOC_CPU_PLL_CFG 0xC70110
+
+#define mmPSOC_CPU_PLL_LOSE_MASK 0xC70120
+
+#define mmPSOC_CPU_PLL_LOCK_INTR 0xC70128
+
+#define mmPSOC_CPU_PLL_LOCK_BYPASS 0xC7012C
+
+#define mmPSOC_CPU_PLL_DATA_CHNG 0xC70130
+
+#define mmPSOC_CPU_PLL_RST 0xC70134
+
+#define mmPSOC_CPU_PLL_SLIP_WD_CNTR 0xC70150
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_0 0xC70200
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_1 0xC70204
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_2 0xC70208
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_3 0xC7020C
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_CMD_0 0xC70220
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_CMD_1 0xC70224
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_CMD_2 0xC70228
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_CMD_3 0xC7022C
+
+#define mmPSOC_CPU_PLL_DIV_SEL_0 0xC70280
+
+#define mmPSOC_CPU_PLL_DIV_SEL_1 0xC70284
+
+#define mmPSOC_CPU_PLL_DIV_SEL_2 0xC70288
+
+#define mmPSOC_CPU_PLL_DIV_SEL_3 0xC7028C
+
+#define mmPSOC_CPU_PLL_DIV_EN_0 0xC702A0
+
+#define mmPSOC_CPU_PLL_DIV_EN_1 0xC702A4
+
+#define mmPSOC_CPU_PLL_DIV_EN_2 0xC702A8
+
+#define mmPSOC_CPU_PLL_DIV_EN_3 0xC702AC
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_BUSY_0 0xC702C0
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_BUSY_1 0xC702C4
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_BUSY_2 0xC702C8
+
+#define mmPSOC_CPU_PLL_DIV_FACTOR_BUSY_3 0xC702CC
+
+#define mmPSOC_CPU_PLL_CLK_GATER 0xC70300
+
+#define mmPSOC_CPU_PLL_CLK_RLX_0 0xC70310
+
+#define mmPSOC_CPU_PLL_CLK_RLX_1 0xC70314
+
+#define mmPSOC_CPU_PLL_CLK_RLX_2 0xC70318
+
+#define mmPSOC_CPU_PLL_CLK_RLX_3 0xC7031C
+
+#define mmPSOC_CPU_PLL_REF_CNTR_PERIOD 0xC70400
+
+#define mmPSOC_CPU_PLL_REF_LOW_THRESHOLD 0xC70410
+
+#define mmPSOC_CPU_PLL_REF_HIGH_THRESHOLD 0xC70420
+
+#define mmPSOC_CPU_PLL_PLL_NOT_STABLE 0xC70430
+
+#define mmPSOC_CPU_PLL_FREQ_CALC_EN 0xC70440
+
+#define mmPSOC_CPU_PLL_RLX_BITMAP_CFG 0xC70500
+
+#define mmPSOC_CPU_PLL_RLX_BITMAP_0 0xC70510
+
+#define mmPSOC_CPU_PLL_RLX_BITMAP_1 0xC70514
+
+#define mmPSOC_CPU_PLL_RLX_BITMAP_2 0xC70518
+
+#define mmPSOC_CPU_PLL_RLX_BITMAP_3 0xC7051C
+
+#endif /* ASIC_REG_PSOC_CPU_PLL_REGS_H_ */
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h b/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h
index 96f08050ef0f..13ef6b2887fd 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_masks.h
@@ -455,4 +455,7 @@ enum axi_id {
QM_ARB_ERR_MSG_EN_CHOISE_WDT_MASK |\
QM_ARB_ERR_MSG_EN_AXI_LBW_ERR_MASK)
+#define PCIE_AUX_FLR_CTRL_HW_CTRL_MASK 0x1
+#define PCIE_AUX_FLR_CTRL_INT_MASK_MASK 0x2
+
#endif /* GAUDI_MASKS_H_ */
diff --git a/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h b/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h
index 9a5800b0086b..f30f2c0458d7 100644
--- a/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h
+++ b/drivers/misc/habanalabs/include/gaudi/gaudi_packets.h
@@ -85,7 +85,7 @@ struct packet_msg_long {
};
#define GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_SHIFT 0
-#define GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_MASK 0x0000EFFF
+#define GAUDI_PKT_SHORT_VAL_SOB_SYNC_VAL_MASK 0x00007FFF
#define GAUDI_PKT_SHORT_VAL_SOB_MOD_SHIFT 31
#define GAUDI_PKT_SHORT_VAL_SOB_MOD_MASK 0x80000000
@@ -141,7 +141,7 @@ struct packet_msg_prot {
#define GAUDI_PKT_FENCE_CFG_TARGET_VAL_MASK 0x00FF0000
#define GAUDI_PKT_FENCE_CFG_ID_SHIFT 30
-#define GAUDI_PKT_FENCE_CFG_ID_MASK 0xC000000
+#define GAUDI_PKT_FENCE_CFG_ID_MASK 0xC0000000
#define GAUDI_PKT_FENCE_CTL_PRED_SHIFT 0
#define GAUDI_PKT_FENCE_CTL_PRED_MASK 0x0000001F
@@ -197,6 +197,9 @@ struct packet_wait {
__le32 ctl;
};
+#define GAUDI_PKT_LOAD_AND_EXE_CFG_DST_SHIFT 0
+#define GAUDI_PKT_LOAD_AND_EXE_CFG_DST_MASK 0x00000001
+
struct packet_load_and_exe {
__le32 cfg;
__le32 ctl;