Merge tag 'char-misc-5.4-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc

Pull char/misc driver updates from Greg KH:
 "Here is the big char/misc driver pull request for 5.4-rc1.

  As has been happening in previous releases, more and more individual
  driver subsystem trees are ending up in here. Now if that is good or
  bad I can't tell, but hopefully it makes your life easier as it's more
  of an aggregation of trees together to one merge point for you.

  Anyway, lots of stuff in here:
   - habanalabs driver updates
   - thunderbolt driver updates
   - misc driver updates
   - coresight and intel_th hwtracing driver updates
   - fpga driver updates
   - extcon driver updates
   - some dma driver updates
   - char driver updates
   - android binder driver updates
   - nvmem driver updates
   - phy driver updates
   - parport driver fixes
   - pcmcia driver fix
   - uio driver updates
   - w1 driver updates
   - configfs fixes
   - other assorted driver updates

  All of these have been in linux-next for a long time with no reported
  issues"

* tag 'char-misc-5.4-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc: (200 commits)
  misc: mic: Use PTR_ERR_OR_ZERO rather than its implementation
  habanalabs: correctly cast variable to __le32
  habanalabs: show correct id in error print
  habanalabs: stop using the acronym KMD
  habanalabs: display card name as sensors header
  habanalabs: add uapi to retrieve aggregate H/W events
  habanalabs: add uapi to retrieve device utilization
  habanalabs: Make the Coresight timestamp perpetual
  habanalabs: explicitly set the queue-id enumerated numbers
  habanalabs: print to kernel log when reset is finished
  habanalabs: replace __le32_to_cpu with le32_to_cpu
  habanalabs: replace __cpu_to_le32/64 with cpu_to_le32/64
  habanalabs: Handle HW_IP_INFO if device disabled or in reset
  habanalabs: Expose devices after initialization is done
  habanalabs: improve security in Debug IOCTL
  habanalabs: use default structure for user input in Debug IOCTL
  habanalabs: Add descriptive name to PSOC app status register
  habanalabs: Add descriptive names to PSOC scratch-pad registers
  habanalabs: create two char devices per ASIC
  habanalabs: change device_setup_cdev() to be more generic
  ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2019-09-18 21:14:31 +0300
committer: Linus Torvalds <torvalds@linux-foundation.org> 2019-09-18 21:14:31 +0300
commit: 6cfae0c26b21dce323fe8799b66cf4bc996e3565 (patch)
tree: 647f80442929de7ed17cc436c546c21c8c2b2aa9 /drivers/misc/habanalabs
parent: e6874fc29410fabfdbc8c12b467f41a16cbcfd2b (diff)
parent: 16a0f687cac70301f49d6f99c4115824e6aad42b (diff)
download: linux-6cfae0c26b21dce323fe8799b66cf4bc996e3565.tar.xz
20 files changed, 1105 insertions, 556 deletions
diff --git a/drivers/misc/habanalabs/asid.c b/drivers/misc/habanalabs/asid.c
index 2c01461701a3..a2fdf31cf27c 100644
--- a/drivers/misc/habanalabs/asid.c
+++ b/drivers/misc/habanalabs/asid.c
@@ -18,7 +18,7 @@ int hl_asid_init(struct hl_device *hdev)
 
 	mutex_init(&hdev->asid_mutex);
 
-	/* ASID 0 is reserved for KMD and device CPU */
+	/* ASID 0 is reserved for the kernel driver and device CPU */
 	set_bit(0, hdev->asid_bitmap);
 
 	return 0;
diff --git a/drivers/misc/habanalabs/command_buffer.c b/drivers/misc/habanalabs/command_buffer.c
index e495f44064fa..53fddbd8e693 100644
--- a/drivers/misc/habanalabs/command_buffer.c
+++ b/drivers/misc/habanalabs/command_buffer.c
@@ -397,7 +397,8 @@ struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size)
 	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, cb_size, &cb_handle,
 			HL_KERNEL_ASID_ID);
 	if (rc) {
-		dev_err(hdev->dev, "Failed to allocate CB for KMD %d\n", rc);
+		dev_err(hdev->dev,
+			"Failed to allocate CB for the kernel driver %d\n", rc);
 		return NULL;
 	}
 
diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index f00d1c32f6d6..a9ac045dcfde 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -178,11 +178,23 @@ static void cs_do_release(struct kref *ref)
 
 	/* We also need to update CI for internal queues */
 	if (cs->submitted) {
-		int cs_cnt = atomic_dec_return(&hdev->cs_active_cnt);
+		hdev->asic_funcs->hw_queues_lock(hdev);
 
-		WARN_ONCE((cs_cnt < 0),
-			"hl%d: error in CS active cnt %d\n",
-			hdev->id, cs_cnt);
+		hdev->cs_active_cnt--;
+		if (!hdev->cs_active_cnt) {
+			struct hl_device_idle_busy_ts *ts;
+
+			ts = &hdev->idle_busy_ts_arr[hdev->idle_busy_ts_idx++];
+			ts->busy_to_idle_ts = ktime_get();
+
+			if (hdev->idle_busy_ts_idx == HL_IDLE_BUSY_TS_ARR_SIZE)
+				hdev->idle_busy_ts_idx = 0;
+		} else if (hdev->cs_active_cnt < 0) {
+			dev_crit(hdev->dev, "CS active cnt %d is negative\n",
+				hdev->cs_active_cnt);
+		}
+
+		hdev->asic_funcs->hw_queues_unlock(hdev);
 
 		hl_int_hw_queue_update_ci(cs);
 
@@ -305,6 +317,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 	other = ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)];
 	if ((other) && (!dma_fence_is_signaled(other))) {
 		spin_unlock(&ctx->cs_lock);
+		dev_dbg(hdev->dev,
+			"Rejecting CS because of too many in-flights CS\n");
 		rc = -EAGAIN;
 		goto free_fence;
 	}
@@ -395,8 +409,9 @@ static struct hl_cb *validate_queue_index(struct hl_device *hdev,
 		return NULL;
 	}
 
-	if (hw_queue_prop->kmd_only) {
-		dev_err(hdev->dev, "Queue index %d is restricted for KMD\n",
+	if (hw_queue_prop->driver_only) {
+		dev_err(hdev->dev,
+			"Queue index %d is restricted for the kernel driver\n",
 			chunk->queue_index);
 		return NULL;
 	} else if (hw_queue_prop->type == QUEUE_TYPE_INT) {
diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/context.c
index 8682590e3f6e..17db7b3dfb4c 100644
--- a/drivers/misc/habanalabs/context.c
+++ b/drivers/misc/habanalabs/context.c
@@ -26,12 +26,13 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
 		dma_fence_put(ctx->cs_pending[i]);
 
 	if (ctx->asid != HL_KERNEL_ASID_ID) {
-		/*
-		 * The engines are stopped as there is no executing CS, but the
+		/* The engines are stopped as there is no executing CS, but the
 		 * Coresight might be still working by accessing addresses
 		 * related to the stopped engines. Hence stop it explicitly.
+		 * Stop only if this is the compute context, as there can be
+		 * only one compute context
 		 */
-		if (hdev->in_debug)
+		if ((hdev->in_debug) && (hdev->compute_ctx == ctx))
 			hl_device_set_debug_mode(hdev, false);
 
 		hl_vm_ctx_fini(ctx);
@@ -67,29 +68,36 @@ int hl_ctx_create(struct hl_device *hdev, struct hl_fpriv *hpriv)
 		goto out_err;
 	}
 
+	mutex_lock(&mgr->ctx_lock);
+	rc = idr_alloc(&mgr->ctx_handles, ctx, 1, 0, GFP_KERNEL);
+	mutex_unlock(&mgr->ctx_lock);
+
+	if (rc < 0) {
+		dev_err(hdev->dev, "Failed to allocate IDR for a new CTX\n");
+		goto free_ctx;
+	}
+
+	ctx->handle = rc;
+
 	rc = hl_ctx_init(hdev, ctx, false);
 	if (rc)
-		goto free_ctx;
+		goto remove_from_idr;
 
 	hl_hpriv_get(hpriv);
 	ctx->hpriv = hpriv;
 
-	/* TODO: remove for multiple contexts */
+	/* TODO: remove for multiple contexts per process */
 	hpriv->ctx = ctx;
-	hdev->user_ctx = ctx;
 
-	mutex_lock(&mgr->ctx_lock);
-	rc = idr_alloc(&mgr->ctx_handles, ctx, 1, 0, GFP_KERNEL);
-	mutex_unlock(&mgr->ctx_lock);
-
-	if (rc < 0) {
-		dev_err(hdev->dev, "Failed to allocate IDR for a new CTX\n");
-		hl_ctx_free(hdev, ctx);
-		goto out_err;
-	}
+	/* TODO: remove the following line for multiple process support */
+	hdev->compute_ctx = ctx;
 
 	return 0;
 
+remove_from_idr:
+	mutex_lock(&mgr->ctx_lock);
+	idr_remove(&mgr->ctx_handles, ctx->handle);
+	mutex_unlock(&mgr->ctx_lock);
 free_ctx:
 	kfree(ctx);
 out_err:
@@ -120,7 +128,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
 	ctx->thread_ctx_switch_wait_token = 0;
 
 	if (is_kernel_ctx) {
-		ctx->asid = HL_KERNEL_ASID_ID; /* KMD gets ASID 0 */
+		ctx->asid = HL_KERNEL_ASID_ID; /* Kernel driver gets ASID 0 */
 		rc = hl_mmu_ctx_init(ctx);
 		if (rc) {
 			dev_err(hdev->dev, "Failed to init mmu ctx module\n");
diff --git a/drivers/misc/habanalabs/debugfs.c b/drivers/misc/habanalabs/debugfs.c
index 18e499c900c7..87f37ac31ccd 100644
--- a/drivers/misc/habanalabs/debugfs.c
+++ b/drivers/misc/habanalabs/debugfs.c
@@ -29,7 +29,7 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = __cpu_to_le32(ARMCP_PACKET_I2C_RD <<
+	pkt.ctl = cpu_to_le32(ARMCP_PACKET_I2C_RD <<
 				ARMCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.i2c_bus = i2c_bus;
 	pkt.i2c_addr = i2c_addr;
@@ -55,12 +55,12 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = __cpu_to_le32(ARMCP_PACKET_I2C_WR <<
+	pkt.ctl = cpu_to_le32(ARMCP_PACKET_I2C_WR <<
 				ARMCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.i2c_bus = i2c_bus;
 	pkt.i2c_addr = i2c_addr;
 	pkt.i2c_reg = i2c_reg;
-	pkt.value = __cpu_to_le64(val);
+	pkt.value = cpu_to_le64(val);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
 					HL_DEVICE_TIMEOUT_USEC, NULL);
@@ -81,10 +81,10 @@ static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = __cpu_to_le32(ARMCP_PACKET_LED_SET <<
+	pkt.ctl = cpu_to_le32(ARMCP_PACKET_LED_SET <<
 				ARMCP_PKT_CTL_OPCODE_SHIFT);
-	pkt.led_index = __cpu_to_le32(led);
-	pkt.value = __cpu_to_le64(state);
+	pkt.led_index = cpu_to_le32(led);
+	pkt.value = cpu_to_le64(state);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
 						HL_DEVICE_TIMEOUT_USEC, NULL);
@@ -370,7 +370,7 @@ static int mmu_show(struct seq_file *s, void *data)
 	if (dev_entry->mmu_asid == HL_KERNEL_ASID_ID)
 		ctx = hdev->kernel_ctx;
 	else
-		ctx = hdev->user_ctx;
+		ctx = hdev->compute_ctx;
 
 	if (!ctx) {
 		dev_err(hdev->dev, "no ctx available\n");
@@ -533,7 +533,7 @@ out:
 static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr,
 				u64 *phys_addr)
 {
-	struct hl_ctx *ctx = hdev->user_ctx;
+	struct hl_ctx *ctx = hdev->compute_ctx;
 	u64 hop_addr, hop_pte_addr, hop_pte;
 	u64 offset_mask = HOP4_MASK | OFFSET_MASK;
 	int rc = 0;
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 7a8f9d0b71b5..459fee70a597 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -42,10 +42,12 @@ static void hpriv_release(struct kref *ref)
 {
 	struct hl_fpriv *hpriv;
 	struct hl_device *hdev;
+	struct hl_ctx *ctx;
 
 	hpriv = container_of(ref, struct hl_fpriv, refcount);
 
 	hdev = hpriv->hdev;
+	ctx = hpriv->ctx;
 
 	put_pid(hpriv->taskpid);
 
@@ -53,13 +55,12 @@ static void hpriv_release(struct kref *ref)
 
 	mutex_destroy(&hpriv->restore_phase_mutex);
 
-	kfree(hpriv);
-
-	/* Now the FD is really closed */
-	atomic_dec(&hdev->fd_open_cnt);
+	mutex_lock(&hdev->fpriv_list_lock);
+	list_del(&hpriv->dev_node);
+	hdev->compute_ctx = NULL;
+	mutex_unlock(&hdev->fpriv_list_lock);
 
-	/* This allows a new user context to open the device */
-	hdev->user_ctx = NULL;
+	kfree(hpriv);
 }
 
 void hl_hpriv_get(struct hl_fpriv *hpriv)
@@ -94,6 +95,24 @@ static int hl_device_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static int hl_device_release_ctrl(struct inode *inode, struct file *filp)
+{
+	struct hl_fpriv *hpriv = filp->private_data;
+	struct hl_device *hdev;
+
+	filp->private_data = NULL;
+
+	hdev = hpriv->hdev;
+
+	mutex_lock(&hdev->fpriv_list_lock);
+	list_del(&hpriv->dev_node);
+	mutex_unlock(&hdev->fpriv_list_lock);
+
+	kfree(hpriv);
+
+	return 0;
+}
+
 /*
  * hl_mmap - mmap function for habanalabs device
  *
@@ -124,55 +143,102 @@ static const struct file_operations hl_ops = {
 	.compat_ioctl = hl_ioctl
 };
 
+static const struct file_operations hl_ctrl_ops = {
+	.owner = THIS_MODULE,
+	.open = hl_device_open_ctrl,
+	.release = hl_device_release_ctrl,
+	.unlocked_ioctl = hl_ioctl_control,
+	.compat_ioctl = hl_ioctl_control
+};
+
+static void device_release_func(struct device *dev)
+{
+	kfree(dev);
+}
+
 /*
- * device_setup_cdev - setup cdev and device for habanalabs device
+ * device_init_cdev - Initialize cdev and device for habanalabs device
  *
  * @hdev: pointer to habanalabs device structure
  * @hclass: pointer to the class object of the device
  * @minor: minor number of the specific device
- * @fpos : file operations to install for this device
+ * @fpos: file operations to install for this device
+ * @name: name of the device as it will appear in the filesystem
+ * @cdev: pointer to the char device object that will be initialized
+ * @dev: pointer to the device object that will be initialized
  *
- * Create a cdev and a Linux device for habanalabs's device. Need to be
- * called at the end of the habanalabs device initialization process,
- * because this function exposes the device to the user
+ * Initialize a cdev and a Linux device for habanalabs's device.
  */
-static int device_setup_cdev(struct hl_device *hdev, struct class *hclass,
-				int minor, const struct file_operations *fops)
+static int device_init_cdev(struct hl_device *hdev, struct class *hclass,
+				int minor, const struct file_operations *fops,
+				char *name, struct cdev *cdev,
+				struct device **dev)
 {
-	int err, devno = MKDEV(hdev->major, minor);
-	struct cdev *hdev_cdev = &hdev->cdev;
-	char *name;
+	cdev_init(cdev, fops);
+	cdev->owner = THIS_MODULE;
 
-	name = kasprintf(GFP_KERNEL, "hl%d", hdev->id);
-	if (!name)
+	*dev = kzalloc(sizeof(**dev), GFP_KERNEL);
+	if (!*dev)
 		return -ENOMEM;
 
-	cdev_init(hdev_cdev, fops);
-	hdev_cdev->owner = THIS_MODULE;
-	err = cdev_add(hdev_cdev, devno, 1);
-	if (err) {
-		pr_err("Failed to add char device %s\n", name);
-		goto err_cdev_add;
+	device_initialize(*dev);
+	(*dev)->devt = MKDEV(hdev->major, minor);
+	(*dev)->class = hclass;
+	(*dev)->release = device_release_func;
+	dev_set_drvdata(*dev, hdev);
+	dev_set_name(*dev, "%s", name);
+
+	return 0;
+}
+
+static int device_cdev_sysfs_add(struct hl_device *hdev)
+{
+	int rc;
+
+	rc = cdev_device_add(&hdev->cdev, hdev->dev);
+	if (rc) {
+		dev_err(hdev->dev,
+			"failed to add a char device to the system\n");
+		return rc;
 	}
 
-	hdev->dev = device_create(hclass, NULL, devno, NULL, "%s", name);
-	if (IS_ERR(hdev->dev)) {
-		pr_err("Failed to create device %s\n", name);
-		err = PTR_ERR(hdev->dev);
-		goto err_device_create;
+	rc = cdev_device_add(&hdev->cdev_ctrl, hdev->dev_ctrl);
+	if (rc) {
+		dev_err(hdev->dev,
+			"failed to add a control char device to the system\n");
+		goto delete_cdev_device;
 	}
 
-	dev_set_drvdata(hdev->dev, hdev);
+	/* hl_sysfs_init() must be done after adding the device to the system */
+	rc = hl_sysfs_init(hdev);
+	if (rc) {
+		dev_err(hdev->dev, "failed to initialize sysfs\n");
+		goto delete_ctrl_cdev_device;
+	}
 
-	kfree(name);
+	hdev->cdev_sysfs_created = true;
 
 	return 0;
 
-err_device_create:
-	cdev_del(hdev_cdev);
-err_cdev_add:
-	kfree(name);
-	return err;
+delete_ctrl_cdev_device:
+	cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
+delete_cdev_device:
+	cdev_device_del(&hdev->cdev, hdev->dev);
+	return rc;
+}
+
+static void device_cdev_sysfs_del(struct hl_device *hdev)
+{
+	/* device_release() won't be called so must free devices explicitly */
+	if (!hdev->cdev_sysfs_created) {
+		kfree(hdev->dev_ctrl);
+		kfree(hdev->dev);
+		return;
+	}
+
+	hl_sysfs_fini(hdev);
+	cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
+	cdev_device_del(&hdev->cdev, hdev->dev);
 }
 
 /*
@@ -227,20 +293,29 @@ static int device_early_init(struct hl_device *hdev)
 		goto free_eq_wq;
 	}
 
+	hdev->idle_busy_ts_arr = kmalloc_array(HL_IDLE_BUSY_TS_ARR_SIZE,
+					sizeof(struct hl_device_idle_busy_ts),
+					(GFP_KERNEL | __GFP_ZERO));
+	if (!hdev->idle_busy_ts_arr) {
+		rc = -ENOMEM;
+		goto free_chip_info;
+	}
+
 	hl_cb_mgr_init(&hdev->kernel_cb_mgr);
 
-	mutex_init(&hdev->fd_open_cnt_lock);
 	mutex_init(&hdev->send_cpu_message_lock);
 	mutex_init(&hdev->debug_lock);
 	mutex_init(&hdev->mmu_cache_lock);
 	INIT_LIST_HEAD(&hdev->hw_queues_mirror_list);
 	spin_lock_init(&hdev->hw_queues_mirror_lock);
+	INIT_LIST_HEAD(&hdev->fpriv_list);
+	mutex_init(&hdev->fpriv_list_lock);
 	atomic_set(&hdev->in_reset, 0);
-	atomic_set(&hdev->fd_open_cnt, 0);
-	atomic_set(&hdev->cs_active_cnt, 0);
 
 	return 0;
 
+free_chip_info:
+	kfree(hdev->hl_chip_info);
 free_eq_wq:
 	destroy_workqueue(hdev->eq_wq);
 free_cq_wq:
@@ -266,8 +341,11 @@ static void device_early_fini(struct hl_device *hdev)
 	mutex_destroy(&hdev->debug_lock);
 	mutex_destroy(&hdev->send_cpu_message_lock);
 
+	mutex_destroy(&hdev->fpriv_list_lock);
+
 	hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);
 
+	kfree(hdev->idle_busy_ts_arr);
 	kfree(hdev->hl_chip_info);
 
 	destroy_workqueue(hdev->eq_wq);
@@ -277,8 +355,6 @@ static void device_early_fini(struct hl_device *hdev)
 
 	if (hdev->asic_funcs->early_fini)
 		hdev->asic_funcs->early_fini(hdev);
-
-	mutex_destroy(&hdev->fd_open_cnt_lock);
 }
 
 static void set_freq_to_low_job(struct work_struct *work)
@@ -286,9 +362,13 @@ static void set_freq_to_low_job(struct work_struct *work)
 	struct hl_device *hdev = container_of(work, struct hl_device,
 						work_freq.work);
 
-	if (atomic_read(&hdev->fd_open_cnt) == 0)
+	mutex_lock(&hdev->fpriv_list_lock);
+
+	if (!hdev->compute_ctx)
 		hl_device_set_frequency(hdev, PLL_LOW);
 
+	mutex_unlock(&hdev->fpriv_list_lock);
+
 	schedule_delayed_work(&hdev->work_freq,
 			usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
 }
@@ -338,7 +418,7 @@ static int device_late_init(struct hl_device *hdev)
 	hdev->high_pll = hdev->asic_prop.high_pll;
 
 	/* force setting to low frequency */
-	atomic_set(&hdev->curr_pll_profile, PLL_LOW);
+	hdev->curr_pll_profile = PLL_LOW;
 
 	if (hdev->pm_mng_profile == PM_AUTO)
 		hdev->asic_funcs->set_pll_profile(hdev, PLL_LOW);
@@ -381,44 +461,128 @@ static void device_late_fini(struct hl_device *hdev)
 	hdev->late_init_done = false;
 }
 
+uint32_t hl_device_utilization(struct hl_device *hdev, uint32_t period_ms)
+{
+	struct hl_device_idle_busy_ts *ts;
+	ktime_t zero_ktime, curr = ktime_get();
+	u32 overlap_cnt = 0, last_index = hdev->idle_busy_ts_idx;
+	s64 period_us, last_start_us, last_end_us, last_busy_time_us,
+		total_busy_time_us = 0, total_busy_time_ms;
+
+	zero_ktime = ktime_set(0, 0);
+	period_us = period_ms * USEC_PER_MSEC;
+	ts = &hdev->idle_busy_ts_arr[last_index];
+
+	/* check case that device is currently in idle */
+	if (!ktime_compare(ts->busy_to_idle_ts, zero_ktime) &&
+			!ktime_compare(ts->idle_to_busy_ts, zero_ktime)) {
+
+		last_index--;
+		/* Handle case idle_busy_ts_idx was 0 */
+		if (last_index > HL_IDLE_BUSY_TS_ARR_SIZE)
+			last_index = HL_IDLE_BUSY_TS_ARR_SIZE - 1;
+
+		ts = &hdev->idle_busy_ts_arr[last_index];
+	}
+
+	while (overlap_cnt < HL_IDLE_BUSY_TS_ARR_SIZE) {
+		/* Check if we are in last sample case. i.e. if the sample
+		 * begun before the sampling period. This could be a real
+		 * sample or 0 so need to handle both cases
+		 */
+		last_start_us = ktime_to_us(
+				ktime_sub(curr, ts->idle_to_busy_ts));
+
+		if (last_start_us > period_us) {
+
+			/* First check two cases:
+			 * 1. If the device is currently busy
+			 * 2. If the device was idle during the whole sampling
+			 *    period
+			 */
+
+			if (!ktime_compare(ts->busy_to_idle_ts, zero_ktime)) {
+				/* Check if the device is currently busy */
+				if (ktime_compare(ts->idle_to_busy_ts,
+						zero_ktime))
+					return 100;
+
+				/* We either didn't have any activity or we
+				 * reached an entry which is 0. Either way,
+				 * exit and return what was accumulated so far
+				 */
+				break;
+			}
+
+			/* If sample has finished, check it is relevant */
+			last_end_us = ktime_to_us(
+					ktime_sub(curr, ts->busy_to_idle_ts));
+
+			if (last_end_us > period_us)
+				break;
+
+			/* It is relevant so add it but with adjustment */
+			last_busy_time_us = ktime_to_us(
+						ktime_sub(ts->busy_to_idle_ts,
+						ts->idle_to_busy_ts));
+			total_busy_time_us += last_busy_time_us -
+					(last_start_us - period_us);
+			break;
+		}
+
+		/* Check if the sample is finished or still open */
+		if (ktime_compare(ts->busy_to_idle_ts, zero_ktime))
+			last_busy_time_us = ktime_to_us(
+						ktime_sub(ts->busy_to_idle_ts,
+						ts->idle_to_busy_ts));
+		else
+			last_busy_time_us = ktime_to_us(
+					ktime_sub(curr, ts->idle_to_busy_ts));
+
+		total_busy_time_us += last_busy_time_us;
+
+		last_index--;
+		/* Handle case idle_busy_ts_idx was 0 */
+		if (last_index > HL_IDLE_BUSY_TS_ARR_SIZE)
+			last_index = HL_IDLE_BUSY_TS_ARR_SIZE - 1;
+
+		ts = &hdev->idle_busy_ts_arr[last_index];
+
+		overlap_cnt++;
+	}
+
+	total_busy_time_ms = DIV_ROUND_UP_ULL(total_busy_time_us,
+						USEC_PER_MSEC);
+
+	return DIV_ROUND_UP_ULL(total_busy_time_ms * 100, period_ms);
+}
+
 /*
  * hl_device_set_frequency - set the frequency of the device
  *
  * @hdev: pointer to habanalabs device structure
  * @freq: the new frequency value
  *
- * Change the frequency if needed.
- * We allose to set PLL to low only if there is no user process
- * Returns 0 if no change was done, otherwise returns 1;
+ * Change the frequency if needed. This function has no protection against
+ * concurrency, therefore it is assumed that the calling function has protected
+ * itself against the case of calling this function from multiple threads with
+ * different values
+ *
+ * Returns 0 if no change was done, otherwise returns 1
  */
 int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq)
 {
-	enum hl_pll_frequency old_freq =
-			(freq == PLL_HIGH) ? PLL_LOW : PLL_HIGH;
-	int ret;
-
-	if (hdev->pm_mng_profile == PM_MANUAL)
-		return 0;
-
-	ret = atomic_cmpxchg(&hdev->curr_pll_profile, old_freq, freq);
-	if (ret == freq)
+	if ((hdev->pm_mng_profile == PM_MANUAL) ||
+			(hdev->curr_pll_profile == freq))
 		return 0;
 
-	/*
-	 * in case we want to lower frequency, check if device is not
-	 * opened. We must have a check here to workaround race condition with
-	 * hl_device_open
-	 */
-	if ((freq == PLL_LOW) && (atomic_read(&hdev->fd_open_cnt) > 0)) {
-		atomic_set(&hdev->curr_pll_profile, PLL_HIGH);
-		return 0;
-	}
-
 	dev_dbg(hdev->dev, "Changing device frequency to %s\n",
 		freq == PLL_HIGH ? "high" : "low");
 
 	hdev->asic_funcs->set_pll_profile(hdev, freq);
 
+	hdev->curr_pll_profile = freq;
+
 	return 1;
 }
 
@@ -449,19 +613,8 @@ int hl_device_set_debug_mode(struct hl_device *hdev, bool enable)
 		goto out;
 	}
 
-	mutex_lock(&hdev->fd_open_cnt_lock);
-
-	if (atomic_read(&hdev->fd_open_cnt) > 1) {
-		dev_err(hdev->dev,
-			"Failed to enable debug mode. More then a single user is using the device\n");
-		rc = -EPERM;
-		goto unlock_fd_open_lock;
-	}
-
 	hdev->in_debug = 1;
 
-unlock_fd_open_lock:
-	mutex_unlock(&hdev->fd_open_cnt_lock);
 out:
 	mutex_unlock(&hdev->debug_lock);
 
@@ -568,6 +721,7 @@ disable_device:
 static void device_kill_open_processes(struct hl_device *hdev)
 {
 	u16 pending_total, pending_cnt;
+	struct hl_fpriv	*hpriv;
 	struct task_struct *task = NULL;
 
 	if (hdev->pldm)
@@ -575,32 +729,31 @@ static void device_kill_open_processes(struct hl_device *hdev)
 	else
 		pending_total = HL_PENDING_RESET_PER_SEC;
 
-	pending_cnt = pending_total;
-
-	/* Flush all processes that are inside hl_open */
-	mutex_lock(&hdev->fd_open_cnt_lock);
-
-	while ((atomic_read(&hdev->fd_open_cnt)) && (pending_cnt)) {
-
-		pending_cnt--;
-
-		dev_info(hdev->dev,
-			"Can't HARD reset, waiting for user to close FD\n");
+	/* Giving time for user to close FD, and for processes that are inside
+	 * hl_device_open to finish
+	 */
+	if (!list_empty(&hdev->fpriv_list))
 		ssleep(1);
-	}
 
-	if (atomic_read(&hdev->fd_open_cnt)) {
-		task = get_pid_task(hdev->user_ctx->hpriv->taskpid,
-					PIDTYPE_PID);
+	mutex_lock(&hdev->fpriv_list_lock);
+
+	/* This section must be protected because we are dereferencing
+	 * pointers that are freed if the process exits
+	 */
+	list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) {
+		task = get_pid_task(hpriv->taskpid, PIDTYPE_PID);
 		if (task) {
-			dev_info(hdev->dev, "Killing user processes\n");
+			dev_info(hdev->dev, "Killing user process pid=%d\n",
+				task_pid_nr(task));
 			send_sig(SIGKILL, task, 1);
-			msleep(100);
+			usleep_range(1000, 10000);
 
 			put_task_struct(task);
 		}
 	}
 
+	mutex_unlock(&hdev->fpriv_list_lock);
+
 	/* We killed the open users, but because the driver cleans up after the
 	 * user contexts are closed (e.g. mmu mappings), we need to wait again
 	 * to make sure the cleaning phase is finished before continuing with
@@ -609,19 +762,18 @@ static void device_kill_open_processes(struct hl_device *hdev)
 
 	pending_cnt = pending_total;
 
-	while ((atomic_read(&hdev->fd_open_cnt)) && (pending_cnt)) {
+	while ((!list_empty(&hdev->fpriv_list)) && (pending_cnt)) {
+		dev_info(hdev->dev,
+			"Waiting for all unmap operations to finish before hard reset\n");
 
 		pending_cnt--;
 
 		ssleep(1);
 	}
 
-	if (atomic_read(&hdev->fd_open_cnt))
+	if (!list_empty(&hdev->fpriv_list))
 		dev_crit(hdev->dev,
 			"Going to hard reset with open user contexts\n");
-
-	mutex_unlock(&hdev->fd_open_cnt_lock);
-
 }
 
 static void device_hard_reset_pending(struct work_struct *work)
@@ -630,8 +782,6 @@ static void device_hard_reset_pending(struct work_struct *work)
 		container_of(work, struct hl_device_reset_work, reset_work);
 	struct hl_device *hdev = device_reset_work->hdev;
 
-	device_kill_open_processes(hdev);
-
 	hl_device_reset(hdev, true, true);
 
 	kfree(device_reset_work);
@@ -679,13 +829,16 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 		/* This also blocks future CS/VM/JOB completion operations */
 		hdev->disabled = true;
 
-		/*
-		 * Flush anyone that is inside the critical section of enqueue
+		/* Flush anyone that is inside the critical section of enqueue
 		 * jobs to the H/W
 		 */
 		hdev->asic_funcs->hw_queues_lock(hdev);
 		hdev->asic_funcs->hw_queues_unlock(hdev);
 
+		/* Flush anyone that is inside device open */
+		mutex_lock(&hdev->fpriv_list_lock);
+		mutex_unlock(&hdev->fpriv_list_lock);
+
 		dev_err(hdev->dev, "Going to RESET device!\n");
 	}
 
@@ -736,6 +889,13 @@ again:
 	/* Go over all the queues, release all CS and their jobs */
 	hl_cs_rollback_all(hdev);
 
+	/* Kill processes here after CS rollback. This is because the process
+	 * can't really exit until all its CSs are done, which is what we
+	 * do in cs rollback
+	 */
+	if (from_hard_reset_thread)
+		device_kill_open_processes(hdev);
+
 	/* Release kernel context */
 	if ((hard_reset) && (hl_ctx_put(hdev->kernel_ctx) == 1))
 		hdev->kernel_ctx = NULL;
@@ -754,12 +914,24 @@ again:
 	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
 		hl_cq_reset(hdev, &hdev->completion_queue[i]);
 
+	hdev->idle_busy_ts_idx = 0;
+	hdev->idle_busy_ts_arr[0].busy_to_idle_ts = ktime_set(0, 0);
+	hdev->idle_busy_ts_arr[0].idle_to_busy_ts = ktime_set(0, 0);
+
+	if (hdev->cs_active_cnt)
+		dev_crit(hdev->dev, "CS active cnt %d is not 0 during reset\n",
+			hdev->cs_active_cnt);
+
+	mutex_lock(&hdev->fpriv_list_lock);
+
 	/* Make sure the context switch phase will run again */
-	if (hdev->user_ctx) {
-		atomic_set(&hdev->user_ctx->thread_ctx_switch_token, 1);
-		hdev->user_ctx->thread_ctx_switch_wait_token = 0;
+	if (hdev->compute_ctx) {
+		atomic_set(&hdev->compute_ctx->thread_ctx_switch_token, 1);
+		hdev->compute_ctx->thread_ctx_switch_wait_token = 0;
 	}
 
+	mutex_unlock(&hdev->fpriv_list_lock);
+
 	/* Finished tear-down, starting to re-initialize */
 
 	if (hard_reset) {
@@ -788,7 +960,7 @@ again:
 			goto out_err;
 		}
 
-		hdev->user_ctx = NULL;
+		hdev->compute_ctx = NULL;
 
 		rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
 		if (rc) {
@@ -849,6 +1021,8 @@ again:
 	else
 		hdev->soft_reset_cnt++;
 
+	dev_warn(hdev->dev, "Successfully finished resetting the device\n");
+
 	return 0;
 
 out_err:
@@ -883,17 +1057,43 @@ out_err:
 int hl_device_init(struct hl_device *hdev, struct class *hclass)
 {
 	int i, rc, cq_ready_cnt;
+	char *name;
+	bool add_cdev_sysfs_on_err = false;
 
-	/* Create device */
-	rc = device_setup_cdev(hdev, hclass, hdev->id, &hl_ops);
+	name = kasprintf(GFP_KERNEL, "hl%d", hdev->id / 2);
+	if (!name) {
+		rc = -ENOMEM;
+		goto out_disabled;
+	}
+
+	/* Initialize cdev and device structures */
+	rc = device_init_cdev(hdev, hclass, hdev->id, &hl_ops, name,
+				&hdev->cdev, &hdev->dev);
+
+	kfree(name);
 
 	if (rc)
 		goto out_disabled;
 
+	name = kasprintf(GFP_KERNEL, "hl_controlD%d", hdev->id / 2);
+	if (!name) {
+		rc = -ENOMEM;
+		goto free_dev;
+	}
+
+	/* Initialize cdev and device structures for control device */
+	rc = device_init_cdev(hdev, hclass, hdev->id_control, &hl_ctrl_ops,
+				name, &hdev->cdev_ctrl, &hdev->dev_ctrl);
+
+	kfree(name);
+
+	if (rc)
+		goto free_dev;
+
 	/* Initialize ASIC function pointers and perform early init */
 	rc = device_early_init(hdev);
 	if (rc)
-		goto release_device;
+		goto free_dev_ctrl;
 
 	/*
 	 * Start calling ASIC initialization. First S/W then H/W and finally
@@ -965,7 +1165,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 		goto mmu_fini;
 	}
 
-	hdev->user_ctx = NULL;
+	hdev->compute_ctx = NULL;
 
 	rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
 	if (rc) {
@@ -980,12 +1180,6 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 		goto release_ctx;
 	}
 
-	rc = hl_sysfs_init(hdev);
-	if (rc) {
-		dev_err(hdev->dev, "failed to initialize sysfs\n");
-		goto free_cb_pool;
-	}
-
 	hl_debugfs_add_device(hdev);
 
 	if (hdev->asic_funcs->get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
@@ -994,6 +1188,12 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 		hdev->asic_funcs->hw_fini(hdev, true);
 	}
 
+	/*
+	 * From this point, in case of an error, add char devices and create
+	 * sysfs nodes as part of the error flow, to allow debugging.
+	 */
+	add_cdev_sysfs_on_err = true;
+
 	rc = hdev->asic_funcs->hw_init(hdev);
 	if (rc) {
 		dev_err(hdev->dev, "failed to initialize the H/W\n");
@@ -1030,9 +1230,24 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 	}
 
 	/*
-	 * hl_hwmon_init must be called after device_late_init, because only
+	 * Expose devices and sysfs nodes to user.
+	 * From here there is no need to add char devices and create sysfs nodes
+	 * in case of an error.
+	 */
+	add_cdev_sysfs_on_err = false;
+	rc = device_cdev_sysfs_add(hdev);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to add char devices and sysfs nodes\n");
+		rc = 0;
+		goto out_disabled;
+	}
+
+	/*
+	 * hl_hwmon_init() must be called after device_late_init(), because only
 	 * there we get the information from the device about which
-	 * hwmon-related sensors the device supports
+	 * hwmon-related sensors the device supports.
+	 * Furthermore, it must be done after adding the device to the system.
 	 */
 	rc = hl_hwmon_init(hdev);
 	if (rc) {
@@ -1048,8 +1263,6 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 
 	return 0;
 
-free_cb_pool:
-	hl_cb_pool_fini(hdev);
 release_ctx:
 	if (hl_ctx_put(hdev->kernel_ctx) != 1)
 		dev_err(hdev->dev,
@@ -1068,18 +1281,21 @@ sw_fini:
 	hdev->asic_funcs->sw_fini(hdev);
 early_fini:
 	device_early_fini(hdev);
-release_device:
-	device_destroy(hclass, hdev->dev->devt);
-	cdev_del(&hdev->cdev);
+free_dev_ctrl:
+	kfree(hdev->dev_ctrl);
+free_dev:
+	kfree(hdev->dev);
 out_disabled:
 	hdev->disabled = true;
+	if (add_cdev_sysfs_on_err)
+		device_cdev_sysfs_add(hdev);
 	if (hdev->pdev)
 		dev_err(&hdev->pdev->dev,
 			"Failed to initialize hl%d. Device is NOT usable !\n",
-			hdev->id);
+			hdev->id / 2);
 	else
 		pr_err("Failed to initialize hl%d. Device is NOT usable !\n",
-			hdev->id);
+			hdev->id / 2);
 
 	return rc;
 }
@@ -1120,16 +1336,17 @@ void hl_device_fini(struct hl_device *hdev)
 	/* Mark device as disabled */
 	hdev->disabled = true;
 
-	/*
-	 * Flush anyone that is inside the critical section of enqueue
+	/* Flush anyone that is inside the critical section of enqueue
 	 * jobs to the H/W
 	 */
 	hdev->asic_funcs->hw_queues_lock(hdev);
 	hdev->asic_funcs->hw_queues_unlock(hdev);
 
-	hdev->hard_reset_pending = true;
+	/* Flush anyone that is inside device open */
+	mutex_lock(&hdev->fpriv_list_lock);
+	mutex_unlock(&hdev->fpriv_list_lock);
 
-	device_kill_open_processes(hdev);
+	hdev->hard_reset_pending = true;
 
 	hl_hwmon_fini(hdev);
 
@@ -1137,8 +1354,6 @@ void hl_device_fini(struct hl_device *hdev)
 
 	hl_debugfs_remove_device(hdev);
 
-	hl_sysfs_fini(hdev);
-
 	/*
 	 * Halt the engines and disable interrupts so we won't get any more
 	 * completions from H/W and we won't have any accesses from the
@@ -1149,6 +1364,12 @@ void hl_device_fini(struct hl_device *hdev)
 	/* Go over all the queues, release all CS and their jobs */
 	hl_cs_rollback_all(hdev);
 
+	/* Kill processes here after CS rollback. This is because the process
+	 * can't really exit until all its CSs are done, which is what we
+	 * do in cs rollback
+	 */
+	device_kill_open_processes(hdev);
+
 	hl_cb_pool_fini(hdev);
 
 	/* Release kernel context */
@@ -1175,9 +1396,8 @@ void hl_device_fini(struct hl_device *hdev)
 
 	device_early_fini(hdev);
 
-	/* Hide device from user */
-	device_destroy(hdev->dev->class, hdev->dev->devt);
-	cdev_del(&hdev->cdev);
+	/* Hide devices and sysfs nodes from user */
+	device_cdev_sysfs_del(hdev);
 
 	pr_info("removed device successfully\n");
 }
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 271c5c8f53b4..6fba14b81f90 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -9,6 +9,7 @@
 #include "include/hw_ip/mmu/mmu_general.h"
 #include "include/hw_ip/mmu/mmu_v1_0.h"
 #include "include/goya/asic_reg/goya_masks.h"
+#include "include/goya/goya_reg_map.h"
 
 #include <linux/pci.h>
 #include <linux/genalloc.h>
@@ -41,8 +42,8 @@
  * PQ, CQ and CP are not secured.
  * PQ, CB and the data are on the SRAM/DRAM.
  *
- * Since QMAN DMA is secured, KMD is parsing the DMA CB:
- *     - KMD checks DMA pointer
+ * Since QMAN DMA is secured, the driver is parsing the DMA CB:
+ *     - checks DMA pointer
  *     - WREG, MSG_PROT are not allowed.
  *     - MSG_LONG/SHORT are allowed.
  *
@@ -55,15 +56,15 @@
  * QMAN DMA: PQ, CQ and CP are secured.
  * MMU is set to bypass on the Secure props register of the QMAN.
  * The reasons we don't enable MMU for PQ, CQ and CP are:
- *     - PQ entry is in kernel address space and KMD doesn't map it.
+ *     - PQ entry is in kernel address space and the driver doesn't map it.
  *     - CP writes to MSIX register and to kernel address space (completion
  *       queue).
  *
- * DMA is not secured but because CP is secured, KMD still needs to parse the
- * CB, but doesn't need to check the DMA addresses.
+ * DMA is not secured but because CP is secured, the driver still needs to parse
+ * the CB, but doesn't need to check the DMA addresses.
  *
- * For QMAN DMA 0, DMA is also secured because only KMD uses this DMA and KMD
- * doesn't map memory in MMU.
+ * For QMAN DMA 0, DMA is also secured because only the driver uses this DMA and
+ * the driver doesn't map memory in MMU.
  *
  * QMAN TPC/MME: PQ, CQ and CP aren't secured (no change from MMU disabled mode)
  *
@@ -335,18 +336,18 @@ void goya_get_fixed_properties(struct hl_device *hdev)
 
 	for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) {
 		prop->hw_queues_props[i].type = QUEUE_TYPE_EXT;
-		prop->hw_queues_props[i].kmd_only = 0;
+		prop->hw_queues_props[i].driver_only = 0;
 	}
 
 	for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES ; i++) {
 		prop->hw_queues_props[i].type = QUEUE_TYPE_CPU;
-		prop->hw_queues_props[i].kmd_only = 1;
+		prop->hw_queues_props[i].driver_only = 1;
 	}
 
 	for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES +
 			NUMBER_OF_INT_HW_QUEUES; i++) {
 		prop->hw_queues_props[i].type = QUEUE_TYPE_INT;
-		prop->hw_queues_props[i].kmd_only = 0;
+		prop->hw_queues_props[i].driver_only = 0;
 	}
 
 	for (; i < HL_MAX_QUEUES; i++)
@@ -1006,36 +1007,34 @@ int goya_init_cpu_queues(struct hl_device *hdev)
 
 	eq = &hdev->event_queue;
 
-	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_0,
-			lower_32_bits(cpu_pq->bus_address));
-	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_1,
-			upper_32_bits(cpu_pq->bus_address));
+	WREG32(mmCPU_PQ_BASE_ADDR_LOW, lower_32_bits(cpu_pq->bus_address));
+	WREG32(mmCPU_PQ_BASE_ADDR_HIGH, upper_32_bits(cpu_pq->bus_address));
 
-	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_2, lower_32_bits(eq->bus_address));
-	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_3, upper_32_bits(eq->bus_address));
+	WREG32(mmCPU_EQ_BASE_ADDR_LOW, lower_32_bits(eq->bus_address));
+	WREG32(mmCPU_EQ_BASE_ADDR_HIGH, upper_32_bits(eq->bus_address));
 
-	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_8,
+	WREG32(mmCPU_CQ_BASE_ADDR_LOW,
 			lower_32_bits(VA_CPU_ACCESSIBLE_MEM_ADDR));
-	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_9,
+	WREG32(mmCPU_CQ_BASE_ADDR_HIGH,
 			upper_32_bits(VA_CPU_ACCESSIBLE_MEM_ADDR));
 
-	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_5, HL_QUEUE_SIZE_IN_BYTES);
-	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_4, HL_EQ_SIZE_IN_BYTES);
-	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_10, HL_CPU_ACCESSIBLE_MEM_SIZE);
+	WREG32(mmCPU_PQ_LENGTH, HL_QUEUE_SIZE_IN_BYTES);
+	WREG32(mmCPU_EQ_LENGTH, HL_EQ_SIZE_IN_BYTES);
+	WREG32(mmCPU_CQ_LENGTH, HL_CPU_ACCESSIBLE_MEM_SIZE);
 
 	/* Used for EQ CI */
-	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_6, 0);
+	WREG32(mmCPU_EQ_CI, 0);
 
 	WREG32(mmCPU_IF_PF_PQ_PI, 0);
 
-	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_7, PQ_INIT_STATUS_READY_FOR_CP);
+	WREG32(mmCPU_PQ_INIT_STATUS, PQ_INIT_STATUS_READY_FOR_CP);
 
 	WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
 			GOYA_ASYNC_EVENT_ID_PI_UPDATE);
 
 	err = hl_poll_timeout(
 		hdev,
-		mmPSOC_GLOBAL_CONF_SCRATCHPAD_7,
+		mmCPU_PQ_INIT_STATUS,
 		status,
 		(status == PQ_INIT_STATUS_READY_FOR_HOST),
 		1000,
@@ -2063,6 +2062,25 @@ static void goya_disable_msix(struct hl_device *hdev)
 	goya->hw_cap_initialized &= ~HW_CAP_MSIX;
 }
 
+static void goya_enable_timestamp(struct hl_device *hdev)
+{
+	/* Disable the timestamp counter */
+	WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 0);
+
+	/* Zero the lower/upper parts of the 64-bit counter */
+	WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE + 0xC, 0);
+	WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE + 0x8, 0);
+
+	/* Enable the counter */
+	WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 1);
+}
+
+static void goya_disable_timestamp(struct hl_device *hdev)
+{
+	/* Disable the timestamp counter */
+	WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 0);
+}
+
 static void goya_halt_engines(struct hl_device *hdev, bool hard_reset)
 {
 	u32 wait_timeout_ms, cpu_timeout_ms;
@@ -2103,6 +2121,8 @@ static void goya_halt_engines(struct hl_device *hdev, bool hard_reset)
 	goya_disable_external_queues(hdev);
 	goya_disable_internal_queues(hdev);
 
+	goya_disable_timestamp(hdev);
+
 	if (hard_reset) {
 		goya_disable_msix(hdev);
 		goya_mmu_remove_device_cpu_mappings(hdev);
@@ -2205,12 +2225,12 @@ static void goya_read_device_fw_version(struct hl_device *hdev,
 
 	switch (fwc) {
 	case FW_COMP_UBOOT:
-		ver_off = RREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_29);
+		ver_off = RREG32(mmUBOOT_VER_OFFSET);
 		dest = hdev->asic_prop.uboot_ver;
 		name = "U-Boot";
 		break;
 	case FW_COMP_PREBOOT:
-		ver_off = RREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_28);
+		ver_off = RREG32(mmPREBOOT_VER_OFFSET);
 		dest = hdev->asic_prop.preboot_ver;
 		name = "Preboot";
 		break;
@@ -2469,7 +2489,7 @@ static int goya_hw_init(struct hl_device *hdev)
 	 * we need to reset the chip before doing H/W init. This register is
 	 * cleared by the H/W upon H/W reset
 	 */
-	WREG32(mmPSOC_GLOBAL_CONF_APP_STATUS, HL_DEVICE_HW_STATE_DIRTY);
+	WREG32(mmHW_STATE, HL_DEVICE_HW_STATE_DIRTY);
 
 	rc = goya_init_cpu(hdev, GOYA_CPU_TIMEOUT_USEC);
 	if (rc) {
@@ -2505,6 +2525,8 @@ static int goya_hw_init(struct hl_device *hdev)
 
 	goya_init_tpc_qmans(hdev);
 
+	goya_enable_timestamp(hdev);
+
 	/* MSI-X must be enabled before CPU queues are initialized */
 	rc = goya_enable_msix(hdev);
 	if (rc)
@@ -2831,7 +2853,7 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job)
 
 	if (!hdev->asic_funcs->is_device_idle(hdev, NULL, NULL)) {
 		dev_err_ratelimited(hdev->dev,
-			"Can't send KMD job on QMAN0 because the device is not idle\n");
+			"Can't send driver job on QMAN0 because the device is not idle\n");
 		return -EBUSY;
 	}
 
@@ -3949,7 +3971,7 @@ void goya_add_end_of_cb_packets(struct hl_device *hdev, u64 kernel_address,
 
 void goya_update_eq_ci(struct hl_device *hdev, u32 val)
 {
-	WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_6, val);
+	WREG32(mmCPU_EQ_CI, val);
 }
 
 void goya_restore_phase_topology(struct hl_device *hdev)
@@ -4447,6 +4469,7 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
 	struct goya_device *goya = hdev->asic_specific;
 
 	goya->events_stat[event_type]++;
+	goya->events_stat_aggregate[event_type]++;
 
 	switch (event_type) {
 	case GOYA_ASYNC_EVENT_ID_PCIE_IF:
@@ -4528,12 +4551,16 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
 	}
 }
 
-void *goya_get_events_stat(struct hl_device *hdev, u32 *size)
+void *goya_get_events_stat(struct hl_device *hdev, bool aggregate, u32 *size)
 {
 	struct goya_device *goya = hdev->asic_specific;
 
-	*size = (u32) sizeof(goya->events_stat);
+	if (aggregate) {
+		*size = (u32) sizeof(goya->events_stat_aggregate);
+		return goya->events_stat_aggregate;
+	}
 
+	*size = (u32) sizeof(goya->events_stat);
 	return goya->events_stat;
 }
 
@@ -4934,6 +4961,10 @@ int goya_armcp_info_get(struct hl_device *hdev)
 		prop->dram_end_address = prop->dram_base_address + dram_size;
 	}
 
+	if (!strlen(prop->armcp_info.card_name))
+		strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME,
+				CARD_NAME_MAX_LEN);
+
 	return 0;
 }
 
@@ -5047,7 +5078,7 @@ static int goya_get_eeprom_data(struct hl_device *hdev, void *data,
 
 static enum hl_device_hw_state goya_get_hw_state(struct hl_device *hdev)
 {
-	return RREG32(mmPSOC_GLOBAL_CONF_APP_STATUS);
+	return RREG32(mmHW_STATE);
 }
 
 static const struct hl_asic_funcs goya_funcs = {
diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h
index d7f48c9c41cd..89b6574f8e4f 100644
--- a/drivers/misc/habanalabs/goya/goyaP.h
+++ b/drivers/misc/habanalabs/goya/goyaP.h
@@ -55,6 +55,8 @@
 
 #define DRAM_PHYS_DEFAULT_SIZE		0x100000000ull	/* 4GB */
 
+#define GOYA_DEFAULT_CARD_NAME		"HL1000"
+
 /* DRAM Memory Map */
 
 #define CPU_FW_IMAGE_SIZE		0x10000000	/* 256MB */
@@ -68,19 +70,19 @@
 						MMU_PAGE_TABLES_SIZE)
 #define MMU_CACHE_MNG_ADDR		(MMU_DRAM_DEFAULT_PAGE_ADDR + \
 					MMU_DRAM_DEFAULT_PAGE_SIZE)
-#define DRAM_KMD_END_ADDR		(MMU_CACHE_MNG_ADDR + \
+#define DRAM_DRIVER_END_ADDR		(MMU_CACHE_MNG_ADDR + \
 						MMU_CACHE_MNG_SIZE)
 
 #define DRAM_BASE_ADDR_USER		0x20000000
 
-#if (DRAM_KMD_END_ADDR > DRAM_BASE_ADDR_USER)
-#error "KMD must reserve no more than 512MB"
+#if (DRAM_DRIVER_END_ADDR > DRAM_BASE_ADDR_USER)
+#error "Driver must reserve no more than 512MB"
 #endif
 
 /*
- * SRAM Memory Map for KMD
+ * SRAM Memory Map for Driver
  *
- * KMD occupies KMD_SRAM_SIZE bytes from the start of SRAM. It is used for
+ * Driver occupies DRIVER_SRAM_SIZE bytes from the start of SRAM. It is used for
  * MME/TPC QMANs
  *
  */
@@ -106,10 +108,10 @@
 #define TPC7_QMAN_BASE_OFFSET	(TPC6_QMAN_BASE_OFFSET + \
 				(TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
 
-#define SRAM_KMD_RES_OFFSET	(TPC7_QMAN_BASE_OFFSET + \
+#define SRAM_DRIVER_RES_OFFSET	(TPC7_QMAN_BASE_OFFSET + \
 				(TPC_QMAN_LENGTH * QMAN_PQ_ENTRY_SIZE))
 
-#if (SRAM_KMD_RES_OFFSET >= GOYA_KMD_SRAM_RESERVED_SIZE_FROM_START)
+#if (SRAM_DRIVER_RES_OFFSET >= GOYA_KMD_SRAM_RESERVED_SIZE_FROM_START)
 #error "MME/TPC QMANs SRAM space exceeds limit"
 #endif
 
@@ -162,6 +164,7 @@ struct goya_device {
 
 	u64		ddr_bar_cur_addr;
 	u32		events_stat[GOYA_ASYNC_EVENT_ID_SIZE];
+	u32		events_stat_aggregate[GOYA_ASYNC_EVENT_ID_SIZE];
 	u32		hw_cap_initialized;
 	u8		device_cpu_mmu_mappings_done;
 };
@@ -215,7 +218,7 @@ int goya_suspend(struct hl_device *hdev);
 int goya_resume(struct hl_device *hdev);
 
 void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry);
-void *goya_get_events_stat(struct hl_device *hdev, u32 *size);
+void *goya_get_events_stat(struct hl_device *hdev, bool aggregate, u32 *size);
 
 void goya_add_end_of_cb_packets(struct hl_device *hdev, u64 kernel_address,
 				u32 len, u64 cq_addr, u32 cq_val, u32 msix_vec);
diff --git a/drivers/misc/habanalabs/goya/goya_coresight.c b/drivers/misc/habanalabs/goya/goya_coresight.c
index d7ec7ad84cc6..b4d406af1bed 100644
--- a/drivers/misc/habanalabs/goya/goya_coresight.c
+++ b/drivers/misc/habanalabs/goya/goya_coresight.c
@@ -15,6 +15,10 @@
 
 #define GOYA_PLDM_CORESIGHT_TIMEOUT_USEC	(CORESIGHT_TIMEOUT_USEC * 100)
 
+#define SPMU_SECTION_SIZE		DMA_CH_0_CS_SPMU_MAX_OFFSET
+#define SPMU_EVENT_TYPES_OFFSET		0x400
+#define SPMU_MAX_COUNTERS		6
+
 static u64 debug_stm_regs[GOYA_STM_LAST + 1] = {
 	[GOYA_STM_CPU]		= mmCPU_STM_BASE,
 	[GOYA_STM_DMA_CH_0_CS]	= mmDMA_CH_0_CS_STM_BASE,
@@ -226,9 +230,16 @@ static int goya_config_stm(struct hl_device *hdev,
 		struct hl_debug_params *params)
 {
 	struct hl_debug_params_stm *input;
-	u64 base_reg = debug_stm_regs[params->reg_idx] - CFG_BASE;
+	u64 base_reg;
 	int rc;
 
+	if (params->reg_idx >= ARRAY_SIZE(debug_stm_regs)) {
+		dev_err(hdev->dev, "Invalid register index in STM\n");
+		return -EINVAL;
+	}
+
+	base_reg = debug_stm_regs[params->reg_idx] - CFG_BASE;
+
 	WREG32(base_reg + 0xFB0, CORESIGHT_UNLOCK);
 
 	if (params->enable) {
@@ -288,10 +299,17 @@ static int goya_config_etf(struct hl_device *hdev,
 		struct hl_debug_params *params)
 {
 	struct hl_debug_params_etf *input;
-	u64 base_reg = debug_etf_regs[params->reg_idx] - CFG_BASE;
+	u64 base_reg;
 	u32 val;
 	int rc;
 
+	if (params->reg_idx >= ARRAY_SIZE(debug_etf_regs)) {
+		dev_err(hdev->dev, "Invalid register index in ETF\n");
+		return -EINVAL;
+	}
+
+	base_reg = debug_etf_regs[params->reg_idx] - CFG_BASE;
+
 	WREG32(base_reg + 0xFB0, CORESIGHT_UNLOCK);
 
 	val = RREG32(base_reg + 0x304);
@@ -445,11 +463,18 @@ static int goya_config_etr(struct hl_device *hdev,
 static int goya_config_funnel(struct hl_device *hdev,
 		struct hl_debug_params *params)
 {
-	WREG32(debug_funnel_regs[params->reg_idx] - CFG_BASE + 0xFB0,
-			CORESIGHT_UNLOCK);
+	u64 base_reg;
+
+	if (params->reg_idx >= ARRAY_SIZE(debug_funnel_regs)) {
+		dev_err(hdev->dev, "Invalid register index in FUNNEL\n");
+		return -EINVAL;
+	}
+
+	base_reg = debug_funnel_regs[params->reg_idx] - CFG_BASE;
+
+	WREG32(base_reg + 0xFB0, CORESIGHT_UNLOCK);
 
-	WREG32(debug_funnel_regs[params->reg_idx] - CFG_BASE,
-			params->enable ? 0x33F : 0);
+	WREG32(base_reg, params->enable ? 0x33F : 0);
 
 	return 0;
 }
@@ -458,9 +483,16 @@ static int goya_config_bmon(struct hl_device *hdev,
 		struct hl_debug_params *params)
 {
 	struct hl_debug_params_bmon *input;
-	u64 base_reg = debug_bmon_regs[params->reg_idx] - CFG_BASE;
+	u64 base_reg;
 	u32 pcie_base = 0;
 
+	if (params->reg_idx >= ARRAY_SIZE(debug_bmon_regs)) {
+		dev_err(hdev->dev, "Invalid register index in BMON\n");
+		return -EINVAL;
+	}
+
+	base_reg = debug_bmon_regs[params->reg_idx] - CFG_BASE;
+
 	WREG32(base_reg + 0x104, 1);
 
 	if (params->enable) {
@@ -522,7 +554,7 @@ static int goya_config_bmon(struct hl_device *hdev,
 static int goya_config_spmu(struct hl_device *hdev,
 		struct hl_debug_params *params)
 {
-	u64 base_reg = debug_spmu_regs[params->reg_idx] - CFG_BASE;
+	u64 base_reg;
 	struct hl_debug_params_spmu *input = params->input;
 	u64 *output;
 	u32 output_arr_len;
@@ -531,6 +563,13 @@ static int goya_config_spmu(struct hl_device *hdev,
 	u32 cycle_cnt_idx;
 	int i;
 
+	if (params->reg_idx >= ARRAY_SIZE(debug_spmu_regs)) {
+		dev_err(hdev->dev, "Invalid register index in SPMU\n");
+		return -EINVAL;
+	}
+
+	base_reg = debug_spmu_regs[params->reg_idx] - CFG_BASE;
+
 	if (params->enable) {
 		input = params->input;
 
@@ -539,7 +578,13 @@ static int goya_config_spmu(struct hl_device *hdev,
 
 		if (input->event_types_num < 3) {
 			dev_err(hdev->dev,
-				"not enough values for SPMU enable\n");
+				"not enough event types values for SPMU enable\n");
+			return -EINVAL;
+		}
+
+		if (input->event_types_num > SPMU_MAX_COUNTERS) {
+			dev_err(hdev->dev,
+				"too many event types values for SPMU enable\n");
 			return -EINVAL;
 		}
 
@@ -547,7 +592,8 @@ static int goya_config_spmu(struct hl_device *hdev,
 		WREG32(base_reg + 0xE04, 0x41013040);
 
 		for (i = 0 ; i < input->event_types_num ; i++)
-			WREG32(base_reg + 0x400 + i * 4, input->event_types[i]);
+			WREG32(base_reg + SPMU_EVENT_TYPES_OFFSET + i * 4,
+				input->event_types[i]);
 
 		WREG32(base_reg + 0xE04, 0x41013041);
 		WREG32(base_reg + 0xC00, 0x8000003F);
@@ -567,6 +613,12 @@ static int goya_config_spmu(struct hl_device *hdev,
 			return -EINVAL;
 		}
 
+		if (events_num > SPMU_MAX_COUNTERS) {
+			dev_err(hdev->dev,
+				"too many events values for SPMU disable\n");
+			return -EINVAL;
+		}
+
 		WREG32(base_reg + 0xE04, 0x41013040);
 
 		for (i = 0 ; i < events_num ; i++)
@@ -584,24 +636,11 @@ static int goya_config_spmu(struct hl_device *hdev,
 	return 0;
 }
 
-static int goya_config_timestamp(struct hl_device *hdev,
-		struct hl_debug_params *params)
-{
-	WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 0);
-	if (params->enable) {
-		WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE + 0xC, 0);
-		WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE + 0x8, 0);
-		WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 1);
-	}
-
-	return 0;
-}
-
 int goya_debug_coresight(struct hl_device *hdev, void *data)
 {
 	struct hl_debug_params *params = data;
 	u32 val;
-	int rc;
+	int rc = 0;
 
 	switch (params->op) {
 	case HL_DEBUG_OP_STM:
@@ -623,7 +662,7 @@ int goya_debug_coresight(struct hl_device *hdev, void *data)
 		rc = goya_config_spmu(hdev, params);
 		break;
 	case HL_DEBUG_OP_TIMESTAMP:
-		rc = goya_config_timestamp(hdev, params);
+		/* Do nothing as this opcode is deprecated */
 		break;
 
 	default:
diff --git a/drivers/misc/habanalabs/goya/goya_hwmgr.c b/drivers/misc/habanalabs/goya/goya_hwmgr.c
index 088692c852b6..a2a700c3d597 100644
--- a/drivers/misc/habanalabs/goya/goya_hwmgr.c
+++ b/drivers/misc/habanalabs/goya/goya_hwmgr.c
@@ -230,18 +230,127 @@ static ssize_t ic_clk_curr_show(struct device *dev,
 	return sprintf(buf, "%lu\n", value);
 }
 
+static ssize_t pm_mng_profile_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct hl_device *hdev = dev_get_drvdata(dev);
+
+	if (hl_device_disabled_or_in_reset(hdev))
+		return -ENODEV;
+
+	return sprintf(buf, "%s\n",
+			(hdev->pm_mng_profile == PM_AUTO) ? "auto" :
+			(hdev->pm_mng_profile == PM_MANUAL) ? "manual" :
+			"unknown");
+}
+
+static ssize_t pm_mng_profile_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct hl_device *hdev = dev_get_drvdata(dev);
+
+	if (hl_device_disabled_or_in_reset(hdev)) {
+		count = -ENODEV;
+		goto out;
+	}
+	/* compute_ctx and pm_mng_profile are examined under fpriv_list_lock */
+	mutex_lock(&hdev->fpriv_list_lock);
+
+	if (hdev->compute_ctx) {
+		dev_err(hdev->dev,
+			"Can't change PM profile while compute context is opened on the device\n");
+		count = -EPERM;
+		goto unlock_mutex;
+	}
+	/* Prefix match: characters in buf after "auto"/"manual" are ignored */
+	if (strncmp("auto", buf, strlen("auto")) == 0) {
+		/* Make sure we are in LOW PLL when changing modes */
+		if (hdev->pm_mng_profile == PM_MANUAL) {
+			hdev->curr_pll_profile = PLL_HIGH;
+			hl_device_set_frequency(hdev, PLL_LOW);
+			hdev->pm_mng_profile = PM_AUTO;
+		}
+	} else if (strncmp("manual", buf, strlen("manual")) == 0) {
+		if (hdev->pm_mng_profile == PM_AUTO) {
+			/* Must release the lock because the work thread also
+			 * takes this lock. But before we release it, set
+			 * the mode to manual so nothing will change if a user
+			 * suddenly opens the device
+			 */
+			hdev->pm_mng_profile = PM_MANUAL;
+
+			mutex_unlock(&hdev->fpriv_list_lock);
+
+			/* Flush the current work so we can return to the user
+			 * knowing that he is the only one changing frequencies
+			 */
+			flush_delayed_work(&hdev->work_freq);
+
+			return count; /* lock was already released above */
+		}
+	} else {
+		dev_err(hdev->dev, "value should be auto or manual\n");
+		count = -EINVAL;
+	}
+
+unlock_mutex:
+	mutex_unlock(&hdev->fpriv_list_lock);
+out:
+	return count;
+}
+
+static ssize_t high_pll_show(struct device *dev, struct device_attribute *attr,
+				char *buf)
+{
+	struct hl_device *hdev = dev_get_drvdata(dev);
+
+	if (hl_device_disabled_or_in_reset(hdev))
+		return -ENODEV;
+
+	return sprintf(buf, "%u\n", hdev->high_pll);
+}
+
+static ssize_t high_pll_store(struct device *dev, struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	struct hl_device *hdev = dev_get_drvdata(dev);
+	unsigned long value;	/* the type kstrtoul() stores into */
+	int rc;
+
+	if (hl_device_disabled_or_in_reset(hdev)) {
+		count = -ENODEV;
+		goto out;
+	}
+
+	/* Base 0: kstrtoul() auto-detects decimal, octal or hex input */
+	rc = kstrtoul(buf, 0, &value);
+	if (rc) {
+		count = -EINVAL;
+		goto out;
+	}
+
+	/* Only the cached value is updated; no H/W access in this handler */
+	hdev->high_pll = value;
+
+out:
+	return count;
+}
+
+static DEVICE_ATTR_RW(high_pll);
 static DEVICE_ATTR_RW(ic_clk);
 static DEVICE_ATTR_RO(ic_clk_curr);
 static DEVICE_ATTR_RW(mme_clk);
 static DEVICE_ATTR_RO(mme_clk_curr);
+static DEVICE_ATTR_RW(pm_mng_profile);
 static DEVICE_ATTR_RW(tpc_clk);
 static DEVICE_ATTR_RO(tpc_clk_curr);
 
 static struct attribute *goya_dev_attrs[] = {
+	&dev_attr_high_pll.attr,
 	&dev_attr_ic_clk.attr,
 	&dev_attr_ic_clk_curr.attr,
 	&dev_attr_mme_clk.attr,
 	&dev_attr_mme_clk_curr.attr,
+	&dev_attr_pm_mng_profile.attr,
 	&dev_attr_tpc_clk.attr,
 	&dev_attr_tpc_clk_curr.attr,
 	NULL,
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index ce83adafcf2d..75862be53c60 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -36,6 +36,8 @@
 
 #define HL_PCI_ELBI_TIMEOUT_MSEC	10 /* 10ms */
 
+#define HL_SIM_MAX_TIMEOUT_US		10000000 /* 10s */
+
 #define HL_MAX_QUEUES			128
 
 #define HL_MAX_JOBS_PER_CS		64
@@ -43,6 +45,8 @@
 /* MUST BE POWER OF 2 and larger than 1 */
 #define HL_MAX_PENDING_CS		64
 
+#define HL_IDLE_BUSY_TS_ARR_SIZE	4096
+
 /* Memory */
 #define MEM_HASH_TABLE_BITS		7 /* 1 << 7 buckets */
 
@@ -92,12 +96,12 @@ enum hl_queue_type {
 /**
  * struct hw_queue_properties - queue information.
  * @type: queue type.
- * @kmd_only: true if only KMD is allowed to send a job to this queue, false
- *            otherwise.
+ * @driver_only: true if only the driver is allowed to send a job to this queue,
+ *               false otherwise.
  */
 struct hw_queue_properties {
 	enum hl_queue_type	type;
-	u8			kmd_only;
+	u8			driver_only;
 };
 
 /**
@@ -320,7 +324,7 @@ struct hl_cs_job;
 #define HL_EQ_LENGTH			64
 #define HL_EQ_SIZE_IN_BYTES		(HL_EQ_LENGTH * HL_EQ_ENTRY_SIZE)
 
-/* KMD <-> ArmCP shared memory size */
+/* Host <-> ArmCP shared memory size */
 #define HL_CPU_ACCESSIBLE_MEM_SIZE	SZ_2M
 
 /**
@@ -401,7 +405,7 @@ struct hl_cs_parser;
 
 /**
  * enum hl_pm_mng_profile - power management profile.
- * @PM_AUTO: internal clock is set by KMD.
+ * @PM_AUTO: internal clock is set by the Linux driver.
  * @PM_MANUAL: internal clock is set by the user.
  * @PM_LAST: last power management type.
  */
@@ -554,7 +558,8 @@ struct hl_asic_funcs {
 				struct hl_eq_entry *eq_entry);
 	void (*set_pll_profile)(struct hl_device *hdev,
 			enum hl_pll_frequency freq);
-	void* (*get_events_stat)(struct hl_device *hdev, u32 *size);
+	void* (*get_events_stat)(struct hl_device *hdev, bool aggregate,
+				u32 *size);
 	u64 (*read_pte)(struct hl_device *hdev, u64 addr);
 	void (*write_pte)(struct hl_device *hdev, u64 addr, u64 val);
 	void (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard);
@@ -608,7 +613,7 @@ struct hl_va_range {
  *		descriptor (hl_vm_phys_pg_list or hl_userptr).
  * @mmu_phys_hash: holds a mapping from physical address to pgt_info structure.
  * @mmu_shadow_hash: holds a mapping from shadow address to pgt_info structure.
- * @hpriv: pointer to the private (KMD) data of the process (fd).
+ * @hpriv: pointer to the private (Kernel Driver) data of the process (fd).
  * @hdev: pointer to the device structure.
  * @refcount: reference counter for the context. Context is released only when
  *		this hits 0l. It is incremented on CS and CS_WAIT.
@@ -634,6 +639,7 @@ struct hl_va_range {
  *				execution phase before the context switch phase
  *				has finished.
  * @asid: context's unique address space ID in the device's MMU.
+ * @handle: context's opaque handle for user
  */
 struct hl_ctx {
 	DECLARE_HASHTABLE(mem_hash, MEM_HASH_TABLE_BITS);
@@ -655,6 +661,7 @@ struct hl_ctx {
 	atomic_t		thread_ctx_switch_token;
 	u32			thread_ctx_switch_wait_token;
 	u32			asid;
+	u32			handle;
 };
 
 /**
@@ -906,23 +913,27 @@ struct hl_debug_params {
  * @hdev: habanalabs device structure.
  * @filp: pointer to the given file structure.
  * @taskpid: current process ID.
- * @ctx: current executing context.
+ * @ctx: current executing context. TODO: remove for multiple ctx per process
  * @ctx_mgr: context manager to handle multiple context for this FD.
  * @cb_mgr: command buffer manager to handle multiple buffers for this FD.
  * @debugfs_list: list of relevant ASIC debugfs.
+ * @dev_node: node in the device list of file private data
  * @refcount: number of related contexts.
  * @restore_phase_mutex: lock for context switch and restore phase.
+ * @is_control: true for control device, false otherwise
  */
 struct hl_fpriv {
 	struct hl_device	*hdev;
 	struct file		*filp;
 	struct pid		*taskpid;
-	struct hl_ctx		*ctx; /* TODO: remove for multiple ctx */
+	struct hl_ctx		*ctx;
 	struct hl_ctx_mgr	ctx_mgr;
 	struct hl_cb_mgr	cb_mgr;
 	struct list_head	debugfs_list;
+	struct list_head	dev_node;
 	struct kref		refcount;
 	struct mutex		restore_phase_mutex;
+	u8			is_control;
 };
 
 
@@ -1009,7 +1020,7 @@ struct hl_dbg_device_entry {
  */
 
 /* Theoretical limit only. A single host can only contain up to 4 or 8 PCIe
- * x16 cards. In extereme cases, there are hosts that can accommodate 16 cards
+ * x16 cards. In extreme cases, there are hosts that can accommodate 16 cards.
  */
 #define HL_MAX_MINORS	256
 
@@ -1041,14 +1052,18 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
 	WREG32(mm##reg, (RREG32(mm##reg) & ~REG_FIELD_MASK(reg, field)) | \
 			(val) << REG_FIELD_SHIFT(reg, field))
 
+/* Timeout should be longer when working with simulator but cap the
+ * increased timeout to some maximum
+ */
 #define hl_poll_timeout(hdev, addr, val, cond, sleep_us, timeout_us) \
 ({ \
 	ktime_t __timeout; \
-	/* timeout should be longer when working with simulator */ \
 	if (hdev->pdev) \
 		__timeout = ktime_add_us(ktime_get(), timeout_us); \
 	else \
-		__timeout = ktime_add_us(ktime_get(), (timeout_us * 10)); \
+		__timeout = ktime_add_us(ktime_get(),\
+				min((u64)(timeout_us * 10), \
+					(u64) HL_SIM_MAX_TIMEOUT_US)); \
 	might_sleep_if(sleep_us); \
 	for (;;) { \
 		(val) = RREG32(addr); \
@@ -1080,24 +1095,25 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
 				mem_written_by_device) \
 ({ \
 	ktime_t __timeout; \
-	/* timeout should be longer when working with simulator */ \
 	if (hdev->pdev) \
 		__timeout = ktime_add_us(ktime_get(), timeout_us); \
 	else \
-		__timeout = ktime_add_us(ktime_get(), (timeout_us * 10)); \
+		__timeout = ktime_add_us(ktime_get(),\
+				min((u64)(timeout_us * 10), \
+					(u64) HL_SIM_MAX_TIMEOUT_US)); \
 	might_sleep_if(sleep_us); \
 	for (;;) { \
 		/* Verify we read updates done by other cores or by device */ \
 		mb(); \
 		(val) = *((u32 *) (uintptr_t) (addr)); \
 		if (mem_written_by_device) \
-			(val) = le32_to_cpu(val); \
+			(val) = le32_to_cpu(*(__le32 *) &(val)); \
 		if (cond) \
 			break; \
 		if (timeout_us && ktime_compare(ktime_get(), __timeout) > 0) { \
 			(val) = *((u32 *) (uintptr_t) (addr)); \
 			if (mem_written_by_device) \
-				(val) = le32_to_cpu(val); \
+				(val) = le32_to_cpu(*(__le32 *) &(val)); \
 			break; \
 		} \
 		if (sleep_us) \
@@ -1110,11 +1126,12 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
 					timeout_us) \
 ({ \
 	ktime_t __timeout; \
-	/* timeout should be longer when working with simulator */ \
 	if (hdev->pdev) \
 		__timeout = ktime_add_us(ktime_get(), timeout_us); \
 	else \
-		__timeout = ktime_add_us(ktime_get(), (timeout_us * 10)); \
+		__timeout = ktime_add_us(ktime_get(),\
+				min((u64)(timeout_us * 10), \
+					(u64) HL_SIM_MAX_TIMEOUT_US)); \
 	might_sleep_if(sleep_us); \
 	for (;;) { \
 		(val) = readl(addr); \
@@ -1143,12 +1160,24 @@ struct hl_device_reset_work {
 };
 
 /**
+ * struct hl_device_idle_busy_ts - used for calculating device utilization rate.
+ * @idle_to_busy_ts: timestamp where device changed from idle to busy.
+ * @busy_to_idle_ts: timestamp where device changed from busy to idle.
+ */
+struct hl_device_idle_busy_ts {
+	ktime_t				idle_to_busy_ts;
+	ktime_t				busy_to_idle_ts;
+};
+
+/**
  * struct hl_device - habanalabs device structure.
  * @pdev: pointer to PCI device, can be NULL in case of simulator device.
  * @pcie_bar: array of available PCIe bars.
  * @rmmio: configuration area address on SRAM.
  * @cdev: related char device.
- * @dev: realted kernel basic device structure.
+ * @cdev_ctrl: char device for control operations only (INFO IOCTL)
+ * @dev: related kernel basic device structure.
+ * @dev_ctrl: related kernel device structure for the control device
  * @work_freq: delayed work to lower device frequency if possible.
  * @work_heartbeat: delayed work for ArmCP is-alive check.
  * @asic_name: ASIC specific nmae.
@@ -1156,25 +1185,19 @@ struct hl_device_reset_work {
  * @completion_queue: array of hl_cq.
  * @cq_wq: work queue of completion queues for executing work in process context
  * @eq_wq: work queue of event queue for executing work in process context.
- * @kernel_ctx: KMD context structure.
+ * @kernel_ctx: Kernel driver context structure.
  * @kernel_queues: array of hl_hw_queue.
  * @hw_queues_mirror_list: CS mirror list for TDR.
  * @hw_queues_mirror_lock: protects hw_queues_mirror_list.
  * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CGs.
  * @event_queue: event queue for IRQ from ArmCP.
  * @dma_pool: DMA pool for small allocations.
- * @cpu_accessible_dma_mem: KMD <-> ArmCP shared memory CPU address.
- * @cpu_accessible_dma_address: KMD <-> ArmCP shared memory DMA address.
- * @cpu_accessible_dma_pool: KMD <-> ArmCP shared memory pool.
+ * @cpu_accessible_dma_mem: Host <-> ArmCP shared memory CPU address.
+ * @cpu_accessible_dma_address: Host <-> ArmCP shared memory DMA address.
+ * @cpu_accessible_dma_pool: Host <-> ArmCP shared memory pool.
  * @asid_bitmap: holds used/available ASIDs.
  * @asid_mutex: protects asid_bitmap.
- * @fd_open_cnt_lock: lock for updating fd_open_cnt in hl_device_open. Although
- *                    fd_open_cnt is atomic, we need this lock to serialize
- *                    the open function because the driver currently supports
- *                    only a single process at a time. In addition, we need a
- *                    lock here so we can flush user processes which are opening
- *                    the device while we are trying to hard reset it
- * @send_cpu_message_lock: enforces only one message in KMD <-> ArmCP queue.
+ * @send_cpu_message_lock: enforces only one message in Host <-> ArmCP queue.
  * @debug_lock: protects critical section of setting debug mode for device
  * @asic_prop: ASIC specific immutable properties.
  * @asic_funcs: ASIC specific functions.
@@ -1189,22 +1212,28 @@ struct hl_device_reset_work {
  * @hl_debugfs: device's debugfs manager.
  * @cb_pool: list of preallocated CBs.
  * @cb_pool_lock: protects the CB pool.
- * @user_ctx: current user context executing.
+ * @fpriv_list: list of file private data structures. Each structure is created
+ *              when a user opens the device
+ * @fpriv_list_lock: protects the fpriv_list
+ * @compute_ctx: current compute context executing.
+ * @idle_busy_ts_arr: array to hold time stamps of transitions from idle to busy
+ *                    and vice-versa
  * @dram_used_mem: current DRAM memory consumption.
  * @timeout_jiffies: device CS timeout value.
  * @max_power: the max power of the device, as configured by the sysadmin. This
- *             value is saved so in case of hard-reset, KMD will restore this
- *             value and update the F/W after the re-initialization
+ *             value is saved so in case of hard-reset, the driver will restore
+ *             this value and update the F/W after the re-initialization
  * @in_reset: is device in reset flow.
  * @curr_pll_profile: current PLL profile.
- * @fd_open_cnt: number of open user processes.
  * @cs_active_cnt: number of active command submissions on this device (active
  *                 means already in H/W queues)
- * @major: habanalabs KMD major.
+ * @major: habanalabs kernel driver major.
  * @high_pll: high PLL profile frequency.
- * @soft_reset_cnt: number of soft reset since KMD loading.
- * @hard_reset_cnt: number of hard reset since KMD loading.
+ * @soft_reset_cnt: number of soft resets since the driver was loaded.
+ * @hard_reset_cnt: number of hard resets since the driver was loaded.
+ * @idle_busy_ts_idx: index of current entry in idle_busy_ts_arr
  * @id: device minor.
+ * @id_control: minor of the control device
  * @disabled: is device disabled.
  * @late_init_done: is late init stage was done during initialization.
  * @hwmon_initialized: is H/W monitor sensors was initialized.
@@ -1218,15 +1247,18 @@ struct hl_device_reset_work {
  * @mmu_enable: is MMU enabled.
  * @device_cpu_disabled: is the device CPU disabled (due to timeouts)
  * @dma_mask: the dma mask that was set for this device
- * @in_debug: is device under debug. This, together with fd_open_cnt, enforces
+ * @in_debug: is device under debug. This, together with fpriv_list, enforces
  *            that only a single user is configuring the debug infrastructure.
+ * @cdev_sysfs_created: were char devices and sysfs nodes created.
  */
 struct hl_device {
 	struct pci_dev			*pdev;
 	void __iomem			*pcie_bar[6];
 	void __iomem			*rmmio;
 	struct cdev			cdev;
+	struct cdev			cdev_ctrl;
 	struct device			*dev;
+	struct device			*dev_ctrl;
 	struct delayed_work		work_freq;
 	struct delayed_work		work_heartbeat;
 	char				asic_name[16];
@@ -1246,8 +1278,6 @@ struct hl_device {
 	struct gen_pool			*cpu_accessible_dma_pool;
 	unsigned long			*asid_bitmap;
 	struct mutex			asid_mutex;
-	/* TODO: remove fd_open_cnt_lock for multiple process support */
-	struct mutex			fd_open_cnt_lock;
 	struct mutex			send_cpu_message_lock;
 	struct mutex			debug_lock;
 	struct asic_fixed_properties	asic_prop;
@@ -1266,21 +1296,26 @@ struct hl_device {
 	struct list_head		cb_pool;
 	spinlock_t			cb_pool_lock;
 
-	/* TODO: remove user_ctx for multiple process support */
-	struct hl_ctx			*user_ctx;
+	struct list_head		fpriv_list;
+	struct mutex			fpriv_list_lock;
+
+	struct hl_ctx			*compute_ctx;
+
+	struct hl_device_idle_busy_ts	*idle_busy_ts_arr;
 
 	atomic64_t			dram_used_mem;
 	u64				timeout_jiffies;
 	u64				max_power;
 	atomic_t			in_reset;
-	atomic_t			curr_pll_profile;
-	atomic_t			fd_open_cnt;
-	atomic_t			cs_active_cnt;
+	enum hl_pll_frequency		curr_pll_profile;
+	int				cs_active_cnt;
 	u32				major;
 	u32				high_pll;
 	u32				soft_reset_cnt;
 	u32				hard_reset_cnt;
+	u32				idle_busy_ts_idx;
 	u16				id;
+	u16				id_control;
 	u8				disabled;
 	u8				late_init_done;
 	u8				hwmon_initialized;
@@ -1293,6 +1328,7 @@ struct hl_device {
 	u8				device_cpu_disabled;
 	u8				dma_mask;
 	u8				in_debug;
+	u8				cdev_sysfs_created;
 
 	/* Parameters for bring-up */
 	u8				mmu_enable;
@@ -1386,6 +1422,7 @@ static inline bool hl_mem_area_crosses_range(u64 address, u32 size,
 }
 
 int hl_device_open(struct inode *inode, struct file *filp);
+int hl_device_open_ctrl(struct inode *inode, struct file *filp);
 bool hl_device_disabled_or_in_reset(struct hl_device *hdev);
 enum hl_device_status hl_device_status(struct hl_device *hdev);
 int hl_device_set_debug_mode(struct hl_device *hdev, bool enable);
@@ -1439,6 +1476,7 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 void hl_hpriv_get(struct hl_fpriv *hpriv);
 void hl_hpriv_put(struct hl_fpriv *hpriv);
 int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq);
+uint32_t hl_device_utilization(struct hl_device *hdev, uint32_t period_ms);
 
 int hl_build_hwmon_channel_info(struct hl_device *hdev,
 		struct armcp_sensor *sensors_arr);
@@ -1625,6 +1663,7 @@ static inline void hl_debugfs_remove_ctx_mem_hash(struct hl_device *hdev,
 
 /* IOCTLs */
 long hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
+long hl_ioctl_control(struct file *filep, unsigned int cmd, unsigned long arg);
 int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data);
 int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data);
 int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data);
diff --git a/drivers/misc/habanalabs/habanalabs_drv.c b/drivers/misc/habanalabs/habanalabs_drv.c
index 6f6dbe93f1df..8c342fb499ca 100644
--- a/drivers/misc/habanalabs/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/habanalabs_drv.c
@@ -95,80 +95,127 @@ int hl_device_open(struct inode *inode, struct file *filp)
 		return -ENXIO;
 	}
 
-	mutex_lock(&hdev->fd_open_cnt_lock);
+	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
+	if (!hpriv)
+		return -ENOMEM;
+
+	hpriv->hdev = hdev;
+	filp->private_data = hpriv;
+	hpriv->filp = filp;
+	mutex_init(&hpriv->restore_phase_mutex);
+	kref_init(&hpriv->refcount);
+	nonseekable_open(inode, filp);
+
+	hl_cb_mgr_init(&hpriv->cb_mgr);
+	hl_ctx_mgr_init(&hpriv->ctx_mgr);
+
+	hpriv->taskpid = find_get_pid(current->pid);
+
+	mutex_lock(&hdev->fpriv_list_lock);
 
 	if (hl_device_disabled_or_in_reset(hdev)) {
 		dev_err_ratelimited(hdev->dev,
 			"Can't open %s because it is disabled or in reset\n",
 			dev_name(hdev->dev));
-		mutex_unlock(&hdev->fd_open_cnt_lock);
-		return -EPERM;
+		rc = -EPERM;
+		goto out_err;
 	}
 
 	if (hdev->in_debug) {
 		dev_err_ratelimited(hdev->dev,
 			"Can't open %s because it is being debugged by another user\n",
 			dev_name(hdev->dev));
-		mutex_unlock(&hdev->fd_open_cnt_lock);
-		return -EPERM;
+		rc = -EPERM;
+		goto out_err;
 	}
 
-	if (atomic_read(&hdev->fd_open_cnt)) {
-		dev_info_ratelimited(hdev->dev,
+	if (hdev->compute_ctx) {
+		dev_dbg_ratelimited(hdev->dev,
 			"Can't open %s because another user is working on it\n",
 			dev_name(hdev->dev));
-		mutex_unlock(&hdev->fd_open_cnt_lock);
-		return -EBUSY;
-	}
-
-	atomic_inc(&hdev->fd_open_cnt);
-
-	mutex_unlock(&hdev->fd_open_cnt_lock);
-
-	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
-	if (!hpriv) {
-		rc = -ENOMEM;
-		goto close_device;
+		rc = -EBUSY;
+		goto out_err;
 	}
 
-	hpriv->hdev = hdev;
-	filp->private_data = hpriv;
-	hpriv->filp = filp;
-	mutex_init(&hpriv->restore_phase_mutex);
-	kref_init(&hpriv->refcount);
-	nonseekable_open(inode, filp);
-
-	hl_cb_mgr_init(&hpriv->cb_mgr);
-	hl_ctx_mgr_init(&hpriv->ctx_mgr);
-
 	rc = hl_ctx_create(hdev, hpriv);
 	if (rc) {
-		dev_err(hdev->dev, "Failed to open FD (CTX fail)\n");
+		dev_err(hdev->dev, "Failed to create context %d\n", rc);
 		goto out_err;
 	}
 
-	hpriv->taskpid = find_get_pid(current->pid);
-
-	/*
-	 * Device is IDLE at this point so it is legal to change PLLs. There
-	 * is no need to check anything because if the PLL is already HIGH, the
-	 * set function will return without doing anything
+	/* Device is IDLE at this point so it is legal to change PLLs.
+	 * There is no need to check anything because if the PLL is
+	 * already HIGH, the set function will return without doing
+	 * anything
 	 */
 	hl_device_set_frequency(hdev, PLL_HIGH);
 
+	list_add(&hpriv->dev_node, &hdev->fpriv_list);
+	mutex_unlock(&hdev->fpriv_list_lock);
+
 	hl_debugfs_add_file(hpriv);
 
 	return 0;
 
 out_err:
-	filp->private_data = NULL;
-	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
+	mutex_unlock(&hdev->fpriv_list_lock);
+
 	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
+	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
+	filp->private_data = NULL;
 	mutex_destroy(&hpriv->restore_phase_mutex);
+	put_pid(hpriv->taskpid);
+
 	kfree(hpriv);
+	return rc;
+}
+
+int hl_device_open_ctrl(struct inode *inode, struct file *filp)
+{
+	struct hl_device *hdev;
+	struct hl_fpriv *hpriv;
+	int rc;
+
+	mutex_lock(&hl_devs_idr_lock);
+	hdev = idr_find(&hl_devs_idr, iminor(inode));
+	mutex_unlock(&hl_devs_idr_lock);
+
+	if (!hdev) {
+		pr_err("Couldn't find device %d:%d\n",
+			imajor(inode), iminor(inode));
+		return -ENXIO;
+	}
+
+	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
+	if (!hpriv)
+		return -ENOMEM;
+
+	mutex_lock(&hdev->fpriv_list_lock);
+
+	if (hl_device_disabled_or_in_reset(hdev)) {
+		dev_err_ratelimited(hdev->dev_ctrl,
+			"Can't open %s because it is disabled or in reset\n",
+			dev_name(hdev->dev_ctrl));
+		rc = -EPERM;
+		goto out_err;
+	}
 
-close_device:
-	atomic_dec(&hdev->fd_open_cnt);
+	list_add(&hpriv->dev_node, &hdev->fpriv_list);
+	mutex_unlock(&hdev->fpriv_list_lock);
+
+	hpriv->hdev = hdev;
+	filp->private_data = hpriv;
+	hpriv->filp = filp;
+	hpriv->is_control = true;
+	nonseekable_open(inode, filp);
+
+	hpriv->taskpid = find_get_pid(current->pid);
+
+	return 0;
+
+out_err:
+	mutex_unlock(&hdev->fpriv_list_lock);
+	kfree(hpriv);
 	return rc;
 }
 
@@ -199,7 +246,7 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
 		enum hl_asic_type asic_type, int minor)
 {
 	struct hl_device *hdev;
-	int rc;
+	int rc, main_id, ctrl_id = 0;
 
 	*dev = NULL;
 
@@ -240,33 +287,34 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
 
 	mutex_lock(&hl_devs_idr_lock);
 
-	if (minor == -1) {
-		rc = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS,
+	/* Always save 2 numbers, 1 for main device and 1 for control.
+	 * They must be consecutive
+	 */
+	main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS,
 				GFP_KERNEL);
-	} else {
-		void *old_idr = idr_replace(&hl_devs_idr, hdev, minor);
 
-		if (IS_ERR_VALUE(old_idr)) {
-			rc = PTR_ERR(old_idr);
-			pr_err("Error %d when trying to replace minor %d\n",
-				rc, minor);
-			mutex_unlock(&hl_devs_idr_lock);
-			goto free_hdev;
-		}
-		rc = minor;
-	}
+	if (main_id >= 0)
+		ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
+					main_id + 2, GFP_KERNEL);
 
 	mutex_unlock(&hl_devs_idr_lock);
 
-	if (rc < 0) {
-		if (rc == -ENOSPC) {
+	if ((main_id < 0) || (ctrl_id < 0)) {
+		if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
 			pr_err("too many devices in the system\n");
-			rc = -EBUSY;
+
+		if (main_id >= 0) {
+			mutex_lock(&hl_devs_idr_lock);
+			idr_remove(&hl_devs_idr, main_id);
+			mutex_unlock(&hl_devs_idr_lock);
 		}
+
+		rc = -EBUSY;
 		goto free_hdev;
 	}
 
-	hdev->id = rc;
+	hdev->id = main_id;
+	hdev->id_control = ctrl_id;
 
 	*dev = hdev;
 
@@ -288,6 +336,7 @@ void destroy_hdev(struct hl_device *hdev)
 	/* Remove device from the device list */
 	mutex_lock(&hl_devs_idr_lock);
 	idr_remove(&hl_devs_idr, hdev->id);
+	idr_remove(&hl_devs_idr, hdev->id_control);
 	mutex_unlock(&hl_devs_idr_lock);
 
 	kfree(hdev);
@@ -295,8 +344,7 @@ void destroy_hdev(struct hl_device *hdev)
 
 static int hl_pmops_suspend(struct device *dev)
 {
-	struct pci_dev *pdev = to_pci_dev(dev);
-	struct hl_device *hdev = pci_get_drvdata(pdev);
+	struct hl_device *hdev = dev_get_drvdata(dev);
 
 	pr_debug("Going to suspend PCI device\n");
 
@@ -310,8 +358,7 @@ static int hl_pmops_suspend(struct device *dev)
 
 static int hl_pmops_resume(struct device *dev)
 {
-	struct pci_dev *pdev = to_pci_dev(dev);
-	struct hl_device *hdev = pci_get_drvdata(pdev);
+	struct hl_device *hdev = dev_get_drvdata(dev);
 
 	pr_debug("Going to resume PCI device\n");
 
diff --git a/drivers/misc/habanalabs/habanalabs_ioctl.c b/drivers/misc/habanalabs/habanalabs_ioctl.c
index 07127576b3e8..66d9c710073c 100644
--- a/drivers/misc/habanalabs/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/habanalabs_ioctl.c
@@ -65,7 +65,7 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
 	hw_ip.num_of_events = prop->num_of_events;
 	memcpy(hw_ip.armcp_version,
 		prop->armcp_info.armcp_version, VERSION_MAX_LEN);
-	hw_ip.armcp_cpld_version = __le32_to_cpu(prop->armcp_info.cpld_version);
+	hw_ip.armcp_cpld_version = le32_to_cpu(prop->armcp_info.cpld_version);
 	hw_ip.psoc_pci_pll_nr = prop->psoc_pci_pll_nr;
 	hw_ip.psoc_pci_pll_nf = prop->psoc_pci_pll_nf;
 	hw_ip.psoc_pci_pll_od = prop->psoc_pci_pll_od;
@@ -75,7 +75,8 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
 		min((size_t)size, sizeof(hw_ip))) ? -EFAULT : 0;
 }
 
-static int hw_events_info(struct hl_device *hdev, struct hl_info_args *args)
+static int hw_events_info(struct hl_device *hdev, bool aggregate,
+			struct hl_info_args *args)
 {
 	u32 size, max_size = args->return_size;
 	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
@@ -84,13 +85,14 @@ static int hw_events_info(struct hl_device *hdev, struct hl_info_args *args)
 	if ((!max_size) || (!out))
 		return -EINVAL;
 
-	arr = hdev->asic_funcs->get_events_stat(hdev, &size);
+	arr = hdev->asic_funcs->get_events_stat(hdev, aggregate, &size);
 
 	return copy_to_user(out, arr, min(max_size, size)) ? -EFAULT : 0;
 }
 
-static int dram_usage_info(struct hl_device *hdev, struct hl_info_args *args)
+static int dram_usage_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 {
+	struct hl_device *hdev = hpriv->hdev;
 	struct hl_info_dram_usage dram_usage = {0};
 	u32 max_size = args->return_size;
 	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
@@ -104,7 +106,9 @@ static int dram_usage_info(struct hl_device *hdev, struct hl_info_args *args)
 				prop->dram_base_address);
 	dram_usage.dram_free_mem = (prop->dram_size - dram_kmd_size) -
 					atomic64_read(&hdev->dram_used_mem);
-	dram_usage.ctx_dram_mem = atomic64_read(&hdev->user_ctx->dram_phys_mem);
+	if (hpriv->ctx)
+		dram_usage.ctx_dram_mem =
+			atomic64_read(&hpriv->ctx->dram_phys_mem);
 
 	return copy_to_user(out, &dram_usage,
 		min((size_t) max_size, sizeof(dram_usage))) ? -EFAULT : 0;
@@ -141,13 +145,16 @@ static int debug_coresight(struct hl_device *hdev, struct hl_debug_args *args)
 	params->op = args->op;
 
 	if (args->input_ptr && args->input_size) {
-		input = memdup_user(u64_to_user_ptr(args->input_ptr),
-					args->input_size);
-		if (IS_ERR(input)) {
-			rc = PTR_ERR(input);
-			input = NULL;
-			dev_err(hdev->dev,
-				"error %d when copying input debug data\n", rc);
+		input = kzalloc(hl_debug_struct_size[args->op], GFP_KERNEL);
+		if (!input) {
+			rc = -ENOMEM;
+			goto out;
+		}
+
+		if (copy_from_user(input, u64_to_user_ptr(args->input_ptr),
+					args->input_size)) {
+			rc = -EFAULT;
+			dev_err(hdev->dev, "failed to copy input debug data\n");
 			goto out;
 		}
 
@@ -191,42 +198,81 @@ out:
 	return rc;
 }
 
-static int hl_info_ioctl(struct hl_fpriv *hpriv, void *data)
+static int device_utilization(struct hl_device *hdev, struct hl_info_args *args)
+{
+	struct hl_info_device_utilization device_util = {0};
+	u32 max_size = args->return_size;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	if ((args->period_ms < 100) || (args->period_ms > 1000) ||
+		(args->period_ms % 100)) {
+		dev_err(hdev->dev,
+			"period %u must be between 100 - 1000 and must be divisible by 100\n",
+			args->period_ms);
+		return -EINVAL;
+	}
+
+	device_util.utilization = hl_device_utilization(hdev, args->period_ms);
+
+	return copy_to_user(out, &device_util,
+		min((size_t) max_size, sizeof(device_util))) ? -EFAULT : 0;
+}
+
+static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
+				struct device *dev)
 {
 	struct hl_info_args *args = data;
 	struct hl_device *hdev = hpriv->hdev;
 	int rc;
 
-	/* We want to return device status even if it disabled or in reset */
-	if (args->op == HL_INFO_DEVICE_STATUS)
+	/*
+	 * Information is returned for the following opcodes even if the device
+	 * is disabled or in reset.
+	 */
+	switch (args->op) {
+	case HL_INFO_HW_IP_INFO:
+		return hw_ip_info(hdev, args);
+
+	case HL_INFO_DEVICE_STATUS:
 		return device_status_info(hdev, args);
 
+	default:
+		break;
+	}
+
 	if (hl_device_disabled_or_in_reset(hdev)) {
-		dev_warn_ratelimited(hdev->dev,
+		dev_warn_ratelimited(dev,
 			"Device is %s. Can't execute INFO IOCTL\n",
 			atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
 		return -EBUSY;
 	}
 
 	switch (args->op) {
-	case HL_INFO_HW_IP_INFO:
-		rc = hw_ip_info(hdev, args);
-		break;
-
 	case HL_INFO_HW_EVENTS:
-		rc = hw_events_info(hdev, args);
+		rc = hw_events_info(hdev, false, args);
 		break;
 
 	case HL_INFO_DRAM_USAGE:
-		rc = dram_usage_info(hdev, args);
+		rc = dram_usage_info(hpriv, args);
 		break;
 
 	case HL_INFO_HW_IDLE:
 		rc = hw_idle(hdev, args);
 		break;
 
+	case HL_INFO_DEVICE_UTILIZATION:
+		rc = device_utilization(hdev, args);
+		break;
+
+	case HL_INFO_HW_EVENTS_AGGREGATE:
+		rc = hw_events_info(hdev, true, args);
+		break;
+
 	default:
-		dev_err(hdev->dev, "Invalid request %d\n", args->op);
+		dev_err(dev, "Invalid request %d\n", args->op);
 		rc = -ENOTTY;
 		break;
 	}
@@ -234,6 +280,16 @@ static int hl_info_ioctl(struct hl_fpriv *hpriv, void *data)
 	return rc;
 }
 
+static int hl_info_ioctl(struct hl_fpriv *hpriv, void *data)
+{
+	return _hl_info_ioctl(hpriv, data, hpriv->hdev->dev);
+}
+
+static int hl_info_ioctl_control(struct hl_fpriv *hpriv, void *data)
+{
+	return _hl_info_ioctl(hpriv, data, hpriv->hdev->dev_ctrl);
+}
+
 static int hl_debug_ioctl(struct hl_fpriv *hpriv, void *data)
 {
 	struct hl_debug_args *args = data;
@@ -288,52 +344,45 @@ static const struct hl_ioctl_desc hl_ioctls[] = {
 	HL_IOCTL_DEF(HL_IOCTL_DEBUG, hl_debug_ioctl)
 };
 
-#define HL_CORE_IOCTL_COUNT	ARRAY_SIZE(hl_ioctls)
+static const struct hl_ioctl_desc hl_ioctls_control[] = {
+	HL_IOCTL_DEF(HL_IOCTL_INFO, hl_info_ioctl_control)
+};
 
-long hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+static long _hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg,
+		const struct hl_ioctl_desc *ioctl, struct device *dev)
 {
 	struct hl_fpriv *hpriv = filep->private_data;
 	struct hl_device *hdev = hpriv->hdev;
-	hl_ioctl_t *func;
-	const struct hl_ioctl_desc *ioctl = NULL;
 	unsigned int nr = _IOC_NR(cmd);
 	char stack_kdata[128] = {0};
 	char *kdata = NULL;
 	unsigned int usize, asize;
+	hl_ioctl_t *func;
+	u32 hl_size;
 	int retcode;
 
 	if (hdev->hard_reset_pending) {
-		dev_crit_ratelimited(hdev->dev,
+		dev_crit_ratelimited(hdev->dev_ctrl,
 			"Device HARD reset pending! Please close FD\n");
 		return -ENODEV;
 	}
 
-	if ((nr >= HL_COMMAND_START) && (nr < HL_COMMAND_END)) {
-		u32 hl_size;
-
-		ioctl = &hl_ioctls[nr];
-
-		hl_size = _IOC_SIZE(ioctl->cmd);
-		usize = asize = _IOC_SIZE(cmd);
-		if (hl_size > asize)
-			asize = hl_size;
-
-		cmd = ioctl->cmd;
-	} else {
-		dev_err(hdev->dev, "invalid ioctl: pid=%d, nr=0x%02x\n",
-			  task_pid_nr(current), nr);
-		return -ENOTTY;
-	}
-
 	/* Do not trust userspace, use our own definition */
 	func = ioctl->func;
 
 	if (unlikely(!func)) {
-		dev_dbg(hdev->dev, "no function\n");
+		dev_dbg(dev, "no function\n");
 		retcode = -ENOTTY;
 		goto out_err;
 	}
 
+	hl_size = _IOC_SIZE(ioctl->cmd);
+	usize = asize = _IOC_SIZE(cmd);
+	if (hl_size > asize)
+		asize = hl_size;
+
+	cmd = ioctl->cmd;
+
 	if (cmd & (IOC_IN | IOC_OUT)) {
 		if (asize <= sizeof(stack_kdata)) {
 			kdata = stack_kdata;
@@ -363,8 +412,7 @@ long hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 
 out_err:
 	if (retcode)
-		dev_dbg(hdev->dev,
-			"error in ioctl: pid=%d, cmd=0x%02x, nr=0x%02x\n",
+		dev_dbg(dev, "error in ioctl: pid=%d, cmd=0x%02x, nr=0x%02x\n",
 			  task_pid_nr(current), cmd, nr);
 
 	if (kdata != stack_kdata)
@@ -372,3 +420,39 @@ out_err:
 
 	return retcode;
 }
+
+long hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+	struct hl_fpriv *hpriv = filep->private_data;
+	struct hl_device *hdev = hpriv->hdev;
+	const struct hl_ioctl_desc *ioctl = NULL;
+	unsigned int nr = _IOC_NR(cmd);
+
+	if ((nr >= HL_COMMAND_START) && (nr < HL_COMMAND_END)) {
+		ioctl = &hl_ioctls[nr];
+	} else {
+		dev_err(hdev->dev, "invalid ioctl: pid=%d, nr=0x%02x\n",
+			task_pid_nr(current), nr);
+		return -ENOTTY;
+	}
+
+	return _hl_ioctl(filep, cmd, arg, ioctl, hdev->dev);
+}
+
+long hl_ioctl_control(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+	struct hl_fpriv *hpriv = filep->private_data;
+	struct hl_device *hdev = hpriv->hdev;
+	const struct hl_ioctl_desc *ioctl = NULL;
+	unsigned int nr = _IOC_NR(cmd);
+
+	if (nr == _IOC_NR(HL_IOCTL_INFO)) {
+		ioctl = &hl_ioctls_control[nr];
+	} else {
+		dev_err(hdev->dev_ctrl, "invalid ioctl: pid=%d, nr=0x%02x\n",
+			task_pid_nr(current), nr);
+		return -ENOTTY;
+	}
+
+	return _hl_ioctl(filep, cmd, arg, ioctl, hdev->dev_ctrl);
+}
diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c
index 5f5673b74985..55b383b2a116 100644
--- a/drivers/misc/habanalabs/hw_queue.c
+++ b/drivers/misc/habanalabs/hw_queue.c
@@ -80,9 +80,9 @@ static void ext_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
 
 	bd = (struct hl_bd *) (uintptr_t) q->kernel_address;
 	bd += hl_pi_2_offset(q->pi);
-	bd->ctl = __cpu_to_le32(ctl);
-	bd->len = __cpu_to_le32(len);
-	bd->ptr = __cpu_to_le64(ptr);
+	bd->ctl = cpu_to_le32(ctl);
+	bd->len = cpu_to_le32(len);
+	bd->ptr = cpu_to_le64(ptr);
 
 	q->pi = hl_queue_inc_ptr(q->pi);
 	hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
@@ -249,7 +249,7 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
 	len = job->job_cb_size;
 	ptr = cb->bus_address;
 
-	cq_pkt.data = __cpu_to_le32(
+	cq_pkt.data = cpu_to_le32(
 				((q->pi << CQ_ENTRY_SHADOW_INDEX_SHIFT)
 					& CQ_ENTRY_SHADOW_INDEX_MASK) |
 				(1 << CQ_ENTRY_SHADOW_INDEX_VALID_SHIFT) |
@@ -267,7 +267,7 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
 
 	hdev->asic_funcs->add_end_of_cb_packets(hdev, cb->kernel_address, len,
 						cq_addr,
-						__le32_to_cpu(cq_pkt.data),
+						le32_to_cpu(cq_pkt.data),
 						q->hw_queue_id);
 
 	q->shadow_queue[hl_pi_2_offset(q->pi)] = job;
@@ -364,7 +364,13 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 		spin_unlock(&hdev->hw_queues_mirror_lock);
 	}
 
-	atomic_inc(&hdev->cs_active_cnt);
+	if (!hdev->cs_active_cnt++) {
+		struct hl_device_idle_busy_ts *ts;
+
+		ts = &hdev->idle_busy_ts_arr[hdev->idle_busy_ts_idx];
+		ts->busy_to_idle_ts = ktime_set(0, 0);
+		ts->idle_to_busy_ts = ktime_get();
+	}
 
 	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
 		if (job->ext_queue)
diff --git a/drivers/misc/habanalabs/hwmon.c b/drivers/misc/habanalabs/hwmon.c
index 77facd25c4a2..7be4bace9b4f 100644
--- a/drivers/misc/habanalabs/hwmon.c
+++ b/drivers/misc/habanalabs/hwmon.c
@@ -26,7 +26,7 @@ int hl_build_hwmon_channel_info(struct hl_device *hdev,
 	int rc, i, j;
 
 	for (i = 0 ; i < ARMCP_MAX_SENSORS ; i++) {
-		type = __le32_to_cpu(sensors_arr[i].type);
+		type = le32_to_cpu(sensors_arr[i].type);
 
 		if ((type == 0) && (sensors_arr[i].flags == 0))
 			break;
@@ -58,10 +58,10 @@ int hl_build_hwmon_channel_info(struct hl_device *hdev,
 	}
 
 	for (i = 0 ; i < arr_size ; i++) {
-		type = __le32_to_cpu(sensors_arr[i].type);
+		type = le32_to_cpu(sensors_arr[i].type);
 		curr_arr = sensors_by_type[type];
 		curr_arr[sensors_by_type_next_index[type]++] =
-				__le32_to_cpu(sensors_arr[i].flags);
+				le32_to_cpu(sensors_arr[i].flags);
 	}
 
 	channels_info = kcalloc(num_active_sensor_types + 1,
@@ -273,7 +273,7 @@ long hl_get_temperature(struct hl_device *hdev, int sensor_index, u32 attr)
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = __cpu_to_le32(ARMCP_PACKET_TEMPERATURE_GET <<
+	pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEMPERATURE_GET <<
 				ARMCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.sensor_index = __cpu_to_le16(sensor_index);
 	pkt.type = __cpu_to_le16(attr);
@@ -299,7 +299,7 @@ long hl_get_voltage(struct hl_device *hdev, int sensor_index, u32 attr)
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = __cpu_to_le32(ARMCP_PACKET_VOLTAGE_GET <<
+	pkt.ctl = cpu_to_le32(ARMCP_PACKET_VOLTAGE_GET <<
 				ARMCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.sensor_index = __cpu_to_le16(sensor_index);
 	pkt.type = __cpu_to_le16(attr);
@@ -325,7 +325,7 @@ long hl_get_current(struct hl_device *hdev, int sensor_index, u32 attr)
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = __cpu_to_le32(ARMCP_PACKET_CURRENT_GET <<
+	pkt.ctl = cpu_to_le32(ARMCP_PACKET_CURRENT_GET <<
 				ARMCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.sensor_index = __cpu_to_le16(sensor_index);
 	pkt.type = __cpu_to_le16(attr);
@@ -351,7 +351,7 @@ long hl_get_fan_speed(struct hl_device *hdev, int sensor_index, u32 attr)
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = __cpu_to_le32(ARMCP_PACKET_FAN_SPEED_GET <<
+	pkt.ctl = cpu_to_le32(ARMCP_PACKET_FAN_SPEED_GET <<
 				ARMCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.sensor_index = __cpu_to_le16(sensor_index);
 	pkt.type = __cpu_to_le16(attr);
@@ -377,7 +377,7 @@ long hl_get_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr)
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = __cpu_to_le32(ARMCP_PACKET_PWM_GET <<
+	pkt.ctl = cpu_to_le32(ARMCP_PACKET_PWM_GET <<
 				ARMCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.sensor_index = __cpu_to_le16(sensor_index);
 	pkt.type = __cpu_to_le16(attr);
@@ -403,11 +403,11 @@ void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = __cpu_to_le32(ARMCP_PACKET_PWM_SET <<
+	pkt.ctl = cpu_to_le32(ARMCP_PACKET_PWM_SET <<
 				ARMCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.sensor_index = __cpu_to_le16(sensor_index);
 	pkt.type = __cpu_to_le16(attr);
-	pkt.value = __cpu_to_le64(value);
+	pkt.value = cpu_to_le64(value);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
 					SENSORS_PKT_TIMEOUT, NULL);
@@ -421,6 +421,7 @@ void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
 int hl_hwmon_init(struct hl_device *hdev)
 {
 	struct device *dev = hdev->pdev ? &hdev->pdev->dev : hdev->dev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	int rc;
 
 	if ((hdev->hwmon_initialized) || !(hdev->fw_loading))
@@ -430,7 +431,8 @@ int hl_hwmon_init(struct hl_device *hdev)
 		hdev->hl_chip_info->ops = &hl_hwmon_ops;
 
 		hdev->hwmon_dev = hwmon_device_register_with_info(dev,
-				"habanalabs", hdev, hdev->hl_chip_info, NULL);
+					prop->armcp_info.card_name, hdev,
+					hdev->hl_chip_info, NULL);
 		if (IS_ERR(hdev->hwmon_dev)) {
 			rc = PTR_ERR(hdev->hwmon_dev);
 			dev_err(hdev->dev,
diff --git a/drivers/misc/habanalabs/include/armcp_if.h b/drivers/misc/habanalabs/include/armcp_if.h
index 1f1e35e86d84..e4c6699a1868 100644
--- a/drivers/misc/habanalabs/include/armcp_if.h
+++ b/drivers/misc/habanalabs/include/armcp_if.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0
  *
- * Copyright 2016-2018 HabanaLabs, Ltd.
+ * Copyright 2016-2019 HabanaLabs, Ltd.
  * All Rights Reserved.
  *
  */
@@ -41,33 +41,34 @@ enum pq_init_status {
 /*
  * ArmCP Primary Queue Packets
  *
- * During normal operation, KMD needs to send various messages to ArmCP,
- * usually either to SET some value into a H/W periphery or to GET the current
- * value of some H/W periphery. For example, SET the frequency of MME/TPC and
- * GET the value of the thermal sensor.
- *
- * These messages can be initiated either by the User application or by KMD
- * itself, e.g. power management code. In either case, the communication from
- * KMD to ArmCP will *always* be in synchronous mode, meaning that KMD will
- * send a single message and poll until the message was acknowledged and the
- * results are ready (if results are needed).
- *
- * This means that only a single message can be sent at a time and KMD must
- * wait for its result before sending the next message. Having said that,
- * because these are control messages which are sent in a relatively low
+ * During normal operation, the host's kernel driver needs to send various
+ * messages to ArmCP, usually either to SET some value into a H/W peripheral or
+ * to GET the current value of some H/W peripheral. For example, SET the
+ * frequency of MME/TPC and GET the value of the thermal sensor.
+ *
+ * These messages can be initiated either by the User application or by the
+ * host's driver itself, e.g. power management code. In either case, the
+ * communication from the host's driver to ArmCP will *always* be in
+ * synchronous mode, meaning that the host will send a single message and poll
+ * until the message is acknowledged and the results are ready (if results are
+ * needed).
+ *
+ * This means that only a single message can be sent at a time and the host's
+ * driver must wait for its result before sending the next message. Having said
+ * that, because these are control messages which are sent in a relatively low
  * frequency, this limitation seems acceptable. It's important to note that
  * in case of multiple devices, messages to different devices *can* be sent
  * at the same time.
  *
  * The message, inputs/outputs (if relevant) and fence object will be located
- * on the device DDR at an address that will be determined by KMD. During
- * device initialization phase, KMD will pass to ArmCP that address.  Most of
- * the message types will contain inputs/outputs inside the message itself.
- * The common part of each message will contain the opcode of the message (its
- * type) and a field representing a fence object.
- *
- * When KMD wishes to send a message to ArmCP, it will write the message
- * contents to the device DDR, clear the fence object and then write the
+ * on the device DDR at an address that will be determined by the host's driver.
+ * During device initialization phase, the host will pass that address to ArmCP.
+ * Most of the message types will contain inputs/outputs inside the message
+ * itself. The common part of each message will contain the opcode of the
+ * message (its type) and a field representing a fence object.
+ *
+ * When the host's driver wishes to send a message to ArmCP, it will write the
+ * message contents to the device DDR, clear the fence object and then write the
  * value 484 to the mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR register to issue
  * the 484 interrupt-id to the ARM core.
  *
@@ -78,12 +79,13 @@ enum pq_init_status {
  * device DDR and then write to the fence object. If an error occurred, ArmCP
  * will fill the rc field with the right error code.
  *
- * In the meantime, KMD will poll on the fence object. Once KMD sees that the
- * fence object is signaled, it will read the results from the device DDR
- * (if relevant) and resume the code execution in KMD.
+ * In the meantime, the host's driver will poll on the fence object. Once the
+ * host sees that the fence object is signaled, it will read the results from
+ * the device DDR (if relevant) and resume the code execution in the host's
+ * driver.
  *
  * To use QMAN packets, the opcode must be the QMAN opcode, shifted by 8
- * so the value being put by the KMD matches the value read by ArmCP
+ * so the value being put by the host's driver matches the value read by ArmCP
  *
  * Non-QMAN packets should be limited to values 1 through (2^8 - 1)
  *
@@ -148,9 +150,9 @@ enum pq_init_status {
  *
  * ARMCP_PACKET_INFO_GET -
  *       Fetch information from the device as specified in the packet's
- *       structure. KMD passes the max size it allows the ArmCP to write to
- *       the structure, to prevent data corruption in case of mismatched
- *       KMD/FW versions.
+ *       structure. The host's driver passes the max size it allows the ArmCP to
+ *       write to the structure, to prevent data corruption in case of
+ *       mismatched driver/FW versions.
  *
  * ARMCP_PACKET_FLASH_PROGRAM_REMOVED - this packet was removed
  *
@@ -183,9 +185,9 @@ enum pq_init_status {
  * ARMCP_PACKET_EEPROM_DATA_GET -
  *       Get EEPROM data from the ArmCP kernel. The buffer is specified in the
  *       addr field. The CPU will put the returned data size in the result
- *       field. In addition, KMD passes the max size it allows the ArmCP to
- *       write to the structure, to prevent data corruption in case of
- *       mismatched KMD/FW versions.
+ *       field. In addition, the host's driver passes the max size it allows the
+ *       ArmCP to write to the structure, to prevent data corruption in case of
+ *       mismatched driver/FW versions.
  *
  */
 
@@ -231,7 +233,7 @@ struct armcp_packet {
 
 	__le32 ctl;
 
-	__le32 fence;		/* Signal to KMD that message is completed */
+	__le32 fence;		/* Signal to host that message is completed */
 
 	union {
 		struct {/* For temperature/current/voltage/fan/pwm get/set */
@@ -310,6 +312,7 @@ struct eq_generic_event {
  * ArmCP info
  */
 
+#define CARD_NAME_MAX_LEN		16
 #define VERSION_MAX_LEN			128
 #define ARMCP_MAX_SENSORS		128
 
@@ -318,6 +321,19 @@ struct armcp_sensor {
 	__le32 flags;
 };
 
+/**
+ * struct armcp_info - Info from ArmCP that is necessary to the host's driver
+ * @sensors: available sensors description.
+ * @kernel_version: ArmCP linux kernel version.
+ * @reserved: reserved field.
+ * @cpld_version: CPLD programmed F/W version.
+ * @infineon_version: Infineon main DC-DC version.
+ * @fuse_version: silicon production FUSE information.
+ * @thermal_version: thermald S/W version.
+ * @armcp_version: ArmCP S/W version.
+ * @dram_size: available DRAM size.
+ * @card_name: card name displayed in the HWMON subsystem on the host.
+ */
 struct armcp_info {
 	struct armcp_sensor sensors[ARMCP_MAX_SENSORS];
 	__u8 kernel_version[VERSION_MAX_LEN];
@@ -328,6 +344,7 @@ struct armcp_info {
 	__u8 thermal_version[VERSION_MAX_LEN];
 	__u8 armcp_version[VERSION_MAX_LEN];
 	__le64 dram_size;
+	char card_name[CARD_NAME_MAX_LEN];
 };
 
 #endif /* ARMCP_IF_H */
diff --git a/drivers/misc/habanalabs/include/goya/goya.h b/drivers/misc/habanalabs/include/goya/goya.h
index 3f02a52ba4ce..43d241891e45 100644
--- a/drivers/misc/habanalabs/include/goya/goya.h
+++ b/drivers/misc/habanalabs/include/goya/goya.h
@@ -38,4 +38,6 @@
 
 #define TPC_MAX_NUM		8
 
+#define MME_MAX_NUM		1
+
 #endif /* GOYA_H */
diff --git a/drivers/misc/habanalabs/include/goya/goya_reg_map.h b/drivers/misc/habanalabs/include/goya/goya_reg_map.h
new file mode 100644
index 000000000000..cd89723c7f61
--- /dev/null
+++ b/drivers/misc/habanalabs/include/goya/goya_reg_map.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright 2019 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ *
+ */
+
+#ifndef GOYA_REG_MAP_H_
+#define GOYA_REG_MAP_H_
+
+/*
+ * PSOC scratch-pad registers
+ */
+#define mmCPU_PQ_BASE_ADDR_LOW	mmPSOC_GLOBAL_CONF_SCRATCHPAD_0
+#define mmCPU_PQ_BASE_ADDR_HIGH	mmPSOC_GLOBAL_CONF_SCRATCHPAD_1
+#define mmCPU_EQ_BASE_ADDR_LOW	mmPSOC_GLOBAL_CONF_SCRATCHPAD_2
+#define mmCPU_EQ_BASE_ADDR_HIGH	mmPSOC_GLOBAL_CONF_SCRATCHPAD_3
+#define mmCPU_EQ_LENGTH		mmPSOC_GLOBAL_CONF_SCRATCHPAD_4
+#define mmCPU_PQ_LENGTH		mmPSOC_GLOBAL_CONF_SCRATCHPAD_5
+#define mmCPU_EQ_CI		mmPSOC_GLOBAL_CONF_SCRATCHPAD_6
+#define mmCPU_PQ_INIT_STATUS	mmPSOC_GLOBAL_CONF_SCRATCHPAD_7
+#define mmCPU_CQ_BASE_ADDR_LOW	mmPSOC_GLOBAL_CONF_SCRATCHPAD_8
+#define mmCPU_CQ_BASE_ADDR_HIGH	mmPSOC_GLOBAL_CONF_SCRATCHPAD_9
+#define mmCPU_CQ_LENGTH		mmPSOC_GLOBAL_CONF_SCRATCHPAD_10
+#define mmUPD_STS		mmPSOC_GLOBAL_CONF_SCRATCHPAD_26
+#define mmUPD_CMD		mmPSOC_GLOBAL_CONF_SCRATCHPAD_27
+#define mmPREBOOT_VER_OFFSET	mmPSOC_GLOBAL_CONF_SCRATCHPAD_28
+#define mmUBOOT_VER_OFFSET	mmPSOC_GLOBAL_CONF_SCRATCHPAD_29
+#define mmUBOOT_OFFSET		mmPSOC_GLOBAL_CONF_SCRATCHPAD_30
+#define mmBTL_ID		mmPSOC_GLOBAL_CONF_SCRATCHPAD_31
+
+#define mmHW_STATE		mmPSOC_GLOBAL_CONF_APP_STATUS
+
+#endif /* GOYA_REG_MAP_H_ */
diff --git a/drivers/misc/habanalabs/irq.c b/drivers/misc/habanalabs/irq.c
index 199791b57caf..fac65fbd70e8 100644
--- a/drivers/misc/habanalabs/irq.c
+++ b/drivers/misc/habanalabs/irq.c
@@ -160,7 +160,7 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg)
 
 	while (1) {
 		bool entry_ready =
-			((__le32_to_cpu(eq_base[eq->ci].hdr.ctl) &
+			((le32_to_cpu(eq_base[eq->ci].hdr.ctl) &
 				EQ_CTL_READY_MASK) >> EQ_CTL_READY_SHIFT);
 
 		if (!entry_ready)
@@ -194,7 +194,7 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg)
 skip_irq:
 		/* Clear EQ entry ready bit */
 		eq_entry->hdr.ctl =
-			__cpu_to_le32(__le32_to_cpu(eq_entry->hdr.ctl) &
+			cpu_to_le32(le32_to_cpu(eq_entry->hdr.ctl) &
 							~EQ_CTL_READY_MASK);
 
 		eq->ci = hl_eq_inc_ptr(eq->ci);
diff --git a/drivers/misc/habanalabs/sysfs.c b/drivers/misc/habanalabs/sysfs.c
index 25eb46d29d88..4cd622b017b9 100644
--- a/drivers/misc/habanalabs/sysfs.c
+++ b/drivers/misc/habanalabs/sysfs.c
@@ -21,12 +21,12 @@ long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
 	memset(&pkt, 0, sizeof(pkt));
 
 	if (curr)
-		pkt.ctl = __cpu_to_le32(ARMCP_PACKET_FREQUENCY_CURR_GET <<
+		pkt.ctl = cpu_to_le32(ARMCP_PACKET_FREQUENCY_CURR_GET <<
 						ARMCP_PKT_CTL_OPCODE_SHIFT);
 	else
-		pkt.ctl = __cpu_to_le32(ARMCP_PACKET_FREQUENCY_GET <<
+		pkt.ctl = cpu_to_le32(ARMCP_PACKET_FREQUENCY_GET <<
 						ARMCP_PKT_CTL_OPCODE_SHIFT);
-	pkt.pll_index = __cpu_to_le32(pll_index);
+	pkt.pll_index = cpu_to_le32(pll_index);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
 						SET_CLK_PKT_TIMEOUT, &result);
@@ -48,10 +48,10 @@ void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = __cpu_to_le32(ARMCP_PACKET_FREQUENCY_SET <<
+	pkt.ctl = cpu_to_le32(ARMCP_PACKET_FREQUENCY_SET <<
 					ARMCP_PKT_CTL_OPCODE_SHIFT);
-	pkt.pll_index = __cpu_to_le32(pll_index);
-	pkt.value = __cpu_to_le64(freq);
+	pkt.pll_index = cpu_to_le32(pll_index);
+	pkt.value = cpu_to_le64(freq);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
 					SET_CLK_PKT_TIMEOUT, NULL);
@@ -70,7 +70,7 @@ u64 hl_get_max_power(struct hl_device *hdev)
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = __cpu_to_le32(ARMCP_PACKET_MAX_POWER_GET <<
+	pkt.ctl = cpu_to_le32(ARMCP_PACKET_MAX_POWER_GET <<
 				ARMCP_PKT_CTL_OPCODE_SHIFT);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
@@ -91,9 +91,9 @@ void hl_set_max_power(struct hl_device *hdev, u64 value)
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = __cpu_to_le32(ARMCP_PACKET_MAX_POWER_SET <<
+	pkt.ctl = cpu_to_le32(ARMCP_PACKET_MAX_POWER_SET <<
 				ARMCP_PKT_CTL_OPCODE_SHIFT);
-	pkt.value = __cpu_to_le64(value);
+	pkt.value = cpu_to_le64(value);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
 					SET_PWR_PKT_TIMEOUT, NULL);
@@ -102,100 +102,6 @@ void hl_set_max_power(struct hl_device *hdev, u64 value)
 		dev_err(hdev->dev, "Failed to set max power, error %d\n", rc);
 }
 
-static ssize_t pm_mng_profile_show(struct device *dev,
-				struct device_attribute *attr, char *buf)
-{
-	struct hl_device *hdev = dev_get_drvdata(dev);
-
-	if (hl_device_disabled_or_in_reset(hdev))
-		return -ENODEV;
-
-	return sprintf(buf, "%s\n",
-			(hdev->pm_mng_profile == PM_AUTO) ? "auto" :
-			(hdev->pm_mng_profile == PM_MANUAL) ? "manual" :
-			"unknown");
-}
-
-static ssize_t pm_mng_profile_store(struct device *dev,
-		struct device_attribute *attr, const char *buf, size_t count)
-{
-	struct hl_device *hdev = dev_get_drvdata(dev);
-
-	if (hl_device_disabled_or_in_reset(hdev)) {
-		count = -ENODEV;
-		goto out;
-	}
-
-	mutex_lock(&hdev->fd_open_cnt_lock);
-
-	if (atomic_read(&hdev->fd_open_cnt) > 0) {
-		dev_err(hdev->dev,
-			"Can't change PM profile while user process is opened on the device\n");
-		count = -EPERM;
-		goto unlock_mutex;
-	}
-
-	if (strncmp("auto", buf, strlen("auto")) == 0) {
-		/* Make sure we are in LOW PLL when changing modes */
-		if (hdev->pm_mng_profile == PM_MANUAL) {
-			atomic_set(&hdev->curr_pll_profile, PLL_HIGH);
-			hl_device_set_frequency(hdev, PLL_LOW);
-			hdev->pm_mng_profile = PM_AUTO;
-		}
-	} else if (strncmp("manual", buf, strlen("manual")) == 0) {
-		/* Make sure we are in LOW PLL when changing modes */
-		if (hdev->pm_mng_profile == PM_AUTO) {
-			flush_delayed_work(&hdev->work_freq);
-			hdev->pm_mng_profile = PM_MANUAL;
-		}
-	} else {
-		dev_err(hdev->dev, "value should be auto or manual\n");
-		count = -EINVAL;
-		goto unlock_mutex;
-	}
-
-unlock_mutex:
-	mutex_unlock(&hdev->fd_open_cnt_lock);
-out:
-	return count;
-}
-
-static ssize_t high_pll_show(struct device *dev, struct device_attribute *attr,
-				char *buf)
-{
-	struct hl_device *hdev = dev_get_drvdata(dev);
-
-	if (hl_device_disabled_or_in_reset(hdev))
-		return -ENODEV;
-
-	return sprintf(buf, "%u\n", hdev->high_pll);
-}
-
-static ssize_t high_pll_store(struct device *dev, struct device_attribute *attr,
-				const char *buf, size_t count)
-{
-	struct hl_device *hdev = dev_get_drvdata(dev);
-	long value;
-	int rc;
-
-	if (hl_device_disabled_or_in_reset(hdev)) {
-		count = -ENODEV;
-		goto out;
-	}
-
-	rc = kstrtoul(buf, 0, &value);
-
-	if (rc) {
-		count = -EINVAL;
-		goto out;
-	}
-
-	hdev->high_pll = value;
-
-out:
-	return count;
-}
-
 static ssize_t uboot_ver_show(struct device *dev, struct device_attribute *attr,
 				char *buf)
 {
@@ -351,14 +257,6 @@ static ssize_t status_show(struct device *dev, struct device_attribute *attr,
 	return sprintf(buf, "%s\n", str);
 }
 
-static ssize_t write_open_cnt_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct hl_device *hdev = dev_get_drvdata(dev);
-
-	return sprintf(buf, "%d\n", hdev->user_ctx ? 1 : 0);
-}
-
 static ssize_t soft_reset_cnt_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
@@ -450,18 +348,15 @@ static DEVICE_ATTR_RO(device_type);
 static DEVICE_ATTR_RO(fuse_ver);
 static DEVICE_ATTR_WO(hard_reset);
 static DEVICE_ATTR_RO(hard_reset_cnt);
-static DEVICE_ATTR_RW(high_pll);
 static DEVICE_ATTR_RO(infineon_ver);
 static DEVICE_ATTR_RW(max_power);
 static DEVICE_ATTR_RO(pci_addr);
-static DEVICE_ATTR_RW(pm_mng_profile);
 static DEVICE_ATTR_RO(preboot_btl_ver);
 static DEVICE_ATTR_WO(soft_reset);
 static DEVICE_ATTR_RO(soft_reset_cnt);
 static DEVICE_ATTR_RO(status);
 static DEVICE_ATTR_RO(thermal_ver);
 static DEVICE_ATTR_RO(uboot_ver);
-static DEVICE_ATTR_RO(write_open_cnt);
 
 static struct bin_attribute bin_attr_eeprom = {
 	.attr = {.name = "eeprom", .mode = (0444)},
@@ -477,18 +372,15 @@ static struct attribute *hl_dev_attrs[] = {
 	&dev_attr_fuse_ver.attr,
 	&dev_attr_hard_reset.attr,
 	&dev_attr_hard_reset_cnt.attr,
-	&dev_attr_high_pll.attr,
 	&dev_attr_infineon_ver.attr,
 	&dev_attr_max_power.attr,
 	&dev_attr_pci_addr.attr,
-	&dev_attr_pm_mng_profile.attr,
 	&dev_attr_preboot_btl_ver.attr,
 	&dev_attr_soft_reset.attr,
 	&dev_attr_soft_reset_cnt.attr,
 	&dev_attr_status.attr,
 	&dev_attr_thermal_ver.attr,
 	&dev_attr_uboot_ver.attr,
-	&dev_attr_write_open_cnt.attr,
 	NULL,
 };
author	Linus Torvalds <torvalds@linux-foundation.org>	2019-09-18 21:14:31 +0300
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-09-18 21:14:31 +0300
commit	6cfae0c26b21dce323fe8799b66cf4bc996e3565 (patch)
tree	647f80442929de7ed17cc436c546c21c8c2b2aa9 /drivers/misc/habanalabs
parent	e6874fc29410fabfdbc8c12b467f41a16cbcfd2b (diff)
parent	16a0f687cac70301f49d6f99c4115824e6aad42b (diff)
download	linux-6cfae0c26b21dce323fe8799b66cf4bc996e3565.tar.xz