diff options
Diffstat (limited to 'drivers/misc/habanalabs/common')
-rw-r--r-- | drivers/misc/habanalabs/common/Makefile | 2 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/command_submission.c | 105 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/context.c | 8 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/debugfs.c | 51 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/device.c | 159 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/firmware_if.c | 28 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs.h | 64 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs_drv.c | 24 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/hwmgr.c | 117 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/hwmon.c | 194 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/irq.c | 5 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/memory.c | 515 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/mmu/mmu.c | 30 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/sysfs.c | 6 |
14 files changed, 1173 insertions, 135 deletions
diff --git a/drivers/misc/habanalabs/common/Makefile b/drivers/misc/habanalabs/common/Makefile index 6ebe3c7001ff..82c3824cad00 100644 --- a/drivers/misc/habanalabs/common/Makefile +++ b/drivers/misc/habanalabs/common/Makefile @@ -11,4 +11,4 @@ HL_COMMON_FILES := common/habanalabs_drv.o common/device.o common/context.o \ common/command_buffer.o common/hw_queue.o common/irq.o \ common/sysfs.o common/hwmon.o common/memory.o \ common/command_submission.o common/firmware_if.o \ - common/state_dump.o + common/state_dump.o common/hwmgr.o diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 6dafff375f1c..4c8000fd246c 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -143,6 +143,7 @@ static void hl_fence_init(struct hl_fence *fence, u64 sequence) fence->cs_sequence = sequence; fence->error = 0; fence->timestamp = ktime_set(0, 0); + fence->mcs_handling_done = false; init_completion(&fence->completion); } @@ -431,11 +432,10 @@ static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs) /* Don't cancel TDR in case this CS was timedout because we might be * running from the TDR context */ - if (cs && (cs->timedout || - hdev->timeout_jiffies == MAX_SCHEDULE_TIMEOUT)) + if (cs->timedout || hdev->timeout_jiffies == MAX_SCHEDULE_TIMEOUT) return; - if (cs && cs->tdr_active) + if (cs->tdr_active) cancel_delayed_work_sync(&cs->work_tdr); spin_lock(&hdev->cs_mirror_lock); @@ -536,10 +536,21 @@ static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs) mcs_compl->timestamp = ktime_to_ns(fence->timestamp); complete_all(&mcs_compl->completion); + + /* + * Setting mcs_handling_done inside the lock ensures + * at least one fence have mcs_handling_done set to + * true before wait for mcs finish. This ensures at + * least one CS will be set as completed when polling + * mcs fences. + */ + fence->mcs_handling_done = true; } spin_unlock(&mcs_compl->lock); } + /* In case CS completed without mcs completion initialized */ + fence->mcs_handling_done = true; } static inline void cs_release_sob_reset_handler(struct hl_device *hdev, @@ -2371,32 +2382,48 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data) break; } - mcs_data->stream_master_qid_map |= fence->stream_master_qid_map; - - if (status == CS_WAIT_STATUS_BUSY) - continue; - - mcs_data->completion_bitmap |= BIT(i); - - /* - * best effort to extract timestamp. few notes: - * - if even single fence is gone we cannot extract timestamp - * (as fence not exist anymore) - * - for all completed CSs we take the earliest timestamp. - * for this we have to validate that: - * 1. given timestamp was indeed set - * 2. the timestamp is earliest of all timestamps so far - */ + switch (status) { + case CS_WAIT_STATUS_BUSY: + /* CS did not finished, keep waiting on its QID*/ + mcs_data->stream_master_qid_map |= + fence->stream_master_qid_map; + break; + case CS_WAIT_STATUS_COMPLETED: + /* + * Using mcs_handling_done to avoid possibility of mcs_data + * returns to user indicating CS completed before it finished + * all of its mcs handling, to avoid race the next time the + * user waits for mcs. + */ + if (!fence->mcs_handling_done) + break; - if (status == CS_WAIT_STATUS_GONE) { + mcs_data->completion_bitmap |= BIT(i); + /* + * For all completed CSs we take the earliest timestamp. + * For this we have to validate that the timestamp is + * earliest of all timestamps so far. + */ + if (mcs_data->update_ts && + (ktime_compare(fence->timestamp, first_cs_time) < 0)) + first_cs_time = fence->timestamp; + break; + case CS_WAIT_STATUS_GONE: mcs_data->update_ts = false; mcs_data->gone_cs = true; - } else if (mcs_data->update_ts && - (ktime_compare(fence->timestamp, - ktime_set(0, 0)) > 0) && - (ktime_compare(fence->timestamp, first_cs_time) < 0)) { - first_cs_time = fence->timestamp; + /* + * It is possible to get an old sequence numbers from user + * which related to already completed CSs and their fences + * already gone. In this case, CS set as completed but + * no need to consider its QID for mcs completion. + */ + mcs_data->completion_bitmap |= BIT(i); + break; + default: + dev_err(hdev->dev, "Invalid fence status\n"); + return -EINVAL; } + } hl_fences_put(mcs_data->fence_arr, arr_len); @@ -2740,13 +2767,14 @@ static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, u32 timeout_us, u64 user_address, - u32 target_value, u16 interrupt_offset, - enum hl_cs_wait_status *status) + u64 target_value, u16 interrupt_offset, + enum hl_cs_wait_status *status, + u64 *timestamp) { struct hl_user_pending_interrupt *pend; struct hl_user_interrupt *interrupt; unsigned long timeout, flags; - u32 completion_value; + u64 completion_value; long completion_rc; int rc = 0; @@ -2780,15 +2808,17 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, /* We check for completion value as interrupt could have been received * before we added the node to the wait list */ - if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 4)) { + if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) { dev_err(hdev->dev, "Failed to copy completion value from user\n"); rc = -EFAULT; goto remove_pending_user_interrupt; } - if (completion_value >= target_value) + if (completion_value >= target_value) { *status = CS_WAIT_STATUS_COMPLETED; - else + /* There was no interrupt, we assume the completion is now. */ + pend->fence.timestamp = ktime_get(); + } else *status = CS_WAIT_STATUS_BUSY; if (!timeout_us || (*status == CS_WAIT_STATUS_COMPLETED)) @@ -2812,7 +2842,7 @@ wait_again: reinit_completion(&pend->fence.completion); spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); - if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 4)) { + if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) { dev_err(hdev->dev, "Failed to copy completion value from user\n"); rc = -EFAULT; @@ -2839,6 +2869,8 @@ remove_pending_user_interrupt: list_del(&pend->wait_list_node); spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); + *timestamp = ktime_to_ns(pend->fence.timestamp); + kfree(pend); hl_ctx_put(ctx); @@ -2852,6 +2884,7 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data) struct asic_fixed_properties *prop; union hl_wait_cs_args *args = data; enum hl_cs_wait_status status; + u64 timestamp; int rc; prop = &hdev->asic_prop; @@ -2881,7 +2914,8 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data) rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx, args->in.interrupt_timeout_us, args->in.addr, - args->in.target, interrupt_offset, &status); + args->in.target, interrupt_offset, &status, + ×tamp); if (rc) { if (rc != -EINTR) @@ -2893,6 +2927,11 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data) memset(args, 0, sizeof(*args)); + if (timestamp) { + args->out.timestamp_nsec = timestamp; + args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD; + } + switch (status) { case CS_WAIT_STATUS_COMPLETED: args->out.status = HL_WAIT_CS_STATUS_COMPLETED; diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c index 22978303ad63..d0aaccd4df2c 100644 --- a/drivers/misc/habanalabs/common/context.c +++ b/drivers/misc/habanalabs/common/context.c @@ -181,12 +181,6 @@ out_err: return rc; } -void hl_ctx_free(struct hl_device *hdev, struct hl_ctx *ctx) -{ - if (kref_put(&ctx->refcount, hl_ctx_do_release) == 1) - return; -} - int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) { int rc = 0; @@ -392,7 +386,7 @@ void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr) idp = &mgr->ctx_handles; idr_for_each_entry(idp, ctx, id) - hl_ctx_free(hdev, ctx); + kref_put(&ctx->refcount, hl_ctx_do_release); idr_destroy(&mgr->ctx_handles); mutex_destroy(&mgr->ctx_lock); diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c index 985f1f3dbd20..1f2a3dc6c4e2 100644 --- a/drivers/misc/habanalabs/common/debugfs.c +++ b/drivers/misc/habanalabs/common/debugfs.c @@ -1167,6 +1167,45 @@ static ssize_t hl_state_dump_write(struct file *f, const char __user *buf, return count; } +static ssize_t hl_timeout_locked_read(struct file *f, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + char tmp_buf[200]; + ssize_t rc; + + if (*ppos) + return 0; + + sprintf(tmp_buf, "%d\n", + jiffies_to_msecs(hdev->timeout_jiffies) / 1000); + rc = simple_read_from_buffer(buf, strlen(tmp_buf) + 1, ppos, tmp_buf, + strlen(tmp_buf) + 1); + + return rc; +} + +static ssize_t hl_timeout_locked_write(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hl_dbg_device_entry *entry = file_inode(f)->i_private; + struct hl_device *hdev = entry->hdev; + u32 value; + ssize_t rc; + + rc = kstrtouint_from_user(buf, count, 10, &value); + if (rc) + return rc; + + if (value) + hdev->timeout_jiffies = msecs_to_jiffies(value * 1000); + else + hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT; + + return count; +} + static const struct file_operations hl_data32b_fops = { .owner = THIS_MODULE, .read = hl_data_read32, @@ -1240,6 +1279,12 @@ static const struct file_operations hl_state_dump_fops = { .write = hl_state_dump_write }; +static const struct file_operations hl_timeout_locked_fops = { + .owner = THIS_MODULE, + .read = hl_timeout_locked_read, + .write = hl_timeout_locked_write +}; + static const struct hl_info_list hl_debugfs_list[] = { {"command_buffers", command_buffers_show, NULL}, {"command_submission", command_submission_show, NULL}, @@ -1421,6 +1466,12 @@ void hl_debugfs_add_device(struct hl_device *hdev) dev_entry, &hl_state_dump_fops); + debugfs_create_file("timeout_locked", + 0644, + dev_entry->root, + dev_entry, + &hl_timeout_locked_fops); + for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) { debugfs_create_file(hl_debugfs_list[i].name, 0444, diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 97c7c86580e6..2022e5d7b3ad 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -69,13 +69,6 @@ static void hpriv_release(struct kref *ref) mutex_destroy(&hpriv->restore_phase_mutex); - mutex_lock(&hdev->fpriv_list_lock); - list_del(&hpriv->dev_node); - hdev->compute_ctx = NULL; - mutex_unlock(&hdev->fpriv_list_lock); - - kfree(hpriv); - if ((!hdev->pldm) && (hdev->pdev) && (!hdev->asic_funcs->is_device_idle(hdev, idle_mask, @@ -87,9 +80,32 @@ static void hpriv_release(struct kref *ref) device_is_idle = false; } + /* We need to remove the user from the list to make sure the reset process won't + * try to kill the user process. Because, if we got here, it means there are no + * more driver/device resources that the user process is occupying so there is + * no need to kill it + * + * However, we can't set the compute_ctx to NULL at this stage. This is to prevent + * a race between the release and opening the device again. We don't want to let + * a user open the device while there a reset is about to happen. + */ + mutex_lock(&hdev->fpriv_list_lock); + list_del(&hpriv->dev_node); + mutex_unlock(&hdev->fpriv_list_lock); + if ((hdev->reset_if_device_not_idle && !device_is_idle) || hdev->reset_upon_device_release) hl_device_reset(hdev, HL_RESET_DEVICE_RELEASE); + + /* Now we can mark the compute_ctx as empty. Even if a reset is running in a different + * thread, we don't care because the in_reset is marked so if a user will try to open + * the device it will fail on that, even if compute_ctx is NULL. + */ + mutex_lock(&hdev->fpriv_list_lock); + hdev->compute_ctx = NULL; + mutex_unlock(&hdev->fpriv_list_lock); + + kfree(hpriv); } void hl_hpriv_get(struct hl_fpriv *hpriv) @@ -530,6 +546,19 @@ static void hl_device_heartbeat(struct work_struct *work) return; reschedule: + /* + * prev_reset_trigger tracks consecutive fatal h/w errors until first + * heartbeat immediately post reset. + * If control reached here, then at least one heartbeat work has been + * scheduled since last reset/init cycle. + * So if the device is not already in reset cycle, reset the flag + * prev_reset_trigger as no reset occurred with HL_RESET_FW_FATAL_ERR + * status for at least one heartbeat. From this point driver restarts + * tracking future consecutive fatal errors. + */ + if (!(atomic_read(&hdev->in_reset))) + hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT; + schedule_delayed_work(&hdev->work_heartbeat, usecs_to_jiffies(HL_HEARTBEAT_PER_USEC)); } @@ -909,6 +938,65 @@ static void device_disable_open_processes(struct hl_device *hdev) mutex_unlock(&hdev->fpriv_list_lock); } +static void handle_reset_trigger(struct hl_device *hdev, u32 flags) +{ + u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT; + + /* + * 'reset cause' is being updated here, because getting here + * means that it's the 1st time and the last time we're here + * ('in_reset' makes sure of it). This makes sure that + * 'reset_cause' will continue holding its 1st recorded reason! + */ + if (flags & HL_RESET_HEARTBEAT) { + hdev->curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT; + cur_reset_trigger = HL_RESET_HEARTBEAT; + } else if (flags & HL_RESET_TDR) { + hdev->curr_reset_cause = HL_RESET_CAUSE_TDR; + cur_reset_trigger = HL_RESET_TDR; + } else if (flags & HL_RESET_FW_FATAL_ERR) { + hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; + cur_reset_trigger = HL_RESET_FW_FATAL_ERR; + } else { + hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; + } + + /* + * If reset cause is same twice, then reset_trigger_repeated + * is set and if this reset is due to a fatal FW error + * device is set to an unstable state. + */ + if (hdev->prev_reset_trigger != cur_reset_trigger) { + hdev->prev_reset_trigger = cur_reset_trigger; + hdev->reset_trigger_repeated = 0; + } else { + hdev->reset_trigger_repeated = 1; + } + + /* If reset is due to heartbeat, device CPU is no responsive in + * which case no point sending PCI disable message to it. + * + * If F/W is performing the reset, no need to send it a message to disable + * PCI access + */ + if ((flags & HL_RESET_HARD) && + !(flags & (HL_RESET_HEARTBEAT | HL_RESET_FW))) { + /* Disable PCI access from device F/W so he won't send + * us additional interrupts. We disable MSI/MSI-X at + * the halt_engines function and we can't have the F/W + * sending us interrupts after that. We need to disable + * the access here because if the device is marked + * disable, the message won't be send. Also, in case + * of heartbeat, the device CPU is marked as disable + * so this message won't be sent + */ + if (hl_fw_send_pci_access_msg(hdev, + CPUCP_PACKET_DISABLE_PCI_ACCESS)) + dev_warn(hdev->dev, + "Failed to disable PCI access by F/W\n"); + } +} + /* * hl_device_reset - reset the device * @@ -954,7 +1042,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) goto do_reset; } - if (!hard_reset && !hdev->allow_external_soft_reset) { + if (!hard_reset && !hdev->allow_inference_soft_reset) { hard_instead_soft = true; hard_reset = true; } @@ -978,47 +1066,21 @@ do_reset: if (rc) return 0; - /* - * 'reset cause' is being updated here, because getting here - * means that it's the 1st time and the last time we're here - * ('in_reset' makes sure of it). This makes sure that - * 'reset_cause' will continue holding its 1st recorded reason! - */ - if (flags & HL_RESET_HEARTBEAT) - hdev->curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT; - else if (flags & HL_RESET_TDR) - hdev->curr_reset_cause = HL_RESET_CAUSE_TDR; - else - hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; - - /* If reset is due to heartbeat, device CPU is no responsive in - * which case no point sending PCI disable message to it. - * - * If F/W is performing the reset, no need to send it a message to disable - * PCI access - */ - if (hard_reset && !(flags & (HL_RESET_HEARTBEAT | HL_RESET_FW))) { - /* Disable PCI access from device F/W so he won't send - * us additional interrupts. We disable MSI/MSI-X at - * the halt_engines function and we can't have the F/W - * sending us interrupts after that. We need to disable - * the access here because if the device is marked - * disable, the message won't be send. Also, in case - * of heartbeat, the device CPU is marked as disable - * so this message won't be sent - */ - if (hl_fw_send_pci_access_msg(hdev, - CPUCP_PACKET_DISABLE_PCI_ACCESS)) - dev_warn(hdev->dev, - "Failed to disable PCI access by F/W\n"); - } + handle_reset_trigger(hdev, flags); /* This also blocks future CS/VM/JOB completion operations */ hdev->disabled = true; take_release_locks(hdev); - dev_err(hdev->dev, "Going to RESET device!\n"); + if (hard_reset) + dev_info(hdev->dev, "Going to reset device\n"); + else if (flags & HL_RESET_DEVICE_RELEASE) + dev_info(hdev->dev, + "Going to reset device after it was released by user\n"); + else + dev_info(hdev->dev, + "Going to reset compute engines of inference device\n"); } again: @@ -1108,6 +1170,17 @@ kill_processes: hdev->device_cpu_disabled = false; hdev->hard_reset_pending = false; + if (hdev->reset_trigger_repeated && + (hdev->prev_reset_trigger == HL_RESET_FW_FATAL_ERR)) { + /* if there 2 back to back resets from FW, + * ensure driver puts the driver in a unusable state + */ + dev_crit(hdev->dev, + "Consecutive FW fatal errors received, stopping hard reset\n"); + rc = -EIO; + goto out_err; + } + if (hdev->kernel_ctx) { dev_crit(hdev->dev, "kernel ctx was alive during hard reset, something is terribly wrong\n"); diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 8d2568c63f19..4e68fb9d2a6b 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -2162,18 +2162,17 @@ static void hl_fw_linux_update_state(struct hl_device *hdev, } /** - * hl_fw_dynamic_report_reset_cause - send a COMMS message with the cause - * of the newly triggered hard reset + * hl_fw_dynamic_send_msg - send a COMMS message with attached data * * @hdev: pointer to the habanalabs device structure * @fw_loader: managing structure for loading device's FW - * @reset_cause: enumerated cause for the recent hard reset + * @msg_type: message type + * @data: data to be sent * * @return 0 on success, otherwise non-zero error code */ -static int hl_fw_dynamic_report_reset_cause(struct hl_device *hdev, - struct fw_load_mgr *fw_loader, - enum comms_reset_cause reset_cause) +static int hl_fw_dynamic_send_msg(struct hl_device *hdev, + struct fw_load_mgr *fw_loader, u8 msg_type, void *data) { struct lkd_msg_comms msg; int rc; @@ -2181,11 +2180,20 @@ static int hl_fw_dynamic_report_reset_cause(struct hl_device *hdev, memset(&msg, 0, sizeof(msg)); /* create message to be sent */ - msg.header.type = HL_COMMS_RESET_CAUSE_TYPE; + msg.header.type = msg_type; msg.header.size = cpu_to_le16(sizeof(struct comms_msg_header)); msg.header.magic = cpu_to_le32(HL_COMMS_MSG_MAGIC); - msg.reset_cause = reset_cause; + switch (msg_type) { + case HL_COMMS_RESET_CAUSE_TYPE: + msg.reset_cause = *(__u8 *) data; + break; + default: + dev_err(hdev->dev, + "Send COMMS message - invalid message type %u\n", + msg_type); + return -EINVAL; + } rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader, sizeof(struct lkd_msg_comms)); @@ -2252,8 +2260,8 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, goto protocol_err; if (hdev->curr_reset_cause) { - rc = hl_fw_dynamic_report_reset_cause(hdev, fw_loader, - hdev->curr_reset_cause); + rc = hl_fw_dynamic_send_msg(hdev, fw_loader, + HL_COMMS_RESET_CAUSE_TYPE, &hdev->curr_reset_cause); if (rc) goto protocol_err; diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index bebebcb163ee..a2002cbf794b 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -26,6 +26,7 @@ #include <linux/sched/signal.h> #include <linux/io-64-nonatomic-lo-hi.h> #include <linux/coresight.h> +#include <linux/dma-buf.h> #define HL_NAME "habanalabs" @@ -68,6 +69,9 @@ #define HL_STATE_DUMP_HIST_LEN 5 +/* Default value for device reset trigger , an invalid value */ +#define HL_RESET_TRIGGER_DEFAULT 0xFF + #define OBJ_NAMES_HASH_TABLE_BITS 7 /* 1 << 7 buckets */ #define SYNC_TO_ENGINE_HASH_TABLE_BITS 7 /* 1 << 7 buckets */ @@ -132,13 +136,18 @@ enum hl_mmu_page_table_location { * - HL_RESET_FW * F/W will perform the reset. No need to ask it to reset the device. This is relevant * only when running with secured f/w + * + * - HL_RESET_FW_FATAL_ERR + * Set if reset is due to a fatal error from FW */ + #define HL_RESET_HARD (1 << 0) #define HL_RESET_FROM_RESET_THREAD (1 << 1) #define HL_RESET_HEARTBEAT (1 << 2) #define HL_RESET_TDR (1 << 3) #define HL_RESET_DEVICE_RELEASE (1 << 4) #define HL_RESET_FW (1 << 5) +#define HL_RESET_FW_FATAL_ERR (1 << 6) #define HL_MAX_SOBS_PER_MONITOR 8 @@ -447,6 +456,9 @@ struct hl_hints_range { * for hints validity check. * device_dma_offset_for_host_access: the offset to add to host DMA addresses * to enable the device to access them. + * @max_freq_value: current max clk frequency. + * @clk_pll_index: clock PLL index that specify which PLL determines the clock + * we display to the user * @mmu_pgt_size: MMU page tables total size. * @mmu_pte_size: PTE size in MMU page tables. * @mmu_hop_table_size: MMU hop table size. @@ -543,6 +555,8 @@ struct asic_fixed_properties { u64 cb_va_end_addr; u64 dram_hints_align_mask; u64 device_dma_offset_for_host_access; + u64 max_freq_value; + u32 clk_pll_index; u32 mmu_pgt_size; u32 mmu_pte_size; u32 mmu_hop_table_size; @@ -601,6 +615,9 @@ struct asic_fixed_properties { * masters QIDs that multi cs is waiting on * @error: mark this fence with error * @timestamp: timestamp upon completion + * @mcs_handling_done: indicates that corresponding command submission has + * finished msc handling, this does not mean it was part + * of the mcs */ struct hl_fence { struct completion completion; @@ -609,6 +626,7 @@ struct hl_fence { u32 stream_master_qid_map; int error; ktime_t timestamp; + u8 mcs_handling_done; }; /** @@ -1353,6 +1371,23 @@ struct hl_cs_counters_atomic { }; /** + * struct hl_dmabuf_priv - a dma-buf private object. + * @dmabuf: pointer to dma-buf object. + * @ctx: pointer to the dma-buf owner's context. + * @phys_pg_pack: pointer to physical page pack if the dma-buf was exported for + * memory allocation handle. + * @device_address: physical address of the device's memory. Relevant only + * if phys_pg_pack is NULL (dma-buf was exported from address). + * The total size can be taken from the dmabuf object. + */ +struct hl_dmabuf_priv { + struct dma_buf *dmabuf; + struct hl_ctx *ctx; + struct hl_vm_phys_pg_pack *phys_pg_pack; + uint64_t device_address; +}; + +/** * struct hl_ctx - user/kernel context. * @mem_hash: holds mapping from virtual address to virtual memory area * descriptor (hl_vm_phys_pg_list or hl_userptr). @@ -1662,6 +1697,7 @@ struct hl_vm_hw_block_list_node { * @npages: num physical pages in the pack. * @total_size: total size of all the pages in this list. * @mapping_cnt: number of shared mappings. + * @exporting_cnt: number of dma-buf exporting. * @asid: the context related to this list. * @page_size: size of each page in the pack. * @flags: HL_MEM_* flags related to this list. @@ -1676,6 +1712,7 @@ struct hl_vm_phys_pg_pack { u64 npages; u64 total_size; atomic_t mapping_cnt; + u32 exporting_cnt; u32 asid; u32 page_size; u32 flags; @@ -2396,6 +2433,7 @@ struct multi_cs_data { * the error will be ignored by the driver during * device initialization. Mainly used to debug and * workaround firmware bugs + * @dram_pci_bar_start: start bus address of PCIe bar towards DRAM. * @last_successful_open_jif: timestamp (jiffies) of the last successful * device open. * @last_open_session_duration_jif: duration (jiffies) of the last device open @@ -2440,8 +2478,12 @@ struct multi_cs_data { * @collective_mon_idx: helper index for collective initialization * @supports_coresight: is CoreSight supported. * @supports_soft_reset: is soft reset supported. - * @allow_external_soft_reset: true if soft reset initiated by user or TDR is - * allowed. + * @allow_inference_soft_reset: true if the ASIC supports soft reset that is + * initiated by user or TDR. This is only true + * in inference ASICs, as there is no real-world + * use-case of doing soft-reset in training (due + * to the fact that training runs on multiple + * devices) * @supports_cb_mapping: is mapping a CB to the device's MMU supported. * @needs_reset: true if reset_on_lockup is false and device should be reset * due to lockup. @@ -2452,6 +2494,10 @@ struct multi_cs_data { * @supports_staged_submission: true if staged submissions are supported * @curr_reset_cause: saves an enumerated reset cause when a hard reset is * triggered, and cleared after it is shared with preboot. + * @prev_reset_trigger: saves the previous trigger which caused a reset, overidden + * with a new value on next reset + * @reset_trigger_repeated: set if device reset is triggered more than once with + * same cause. * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to * complete instead. * @device_cpu_is_halted: Flag to indicate whether the device CPU was already @@ -2537,6 +2583,7 @@ struct hl_device { u64 max_power; u64 clock_gating_mask; u64 boot_error_status_mask; + u64 dram_pci_bar_start; u64 last_successful_open_jif; u64 last_open_session_duration_jif; u64 open_counter; @@ -2572,13 +2619,15 @@ struct hl_device { u8 collective_mon_idx; u8 supports_coresight; u8 supports_soft_reset; - u8 allow_external_soft_reset; + u8 allow_inference_soft_reset; u8 supports_cb_mapping; u8 needs_reset; u8 process_kill_trial_cnt; u8 device_fini_pending; u8 supports_staged_submission; u8 curr_reset_cause; + u8 prev_reset_trigger; + u8 reset_trigger_repeated; u8 skip_reset_on_timeout; u8 device_cpu_is_halted; u8 supports_wait_for_multi_cs; @@ -2956,6 +3005,15 @@ int hl_set_voltage(struct hl_device *hdev, int sensor_index, u32 attr, long value); int hl_set_current(struct hl_device *hdev, int sensor_index, u32 attr, long value); +int hl_set_power(struct hl_device *hdev, + int sensor_index, u32 attr, long value); +int hl_get_power(struct hl_device *hdev, + int sensor_index, u32 attr, long *value); +int hl_get_clk_rate(struct hl_device *hdev, + u32 *cur_clk, u32 *max_clk); +void hl_set_pll_profile(struct hl_device *hdev, enum hl_pll_frequency freq); +void hl_add_device_attr(struct hl_device *hdev, + struct attribute_group *dev_attr_grp); void hw_sob_get(struct hl_hw_sob *hw_sob); void hw_sob_put(struct hl_hw_sob *hw_sob); void hl_encaps_handle_do_release(struct kref *ref); diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index a75e4fceb9d8..949d1b5c5c41 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -225,6 +225,17 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp) if (!hpriv) return -ENOMEM; + /* Prevent other routines from reading partial hpriv data by + * initializing hpriv fields before inserting it to the list + */ + hpriv->hdev = hdev; + filp->private_data = hpriv; + hpriv->filp = filp; + hpriv->is_control = true; + nonseekable_open(inode, filp); + + hpriv->taskpid = find_get_pid(current->pid); + mutex_lock(&hdev->fpriv_list_lock); if (!hl_device_operational(hdev, NULL)) { @@ -238,19 +249,15 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp) list_add(&hpriv->dev_node, &hdev->fpriv_list); mutex_unlock(&hdev->fpriv_list_lock); - hpriv->hdev = hdev; - filp->private_data = hpriv; - hpriv->filp = filp; - hpriv->is_control = true; - nonseekable_open(inode, filp); - - hpriv->taskpid = find_get_pid(current->pid); - return 0; out_err: mutex_unlock(&hdev->fpriv_list_lock); + filp->private_data = NULL; + put_pid(hpriv->taskpid); + kfree(hpriv); + return rc; } @@ -339,6 +346,7 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev, set_driver_behavior_per_device(hdev); hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; + hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT; if (timeout_locked) hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000); diff --git a/drivers/misc/habanalabs/common/hwmgr.c b/drivers/misc/habanalabs/common/hwmgr.c new file mode 100644 index 000000000000..5451019f143f --- /dev/null +++ b/drivers/misc/habanalabs/common/hwmgr.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2019-2021 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "habanalabs.h" + +void hl_set_pll_profile(struct hl_device *hdev, enum hl_pll_frequency freq) +{ + hl_set_frequency(hdev, hdev->asic_prop.clk_pll_index, + hdev->asic_prop.max_freq_value); +} + +int hl_get_clk_rate(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk) +{ + long value; + + if (!hl_device_operational(hdev, NULL)) + return -ENODEV; + + value = hl_get_frequency(hdev, hdev->asic_prop.clk_pll_index, false); + + if (value < 0) { + dev_err(hdev->dev, "Failed to retrieve device max clock %ld\n", + value); + return value; + } + + *max_clk = (value / 1000 / 1000); + + value = hl_get_frequency(hdev, hdev->asic_prop.clk_pll_index, true); + + if (value < 0) { + dev_err(hdev->dev, + "Failed to retrieve device current clock %ld\n", + value); + return value; + } + + *cur_clk = (value / 1000 / 1000); + + return 0; +} + +static ssize_t clk_max_freq_mhz_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + long value; + + if (!hl_device_operational(hdev, NULL)) + return -ENODEV; + + value = hl_get_frequency(hdev, hdev->asic_prop.clk_pll_index, false); + + hdev->asic_prop.max_freq_value = value; + + return sprintf(buf, "%lu\n", (value / 1000 / 1000)); +} + +static ssize_t clk_max_freq_mhz_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + int rc; + u64 value; + + if (!hl_device_operational(hdev, NULL)) { + count = -ENODEV; + goto fail; + } + + rc = kstrtoull(buf, 0, &value); + if (rc) { + count = -EINVAL; + goto fail; + } + + hdev->asic_prop.max_freq_value = value * 1000 * 1000; + + hl_set_frequency(hdev, hdev->asic_prop.clk_pll_index, + hdev->asic_prop.max_freq_value); + +fail: + return count; +} + +static ssize_t clk_cur_freq_mhz_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + long value; + + if (!hl_device_operational(hdev, NULL)) + return -ENODEV; + + value = hl_get_frequency(hdev, hdev->asic_prop.clk_pll_index, true); + + return sprintf(buf, "%lu\n", (value / 1000 / 1000)); +} + +static DEVICE_ATTR_RW(clk_max_freq_mhz); +static DEVICE_ATTR_RO(clk_cur_freq_mhz); + +static struct attribute *hl_dev_attrs[] = { + &dev_attr_clk_max_freq_mhz.attr, + &dev_attr_clk_cur_freq_mhz.attr, + NULL, +}; + +void hl_add_device_attr(struct hl_device *hdev, + struct attribute_group *dev_attr_grp) +{ + dev_attr_grp->attrs = hl_dev_attrs; +} diff --git a/drivers/misc/habanalabs/common/hwmon.c b/drivers/misc/habanalabs/common/hwmon.c index 6b421d76b311..e33f65be8a00 100644 --- a/drivers/misc/habanalabs/common/hwmon.c +++ b/drivers/misc/habanalabs/common/hwmon.c @@ -113,6 +113,9 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type, { struct hl_device *hdev = dev_get_drvdata(dev); int rc; + u32 cpucp_attr; + bool use_cpucp_enum = (hdev->asic_prop.fw_app_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_MAP_HWMON_EN) ? true : false; if (!hl_device_operational(hdev, NULL)) return -ENODEV; @@ -121,65 +124,134 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type, case hwmon_temp: switch (attr) { case hwmon_temp_input: + cpucp_attr = cpucp_temp_input; + break; case hwmon_temp_max: + cpucp_attr = cpucp_temp_max; + break; case hwmon_temp_crit: + cpucp_attr = cpucp_temp_crit; + break; case hwmon_temp_max_hyst: + cpucp_attr = cpucp_temp_max_hyst; + break; case hwmon_temp_crit_hyst: + cpucp_attr = cpucp_temp_crit_hyst; + break; case hwmon_temp_offset: + cpucp_attr = cpucp_temp_offset; + break; case hwmon_temp_highest: + cpucp_attr = cpucp_temp_highest; break; default: return -EINVAL; } - rc = hl_get_temperature(hdev, channel, attr, val); + if (use_cpucp_enum) + rc = hl_get_temperature(hdev, channel, cpucp_attr, val); + else + rc = hl_get_temperature(hdev, channel, attr, val); break; case hwmon_in: switch (attr) { case hwmon_in_input: + cpucp_attr = cpucp_in_input; + break; case hwmon_in_min: + cpucp_attr = cpucp_in_min; + break; case hwmon_in_max: + cpucp_attr = cpucp_in_max; + break; case hwmon_in_highest: + cpucp_attr = cpucp_in_highest; break; default: return -EINVAL; } - rc = hl_get_voltage(hdev, channel, attr, val); + if (use_cpucp_enum) + rc = hl_get_voltage(hdev, channel, cpucp_attr, val); + else + rc = hl_get_voltage(hdev, channel, attr, val); break; case hwmon_curr: switch (attr) { case hwmon_curr_input: + cpucp_attr = cpucp_curr_input; + break; case hwmon_curr_min: + cpucp_attr = cpucp_curr_min; + break; case hwmon_curr_max: + cpucp_attr = cpucp_curr_max; + break; case hwmon_curr_highest: + cpucp_attr = cpucp_curr_highest; break; default: return -EINVAL; } - rc = hl_get_current(hdev, channel, attr, val); + if (use_cpucp_enum) + rc = hl_get_current(hdev, channel, cpucp_attr, val); + else + rc = hl_get_current(hdev, channel, attr, val); break; case hwmon_fan: switch (attr) { case hwmon_fan_input: + cpucp_attr = cpucp_fan_input; + break; case hwmon_fan_min: + cpucp_attr = cpucp_fan_min; + break; case hwmon_fan_max: + cpucp_attr = cpucp_fan_max; break; default: return -EINVAL; } - rc = hl_get_fan_speed(hdev, channel, attr, val); + + if (use_cpucp_enum) + rc = hl_get_fan_speed(hdev, channel, cpucp_attr, val); + else + rc = hl_get_fan_speed(hdev, channel, attr, val); break; case hwmon_pwm: switch (attr) { case hwmon_pwm_input: + cpucp_attr = cpucp_pwm_input; + break; case hwmon_pwm_enable: + cpucp_attr = cpucp_pwm_enable; + break; + default: + return -EINVAL; + } + + if (use_cpucp_enum) + rc = hl_get_pwm_info(hdev, channel, cpucp_attr, val); + else + rc = hl_get_pwm_info(hdev, channel, attr, val); + break; + case hwmon_power: + switch (attr) { + case hwmon_power_input: + cpucp_attr = CPUCP_POWER_INPUT; + break; + case hwmon_power_input_highest: + cpucp_attr = CPUCP_POWER_INPUT_HIGHEST; break; default: return -EINVAL; } - rc = hl_get_pwm_info(hdev, channel, attr, val); + + if (use_cpucp_enum) + rc = hl_get_power(hdev, channel, cpucp_attr, val); + else + rc = hl_get_power(hdev, channel, attr, val); break; default: return -EINVAL; @@ -191,6 +263,9 @@ static int hl_write(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, long val) { struct hl_device *hdev = dev_get_drvdata(dev); + u32 cpucp_attr; + bool use_cpucp_enum = (hdev->asic_prop.fw_app_cpu_boot_dev_sts0 & + CPU_BOOT_DEV_STS0_MAP_HWMON_EN) ? true : false; if (!hl_device_operational(hdev, NULL)) return -ENODEV; @@ -199,40 +274,78 @@ static int hl_write(struct device *dev, enum hwmon_sensor_types type, case hwmon_temp: switch (attr) { case hwmon_temp_offset: + cpucp_attr = cpucp_temp_offset; + break; case hwmon_temp_reset_history: + cpucp_attr = cpucp_temp_reset_history; break; default: return -EINVAL; } - hl_set_temperature(hdev, channel, attr, val); + + if (use_cpucp_enum) + hl_set_temperature(hdev, channel, cpucp_attr, val); + else + hl_set_temperature(hdev, channel, attr, val); break; case hwmon_pwm: switch (attr) { case hwmon_pwm_input: + cpucp_attr = cpucp_pwm_input; + break; case hwmon_pwm_enable: + cpucp_attr = cpucp_pwm_enable; break; default: return -EINVAL; } - hl_set_pwm_info(hdev, channel, attr, val); + + if (use_cpucp_enum) + hl_set_pwm_info(hdev, channel, cpucp_attr, val); + else + hl_set_pwm_info(hdev, channel, attr, val); break; case hwmon_in: switch (attr) { case hwmon_in_reset_history: + cpucp_attr = cpucp_in_reset_history; break; default: return -EINVAL; } - hl_set_voltage(hdev, channel, attr, val); + + if (use_cpucp_enum) + hl_set_voltage(hdev, channel, cpucp_attr, val); + else + hl_set_voltage(hdev, channel, attr, val); break; case hwmon_curr: switch (attr) { case hwmon_curr_reset_history: + cpucp_attr = cpucp_curr_reset_history; break; default: return -EINVAL; } - hl_set_current(hdev, channel, attr, val); + + if (use_cpucp_enum) + hl_set_current(hdev, channel, cpucp_attr, val); + else + hl_set_current(hdev, channel, attr, val); + break; + case hwmon_power: + switch (attr) { + case hwmon_power_reset_history: + cpucp_attr = CPUCP_POWER_RESET_INPUT_HISTORY; + break; + default: + return -EINVAL; + } + + if (use_cpucp_enum) + hl_set_power(hdev, channel, cpucp_attr, val); + else + hl_set_power(hdev, channel, attr, val); break; default: return -EINVAL; @@ -296,6 +409,15 @@ static umode_t hl_is_visible(const void *data, enum hwmon_sensor_types type, return 0644; } break; + case hwmon_power: + switch (attr) { + case hwmon_power_input: + case hwmon_power_input_highest: + return 0444; + case hwmon_power_reset_history: + return 0200; + } + break; default: break; } @@ -551,6 +673,60 @@ int hl_set_current(struct hl_device *hdev, return rc; } +int hl_set_power(struct hl_device *hdev, + int sensor_index, u32 attr, long value) +{ + struct cpucp_packet pkt; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.sensor_index = __cpu_to_le16(sensor_index); + pkt.type = __cpu_to_le16(attr); + pkt.value = __cpu_to_le64(value); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + 0, NULL); + + if (rc) + dev_err(hdev->dev, + "Failed to set power of sensor %d, error %d\n", + sensor_index, rc); + + return rc; +} + +int hl_get_power(struct hl_device *hdev, + int sensor_index, u32 attr, long *value) +{ + struct cpucp_packet pkt; + u64 result; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.sensor_index = __cpu_to_le16(sensor_index); + pkt.type = __cpu_to_le16(attr); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + 0, &result); + + *value = (long) result; + + if (rc) { + dev_err(hdev->dev, + "Failed to get power of sensor %d, error %d\n", + sensor_index, rc); + *value = 0; + } + + return rc; +} + int hl_hwmon_init(struct hl_device *hdev) { struct device *dev = hdev->pdev ? &hdev->pdev->dev : hdev->dev; diff --git a/drivers/misc/habanalabs/common/irq.c b/drivers/misc/habanalabs/common/irq.c index 39b14a933393..96d82b682674 100644 --- a/drivers/misc/habanalabs/common/irq.c +++ b/drivers/misc/habanalabs/common/irq.c @@ -141,10 +141,13 @@ static void handle_user_cq(struct hl_device *hdev, struct hl_user_interrupt *user_cq) { struct hl_user_pending_interrupt *pend; + ktime_t now = ktime_get(); spin_lock(&user_cq->wait_list_lock); - list_for_each_entry(pend, &user_cq->wait_list_head, wait_list_node) + list_for_each_entry(pend, &user_cq->wait_list_head, wait_list_node) { + pend->fence.timestamp = now; complete_all(&pend->fence.completion); + } spin_unlock(&user_cq->wait_list_lock); } diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index 33986933aa9e..9bd626a00de3 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright 2016-2019 HabanaLabs, Ltd. + * Copyright 2016-2021 HabanaLabs, Ltd. * All Rights Reserved. */ @@ -11,6 +11,9 @@ #include <linux/uaccess.h> #include <linux/slab.h> +#include <linux/pci-p2pdma.h> + +MODULE_IMPORT_NS(DMA_BUF); #define HL_MMU_DEBUG 0 @@ -347,6 +350,12 @@ static int free_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args) return -EINVAL; } + if (phys_pg_pack->exporting_cnt) { + dev_dbg(hdev->dev, "handle %u is exported, cannot free\n", handle); + spin_unlock(&vm->idr_lock); + return -EINVAL; + } + /* * must remove from idr before the freeing of the physical * pages as the refcount of the pool is also the trigger of the @@ -1487,13 +1496,487 @@ int hl_hw_block_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma) return 0; } +static int set_dma_sg(struct scatterlist *sg, u64 bar_address, u64 chunk_size, + struct device *dev, enum dma_data_direction dir) +{ + dma_addr_t addr; + int rc; + + addr = dma_map_resource(dev, bar_address, chunk_size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + rc = dma_mapping_error(dev, addr); + if (rc) + return rc; + + sg_set_page(sg, NULL, chunk_size, 0); + sg_dma_address(sg) = addr; + sg_dma_len(sg) = chunk_size; + + return 0; +} + +static struct sg_table *alloc_sgt_from_device_pages(struct hl_device *hdev, u64 *pages, u64 npages, + u64 page_size, struct device *dev, + enum dma_data_direction dir) +{ + u64 chunk_size, bar_address, dma_max_seg_size; + struct asic_fixed_properties *prop; + int rc, i, j, nents, cur_page; + struct scatterlist *sg; + struct sg_table *sgt; + + prop = &hdev->asic_prop; + + dma_max_seg_size = dma_get_max_seg_size(dev); + + /* We would like to align the max segment size to PAGE_SIZE, so the + * SGL will contain aligned addresses that can be easily mapped to + * an MMU + */ + dma_max_seg_size = ALIGN_DOWN(dma_max_seg_size, PAGE_SIZE); + if (dma_max_seg_size < PAGE_SIZE) { + dev_err_ratelimited(hdev->dev, + "dma_max_seg_size %llu can't be smaller than PAGE_SIZE\n", + dma_max_seg_size); + return ERR_PTR(-EINVAL); + } + + sgt = kzalloc(sizeof(*sgt), GFP_KERNEL); + if (!sgt) + return ERR_PTR(-ENOMEM); + + /* If the size of each page is larger than the dma max segment size, + * then we can't combine pages and the number of entries in the SGL + * will just be the + * <number of pages> * <chunks of max segment size in each page> + */ + if (page_size > dma_max_seg_size) + nents = npages * DIV_ROUND_UP_ULL(page_size, dma_max_seg_size); + else + /* Get number of non-contiguous chunks */ + for (i = 1, nents = 1, chunk_size = page_size ; i < npages ; i++) { + if (pages[i - 1] + page_size != pages[i] || + chunk_size + page_size > dma_max_seg_size) { + nents++; + chunk_size = page_size; + continue; + } + + chunk_size += page_size; + } + + rc = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO); + if (rc) + goto error_free; + + cur_page = 0; + + if (page_size > dma_max_seg_size) { + u64 size_left, cur_device_address = 0; + + size_left = page_size; + + /* Need to split each page into the number of chunks of + * dma_max_seg_size + */ + for_each_sgtable_dma_sg(sgt, sg, i) { + if (size_left == page_size) + cur_device_address = + pages[cur_page] - prop->dram_base_address; + else + cur_device_address += dma_max_seg_size; + + chunk_size = min(size_left, dma_max_seg_size); + + bar_address = hdev->dram_pci_bar_start + cur_device_address; + + rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir); + if (rc) + goto error_unmap; + + if (size_left > dma_max_seg_size) { + size_left -= dma_max_seg_size; + } else { + cur_page++; + size_left = page_size; + } + } + } else { + /* Merge pages and put them into the scatterlist */ + for_each_sgtable_dma_sg(sgt, sg, i) { + chunk_size = page_size; + for (j = cur_page + 1 ; j < npages ; j++) { + if (pages[j - 1] + page_size != pages[j] || + chunk_size + page_size > dma_max_seg_size) + break; + + chunk_size += page_size; + } + + bar_address = hdev->dram_pci_bar_start + + (pages[cur_page] - prop->dram_base_address); + + rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir); + if (rc) + goto error_unmap; + + cur_page = j; + } + } + + /* Because we are not going to include a CPU list we want to have some + * chance that other users will detect this by setting the orig_nents + * to 0 and using only nents (length of DMA list) when going over the + * sgl + */ + sgt->orig_nents = 0; + + return sgt; + +error_unmap: + for_each_sgtable_dma_sg(sgt, sg, i) { + if (!sg_dma_len(sg)) + continue; + + dma_unmap_resource(dev, sg_dma_address(sg), + sg_dma_len(sg), dir, + DMA_ATTR_SKIP_CPU_SYNC); + } + + sg_free_table(sgt); + +error_free: + kfree(sgt); + return ERR_PTR(rc); +} + +static int hl_dmabuf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + struct hl_dmabuf_priv *hl_dmabuf; + struct hl_device *hdev; + int rc; + + hl_dmabuf = dmabuf->priv; + hdev = hl_dmabuf->ctx->hdev; + + rc = pci_p2pdma_distance_many(hdev->pdev, &attachment->dev, 1, true); + + if (rc < 0) + attachment->peer2peer = false; + return 0; +} + +static struct sg_table *hl_map_dmabuf(struct dma_buf_attachment *attachment, + enum dma_data_direction dir) +{ + struct dma_buf *dma_buf = attachment->dmabuf; + struct hl_vm_phys_pg_pack *phys_pg_pack; + struct hl_dmabuf_priv *hl_dmabuf; + struct hl_device *hdev; + struct sg_table *sgt; + + hl_dmabuf = dma_buf->priv; + hdev = hl_dmabuf->ctx->hdev; + phys_pg_pack = hl_dmabuf->phys_pg_pack; + + if (!attachment->peer2peer) { + dev_dbg(hdev->dev, "Failed to map dmabuf because p2p is disabled\n"); + return ERR_PTR(-EPERM); + } + + if (phys_pg_pack) + sgt = alloc_sgt_from_device_pages(hdev, + phys_pg_pack->pages, + phys_pg_pack->npages, + phys_pg_pack->page_size, + attachment->dev, + dir); + else + sgt = alloc_sgt_from_device_pages(hdev, + &hl_dmabuf->device_address, + 1, + hl_dmabuf->dmabuf->size, + attachment->dev, + dir); + + if (IS_ERR(sgt)) + dev_err(hdev->dev, "failed (%ld) to initialize sgt for dmabuf\n", PTR_ERR(sgt)); + + return sgt; +} + +static void hl_unmap_dmabuf(struct dma_buf_attachment *attachment, + struct sg_table *sgt, + enum dma_data_direction dir) +{ + struct scatterlist *sg; + int i; + + /* The memory behind the dma-buf has *always* resided on the device itself, i.e. it lives + * only in the 'device' domain (after all, it maps a PCI bar address which points to the + * device memory). + * + * Therefore, it was never in the 'CPU' domain and hence, there is no need to perform + * a sync of the memory to the CPU's cache, as it never resided inside that cache. + */ + for_each_sgtable_dma_sg(sgt, sg, i) + dma_unmap_resource(attachment->dev, sg_dma_address(sg), + sg_dma_len(sg), dir, + DMA_ATTR_SKIP_CPU_SYNC); + + /* Need to restore orig_nents because sg_free_table use that field */ + sgt->orig_nents = sgt->nents; + sg_free_table(sgt); + kfree(sgt); +} + +static void hl_release_dmabuf(struct dma_buf *dmabuf) +{ + struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv; + struct hl_ctx *ctx = hl_dmabuf->ctx; + struct hl_device *hdev = ctx->hdev; + struct hl_vm *vm = &hdev->vm; + + if (hl_dmabuf->phys_pg_pack) { + spin_lock(&vm->idr_lock); + hl_dmabuf->phys_pg_pack->exporting_cnt--; + spin_unlock(&vm->idr_lock); + } + + hl_ctx_put(hl_dmabuf->ctx); + + kfree(hl_dmabuf); +} + +static const struct dma_buf_ops habanalabs_dmabuf_ops = { + .attach = hl_dmabuf_attach, + .map_dma_buf = hl_map_dmabuf, + .unmap_dma_buf = hl_unmap_dmabuf, + .release = hl_release_dmabuf, +}; + +static int export_dmabuf_common(struct hl_ctx *ctx, + struct hl_dmabuf_priv *hl_dmabuf, + u64 total_size, int flags, int *dmabuf_fd) +{ + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct hl_device *hdev = ctx->hdev; + int rc, fd; + + exp_info.ops = &habanalabs_dmabuf_ops; + exp_info.size = total_size; + exp_info.flags = flags; + exp_info.priv = hl_dmabuf; + + hl_dmabuf->dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(hl_dmabuf->dmabuf)) { + dev_err(hdev->dev, "failed to export dma-buf\n"); + return PTR_ERR(hl_dmabuf->dmabuf); + } + + fd = dma_buf_fd(hl_dmabuf->dmabuf, flags); + if (fd < 0) { + dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf\n"); + rc = fd; + goto err_dma_buf_put; + } + + hl_dmabuf->ctx = ctx; + hl_ctx_get(hdev, hl_dmabuf->ctx); + + *dmabuf_fd = fd; + + return 0; + +err_dma_buf_put: + dma_buf_put(hl_dmabuf->dmabuf); + return rc; +} + +/** + * export_dmabuf_from_addr() - export a dma-buf object for the given memory + * address and size. + * @ctx: pointer to the context structure. + * @device_addr: device memory physical address. + * @size: size of device memory. + * @flags: DMA-BUF file/FD flags. + * @dmabuf_fd: pointer to result FD that represents the dma-buf object. + * + * Create and export a dma-buf object for an existing memory allocation inside + * the device memory, and return a FD which is associated with the dma-buf + * object. + * + * Return: 0 on success, non-zero for failure. + */ +static int export_dmabuf_from_addr(struct hl_ctx *ctx, u64 device_addr, + u64 size, int flags, int *dmabuf_fd) +{ + struct hl_dmabuf_priv *hl_dmabuf; + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop; + u64 bar_address; + int rc; + + prop = &hdev->asic_prop; + + if (!IS_ALIGNED(device_addr, PAGE_SIZE)) { + dev_dbg(hdev->dev, + "exported device memory address 0x%llx should be aligned to 0x%lx\n", + device_addr, PAGE_SIZE); + return -EINVAL; + } + + if (size < PAGE_SIZE) { + dev_dbg(hdev->dev, + "exported device memory size %llu should be equal to or greater than %lu\n", + size, PAGE_SIZE); + return -EINVAL; + } + + if (device_addr < prop->dram_user_base_address || + device_addr + size > prop->dram_end_address || + device_addr + size < device_addr) { + dev_dbg(hdev->dev, + "DRAM memory range 0x%llx (+0x%llx) is outside of DRAM boundaries\n", + device_addr, size); + return -EINVAL; + } + + bar_address = hdev->dram_pci_bar_start + + (device_addr - prop->dram_base_address); + + if (bar_address + size > + hdev->dram_pci_bar_start + prop->dram_pci_bar_size || + bar_address + size < bar_address) { + dev_dbg(hdev->dev, + "DRAM memory range 0x%llx (+0x%llx) is outside of PCI BAR boundaries\n", + device_addr, size); + return -EINVAL; + } + + hl_dmabuf = kzalloc(sizeof(*hl_dmabuf), GFP_KERNEL); + if (!hl_dmabuf) + return -ENOMEM; + + hl_dmabuf->device_address = device_addr; + + rc = export_dmabuf_common(ctx, hl_dmabuf, size, flags, dmabuf_fd); + if (rc) + goto err_free_dmabuf_wrapper; + + return 0; + +err_free_dmabuf_wrapper: + kfree(hl_dmabuf); + return rc; +} + +/** + * export_dmabuf_from_handle() - export a dma-buf object for the given memory + * handle. + * @ctx: pointer to the context structure. + * @handle: device memory allocation handle. + * @flags: DMA-BUF file/FD flags. + * @dmabuf_fd: pointer to result FD that represents the dma-buf object. + * + * Create and export a dma-buf object for an existing memory allocation inside + * the device memory, and return a FD which is associated with the dma-buf + * object. + * + * Return: 0 on success, non-zero for failure. + */ +static int export_dmabuf_from_handle(struct hl_ctx *ctx, u64 handle, int flags, + int *dmabuf_fd) +{ + struct hl_vm_phys_pg_pack *phys_pg_pack; + struct hl_dmabuf_priv *hl_dmabuf; + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop; + struct hl_vm *vm = &hdev->vm; + u64 bar_address; + int rc, i; + + prop = &hdev->asic_prop; + + if (upper_32_bits(handle)) { + dev_dbg(hdev->dev, "no match for handle 0x%llx\n", handle); + return -EINVAL; + } + + spin_lock(&vm->idr_lock); + + phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, (u32) handle); + if (!phys_pg_pack) { + spin_unlock(&vm->idr_lock); + dev_dbg(hdev->dev, "no match for handle 0x%x\n", (u32) handle); + return -EINVAL; + } + + /* increment now to avoid freeing device memory while exporting */ + phys_pg_pack->exporting_cnt++; + + spin_unlock(&vm->idr_lock); + + if (phys_pg_pack->vm_type != VM_TYPE_PHYS_PACK) { + dev_dbg(hdev->dev, "handle 0x%llx does not represent DRAM memory\n", handle); + rc = -EINVAL; + goto err_dec_exporting_cnt; + } + + for (i = 0 ; i < phys_pg_pack->npages ; i++) { + + bar_address = hdev->dram_pci_bar_start + + (phys_pg_pack->pages[i] - + prop->dram_base_address); + + if (bar_address + phys_pg_pack->page_size > + hdev->dram_pci_bar_start + prop->dram_pci_bar_size || + bar_address + phys_pg_pack->page_size < bar_address) { + + dev_dbg(hdev->dev, + "DRAM memory range 0x%llx (+0x%x) is outside of PCI BAR boundaries\n", + phys_pg_pack->pages[i], + phys_pg_pack->page_size); + + rc = -EINVAL; + goto err_dec_exporting_cnt; + } + } + + hl_dmabuf = kzalloc(sizeof(*hl_dmabuf), GFP_KERNEL); + if (!hl_dmabuf) { + rc = -ENOMEM; + goto err_dec_exporting_cnt; + } + + hl_dmabuf->phys_pg_pack = phys_pg_pack; + + rc = export_dmabuf_common(ctx, hl_dmabuf, phys_pg_pack->total_size, + flags, dmabuf_fd); + if (rc) + goto err_free_dmabuf_wrapper; + + return 0; + +err_free_dmabuf_wrapper: + kfree(hl_dmabuf); + +err_dec_exporting_cnt: + spin_lock(&vm->idr_lock); + phys_pg_pack->exporting_cnt--; + spin_unlock(&vm->idr_lock); + + return rc; +} + static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args) { struct hl_device *hdev = hpriv->hdev; struct hl_ctx *ctx = hpriv->ctx; u64 block_handle, device_addr = 0; u32 handle = 0, block_size; - int rc; + int rc, dmabuf_fd = -EBADF; switch (args->in.op) { case HL_MEM_OP_ALLOC: @@ -1542,6 +2025,16 @@ static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args) args->out.block_size = block_size; break; + case HL_MEM_OP_EXPORT_DMABUF_FD: + rc = export_dmabuf_from_addr(ctx, + args->in.export_dmabuf_fd.handle, + args->in.export_dmabuf_fd.mem_size, + args->in.flags, + &dmabuf_fd); + memset(args, 0, sizeof(*args)); + args->out.fd = dmabuf_fd; + break; + default: dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n"); rc = -ENOTTY; @@ -1560,7 +2053,7 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data) struct hl_ctx *ctx = hpriv->ctx; u64 block_handle, device_addr = 0; u32 handle = 0, block_size; - int rc; + int rc, dmabuf_fd = -EBADF; if (!hl_device_operational(hdev, &status)) { dev_warn_ratelimited(hdev->dev, @@ -1651,6 +2144,22 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data) args->out.block_size = block_size; break; + case HL_MEM_OP_EXPORT_DMABUF_FD: + if (hdev->asic_prop.dram_supports_virtual_memory) + rc = export_dmabuf_from_handle(ctx, + args->in.export_dmabuf_fd.handle, + args->in.flags, + &dmabuf_fd); + else + rc = export_dmabuf_from_addr(ctx, + args->in.export_dmabuf_fd.handle, + args->in.export_dmabuf_fd.mem_size, + args->in.flags, + &dmabuf_fd); + memset(args, 0, sizeof(*args)); + args->out.fd = dmabuf_fd; + break; + default: dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n"); rc = -ENOTTY; diff --git a/drivers/misc/habanalabs/common/mmu/mmu.c b/drivers/misc/habanalabs/common/mmu/mmu.c index 792d25b79ea6..aa96917f62e5 100644 --- a/drivers/misc/habanalabs/common/mmu/mmu.c +++ b/drivers/misc/habanalabs/common/mmu/mmu.c @@ -501,23 +501,25 @@ static void hl_mmu_pa_page_with_offset(struct hl_ctx *ctx, u64 virt_addr, if ((hops->range_type == HL_VA_RANGE_TYPE_DRAM) && !is_power_of_2(prop->dram_page_size)) { - unsigned long dram_page_size = prop->dram_page_size; - u64 page_offset_mask; - u64 phys_addr_mask; - u32 bit; + u64 dram_page_size, dram_base, abs_phys_addr, abs_virt_addr, + page_id, page_start; + u32 page_off; /* - * find last set bit in page_size to cover all bits of page - * offset. note that 1 has to be added to bit index. - * note that the internal ulong variable is used to avoid - * alignment issue. + * Bit arithmetics cannot be used for non power of two page + * sizes. In addition, since bit arithmetics is not used, + * we cannot ignore dram base. All that shall be considerd. */ - bit = find_last_bit(&dram_page_size, - sizeof(dram_page_size) * BITS_PER_BYTE) + 1; - page_offset_mask = (BIT_ULL(bit) - 1); - phys_addr_mask = ~page_offset_mask; - *phys_addr = (tmp_phys_addr & phys_addr_mask) | - (virt_addr & page_offset_mask); + + dram_page_size = prop->dram_page_size; + dram_base = prop->dram_base_address; + abs_phys_addr = tmp_phys_addr - dram_base; + abs_virt_addr = virt_addr - dram_base; + page_id = DIV_ROUND_DOWN_ULL(abs_phys_addr, dram_page_size); + page_start = page_id * dram_page_size; + div_u64_rem(abs_virt_addr, dram_page_size, &page_off); + + *phys_addr = page_start + page_off + dram_base; } else { /* * find the correct hop shift field in hl_mmu_properties diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c index 34f9f2779962..42c1769ad25d 100644 --- a/drivers/misc/habanalabs/common/sysfs.c +++ b/drivers/misc/habanalabs/common/sysfs.c @@ -206,12 +206,12 @@ static ssize_t soft_reset_store(struct device *dev, goto out; } - if (!hdev->allow_external_soft_reset) { - dev_err(hdev->dev, "Device does not support soft-reset\n"); + if (!hdev->allow_inference_soft_reset) { + dev_err(hdev->dev, "Device does not support inference soft-reset\n"); goto out; } - dev_warn(hdev->dev, "Soft-Reset requested through sysfs\n"); + dev_warn(hdev->dev, "Inference Soft-Reset requested through sysfs\n"); hl_device_reset(hdev, 0); |