diff options
Diffstat (limited to 'drivers/misc/habanalabs')
-rw-r--r-- | drivers/misc/habanalabs/command_submission.c | 2 | ||||
-rw-r--r-- | drivers/misc/habanalabs/device.c | 5 | ||||
-rw-r--r-- | drivers/misc/habanalabs/firmware_if.c | 22 | ||||
-rw-r--r-- | drivers/misc/habanalabs/goya/goya.c | 77 | ||||
-rw-r--r-- | drivers/misc/habanalabs/goya/goyaP.h | 2 | ||||
-rw-r--r-- | drivers/misc/habanalabs/habanalabs.h | 25 | ||||
-rw-r--r-- | drivers/misc/habanalabs/hw_queue.c | 14 | ||||
-rw-r--r-- | drivers/misc/habanalabs/include/goya/goya_packets.h | 13 | ||||
-rw-r--r-- | drivers/misc/habanalabs/irq.c | 27 | ||||
-rw-r--r-- | drivers/misc/habanalabs/memory.c | 2 |
10 files changed, 112 insertions, 77 deletions
diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c index 6ad83d5ef4b0..f00d1c32f6d6 100644 --- a/drivers/misc/habanalabs/command_submission.c +++ b/drivers/misc/habanalabs/command_submission.c @@ -683,7 +683,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) rc = hl_poll_timeout_memory(hdev, &ctx->thread_ctx_switch_wait_token, tmp, (tmp == 1), - 100, jiffies_to_usecs(hdev->timeout_jiffies)); + 100, jiffies_to_usecs(hdev->timeout_jiffies), false); if (rc == -ETIMEDOUT) { dev_err(hdev->dev, diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c index 0c4894dd9c02..7a8f9d0b71b5 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/device.c @@ -970,7 +970,8 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) rc = hl_ctx_init(hdev, hdev->kernel_ctx, true); if (rc) { dev_err(hdev->dev, "failed to initialize kernel context\n"); - goto free_ctx; + kfree(hdev->kernel_ctx); + goto mmu_fini; } rc = hl_cb_pool_init(hdev); @@ -1053,8 +1054,6 @@ release_ctx: if (hl_ctx_put(hdev->kernel_ctx) != 1) dev_err(hdev->dev, "kernel ctx is still alive on initialization failure\n"); -free_ctx: - kfree(hdev->kernel_ctx); mmu_fini: hl_mmu_fini(hdev); eq_fini: diff --git a/drivers/misc/habanalabs/firmware_if.c b/drivers/misc/habanalabs/firmware_if.c index cc8168bacb24..ea2ca67fbfbf 100644 --- a/drivers/misc/habanalabs/firmware_if.c +++ b/drivers/misc/habanalabs/firmware_if.c @@ -24,7 +24,7 @@ int hl_fw_push_fw_to_device(struct hl_device *hdev, const char *fw_name, { const struct firmware *fw; const u64 *fw_data; - size_t fw_size, i; + size_t fw_size; int rc; rc = request_firmware(&fw, fw_name, hdev->dev); @@ -45,22 +45,7 @@ int hl_fw_push_fw_to_device(struct hl_device *hdev, const char *fw_name, fw_data = (const u64 *) fw->data; - if ((fw->size % 8) != 0) - fw_size -= 8; - - for (i = 0 ; i < fw_size ; i += 8, fw_data++, dst += 8) { - if (!(i & (0x80000 - 1))) { - dev_dbg(hdev->dev, - "copied so far %zu out of %zu for %s firmware", - i, fw_size, fw_name); - usleep_range(20, 100); - } - - writeq(*fw_data, dst); - } - - if ((fw->size % 8) != 0) - writel(*(const u32 *) fw_data, dst); + memcpy_toio(dst, fw_data, fw_size); out: release_firmware(fw); @@ -112,7 +97,8 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, } rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp, - (tmp == ARMCP_PACKET_FENCE_VAL), 1000, timeout); + (tmp == ARMCP_PACKET_FENCE_VAL), 1000, + timeout, true); hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id); diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 1a2c062a57d4..271c5c8f53b4 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -2729,9 +2729,10 @@ void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi) GOYA_ASYNC_EVENT_ID_PI_UPDATE); } -void goya_flush_pq_write(struct hl_device *hdev, u64 *pq, u64 exp_val) +void goya_pqe_write(struct hl_device *hdev, __le64 *pqe, struct hl_bd *bd) { - /* Not needed in Goya */ + /* The QMANs are on the SRAM so need to copy to IO space */ + memcpy_toio((void __iomem *) pqe, bd, sizeof(struct hl_bd)); } static void *goya_dma_alloc_coherent(struct hl_device *hdev, size_t size, @@ -2864,7 +2865,8 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job) } rc = hl_poll_timeout_memory(hdev, fence_ptr, tmp, - (tmp == GOYA_QMAN0_FENCE_VAL), 1000, timeout); + (tmp == GOYA_QMAN0_FENCE_VAL), 1000, + timeout, true); hl_hw_queue_inc_ci_kernel(hdev, GOYA_QUEUE_ID_DMA_0); @@ -2945,7 +2947,7 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id) } rc = hl_poll_timeout_memory(hdev, fence_ptr, tmp, (tmp == fence_val), - 1000, GOYA_TEST_QUEUE_WAIT_USEC); + 1000, GOYA_TEST_QUEUE_WAIT_USEC, true); hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id); @@ -3312,9 +3314,11 @@ static int goya_validate_dma_pkt_no_mmu(struct hl_device *hdev, int rc; dev_dbg(hdev->dev, "DMA packet details:\n"); - dev_dbg(hdev->dev, "source == 0x%llx\n", user_dma_pkt->src_addr); - dev_dbg(hdev->dev, "destination == 0x%llx\n", user_dma_pkt->dst_addr); - dev_dbg(hdev->dev, "size == %u\n", user_dma_pkt->tsize); + dev_dbg(hdev->dev, "source == 0x%llx\n", + le64_to_cpu(user_dma_pkt->src_addr)); + dev_dbg(hdev->dev, "destination == 0x%llx\n", + le64_to_cpu(user_dma_pkt->dst_addr)); + dev_dbg(hdev->dev, "size == %u\n", le32_to_cpu(user_dma_pkt->tsize)); ctl = le32_to_cpu(user_dma_pkt->ctl); user_dir = (ctl & GOYA_PKT_LIN_DMA_CTL_DMA_DIR_MASK) >> @@ -3343,9 +3347,11 @@ static int goya_validate_dma_pkt_mmu(struct hl_device *hdev, struct packet_lin_dma *user_dma_pkt) { dev_dbg(hdev->dev, "DMA packet details:\n"); - dev_dbg(hdev->dev, "source == 0x%llx\n", user_dma_pkt->src_addr); - dev_dbg(hdev->dev, "destination == 0x%llx\n", user_dma_pkt->dst_addr); - dev_dbg(hdev->dev, "size == %u\n", user_dma_pkt->tsize); + dev_dbg(hdev->dev, "source == 0x%llx\n", + le64_to_cpu(user_dma_pkt->src_addr)); + dev_dbg(hdev->dev, "destination == 0x%llx\n", + le64_to_cpu(user_dma_pkt->dst_addr)); + dev_dbg(hdev->dev, "size == %u\n", le32_to_cpu(user_dma_pkt->tsize)); /* * WA for HW-23. @@ -3385,7 +3391,8 @@ static int goya_validate_wreg32(struct hl_device *hdev, dev_dbg(hdev->dev, "WREG32 packet details:\n"); dev_dbg(hdev->dev, "reg_offset == 0x%x\n", reg_offset); - dev_dbg(hdev->dev, "value == 0x%x\n", wreg_pkt->value); + dev_dbg(hdev->dev, "value == 0x%x\n", + le32_to_cpu(wreg_pkt->value)); if (reg_offset != (mmDMA_CH_0_WR_COMP_ADDR_LO & 0x1FFF)) { dev_err(hdev->dev, "WREG32 packet with illegal address 0x%x\n", @@ -3427,12 +3434,13 @@ static int goya_validate_cb(struct hl_device *hdev, while (cb_parsed_length < parser->user_cb_size) { enum packet_id pkt_id; u16 pkt_size; - void *user_pkt; + struct goya_packet *user_pkt; - user_pkt = (void *) (uintptr_t) + user_pkt = (struct goya_packet *) (uintptr_t) (parser->user_cb->kernel_address + cb_parsed_length); - pkt_id = (enum packet_id) (((*(u64 *) user_pkt) & + pkt_id = (enum packet_id) ( + (le64_to_cpu(user_pkt->header) & PACKET_HEADER_PACKET_ID_MASK) >> PACKET_HEADER_PACKET_ID_SHIFT); @@ -3452,7 +3460,8 @@ static int goya_validate_cb(struct hl_device *hdev, * need to validate here as well because patch_cb() is * not called in MMU path while this function is called */ - rc = goya_validate_wreg32(hdev, parser, user_pkt); + rc = goya_validate_wreg32(hdev, + parser, (struct packet_wreg32 *) user_pkt); break; case PACKET_WREG_BULK: @@ -3480,10 +3489,10 @@ static int goya_validate_cb(struct hl_device *hdev, case PACKET_LIN_DMA: if (is_mmu) rc = goya_validate_dma_pkt_mmu(hdev, parser, - user_pkt); + (struct packet_lin_dma *) user_pkt); else rc = goya_validate_dma_pkt_no_mmu(hdev, parser, - user_pkt); + (struct packet_lin_dma *) user_pkt); break; case PACKET_MSG_LONG: @@ -3656,15 +3665,16 @@ static int goya_patch_cb(struct hl_device *hdev, enum packet_id pkt_id; u16 pkt_size; u32 new_pkt_size = 0; - void *user_pkt, *kernel_pkt; + struct goya_packet *user_pkt, *kernel_pkt; - user_pkt = (void *) (uintptr_t) + user_pkt = (struct goya_packet *) (uintptr_t) (parser->user_cb->kernel_address + cb_parsed_length); - kernel_pkt = (void *) (uintptr_t) + kernel_pkt = (struct goya_packet *) (uintptr_t) (parser->patched_cb->kernel_address + cb_patched_cur_length); - pkt_id = (enum packet_id) (((*(u64 *) user_pkt) & + pkt_id = (enum packet_id) ( + (le64_to_cpu(user_pkt->header) & PACKET_HEADER_PACKET_ID_MASK) >> PACKET_HEADER_PACKET_ID_SHIFT); @@ -3679,15 +3689,18 @@ static int goya_patch_cb(struct hl_device *hdev, switch (pkt_id) { case PACKET_LIN_DMA: - rc = goya_patch_dma_packet(hdev, parser, user_pkt, - kernel_pkt, &new_pkt_size); + rc = goya_patch_dma_packet(hdev, parser, + (struct packet_lin_dma *) user_pkt, + (struct packet_lin_dma *) kernel_pkt, + &new_pkt_size); cb_patched_cur_length += new_pkt_size; break; case PACKET_WREG_32: memcpy(kernel_pkt, user_pkt, pkt_size); cb_patched_cur_length += pkt_size; - rc = goya_validate_wreg32(hdev, parser, kernel_pkt); + rc = goya_validate_wreg32(hdev, parser, + (struct packet_wreg32 *) kernel_pkt); break; case PACKET_WREG_BULK: @@ -4351,6 +4364,8 @@ static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr, size_t total_pkt_size; long result; int rc; + int irq_num_entries, irq_arr_index; + __le32 *goya_irq_arr; total_pkt_size = sizeof(struct armcp_unmask_irq_arr_packet) + irq_arr_size; @@ -4368,8 +4383,16 @@ static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr, if (!pkt) return -ENOMEM; - pkt->length = cpu_to_le32(irq_arr_size / sizeof(irq_arr[0])); - memcpy(&pkt->irqs, irq_arr, irq_arr_size); + irq_num_entries = irq_arr_size / sizeof(irq_arr[0]); + pkt->length = cpu_to_le32(irq_num_entries); + + /* We must perform any necessary endianness conversation on the irq + * array being passed to the goya hardware + */ + for (irq_arr_index = 0, goya_irq_arr = (__le32 *) &pkt->irqs; + irq_arr_index < irq_num_entries ; irq_arr_index++) + goya_irq_arr[irq_arr_index] = + cpu_to_le32(irq_arr[irq_arr_index]); pkt->armcp_pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY << ARMCP_PKT_CTL_OPCODE_SHIFT); @@ -5041,7 +5064,7 @@ static const struct hl_asic_funcs goya_funcs = { .resume = goya_resume, .cb_mmap = goya_cb_mmap, .ring_doorbell = goya_ring_doorbell, - .flush_pq_write = goya_flush_pq_write, + .pqe_write = goya_pqe_write, .asic_dma_alloc_coherent = goya_dma_alloc_coherent, .asic_dma_free_coherent = goya_dma_free_coherent, .get_int_queue_base = goya_get_int_queue_base, diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h index f8c611883dc1..d7f48c9c41cd 100644 --- a/drivers/misc/habanalabs/goya/goyaP.h +++ b/drivers/misc/habanalabs/goya/goyaP.h @@ -177,7 +177,7 @@ int goya_late_init(struct hl_device *hdev); void goya_late_fini(struct hl_device *hdev); void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi); -void goya_flush_pq_write(struct hl_device *hdev, u64 *pq, u64 exp_val); +void goya_pqe_write(struct hl_device *hdev, __le64 *pqe, struct hl_bd *bd); void goya_update_eq_ci(struct hl_device *hdev, u32 val); void goya_restore_phase_topology(struct hl_device *hdev); int goya_context_switch(struct hl_device *hdev, u32 asid); diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 10da9940ee0d..ce83adafcf2d 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -441,7 +441,11 @@ enum hl_pll_frequency { * @resume: handles IP specific H/W or SW changes for resume. * @cb_mmap: maps a CB. * @ring_doorbell: increment PI on a given QMAN. - * @flush_pq_write: flush PQ entry write if necessary, WARN if flushing failed. + * @pqe_write: Write the PQ entry to the PQ. This is ASIC-specific + * function because the PQs are located in different memory areas + * per ASIC (SRAM, DRAM, Host memory) and therefore, the method of + * writing the PQE must match the destination memory area + * properties. * @asic_dma_alloc_coherent: Allocate coherent DMA memory by calling * dma_alloc_coherent(). This is ASIC function because * its implementation is not trivial when the driver @@ -510,7 +514,8 @@ struct hl_asic_funcs { int (*cb_mmap)(struct hl_device *hdev, struct vm_area_struct *vma, u64 kaddress, phys_addr_t paddress, u32 size); void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi); - void (*flush_pq_write)(struct hl_device *hdev, u64 *pq, u64 exp_val); + void (*pqe_write)(struct hl_device *hdev, __le64 *pqe, + struct hl_bd *bd); void* (*asic_dma_alloc_coherent)(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, gfp_t flag); void (*asic_dma_free_coherent)(struct hl_device *hdev, size_t size, @@ -1062,9 +1067,17 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); /* * address in this macro points always to a memory location in the * host's (server's) memory. That location is updated asynchronously - * either by the direct access of the device or by another core + * either by the direct access of the device or by another core. + * + * To work both in LE and BE architectures, we need to distinguish between the + * two states (device or another core updates the memory location). Therefore, + * if mem_written_by_device is true, the host memory being polled will be + * updated directly by the device. If false, the host memory being polled will + * be updated by host CPU. Required so host knows whether or not the memory + * might need to be byte-swapped before returning value to caller. */ -#define hl_poll_timeout_memory(hdev, addr, val, cond, sleep_us, timeout_us) \ +#define hl_poll_timeout_memory(hdev, addr, val, cond, sleep_us, timeout_us, \ + mem_written_by_device) \ ({ \ ktime_t __timeout; \ /* timeout should be longer when working with simulator */ \ @@ -1077,10 +1090,14 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); /* Verify we read updates done by other cores or by device */ \ mb(); \ (val) = *((u32 *) (uintptr_t) (addr)); \ + if (mem_written_by_device) \ + (val) = le32_to_cpu(val); \ if (cond) \ break; \ if (timeout_us && ktime_compare(ktime_get(), __timeout) > 0) { \ (val) = *((u32 *) (uintptr_t) (addr)); \ + if (mem_written_by_device) \ + (val) = le32_to_cpu(val); \ break; \ } \ if (sleep_us) \ diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c index e3b5517897ea..5f5673b74985 100644 --- a/drivers/misc/habanalabs/hw_queue.c +++ b/drivers/misc/habanalabs/hw_queue.c @@ -290,23 +290,19 @@ static void int_hw_queue_schedule_job(struct hl_cs_job *job) struct hl_device *hdev = job->cs->ctx->hdev; struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id]; struct hl_bd bd; - u64 *pi, *pbd = (u64 *) &bd; + __le64 *pi; bd.ctl = 0; - bd.len = __cpu_to_le32(job->job_cb_size); - bd.ptr = __cpu_to_le64((u64) (uintptr_t) job->user_cb); + bd.len = cpu_to_le32(job->job_cb_size); + bd.ptr = cpu_to_le64((u64) (uintptr_t) job->user_cb); - pi = (u64 *) (uintptr_t) (q->kernel_address + + pi = (__le64 *) (uintptr_t) (q->kernel_address + ((q->pi & (q->int_queue_len - 1)) * sizeof(bd))); - pi[0] = pbd[0]; - pi[1] = pbd[1]; - q->pi++; q->pi &= ((q->int_queue_len << 1) - 1); - /* Flush PQ entry write. Relevant only for specific ASICs */ - hdev->asic_funcs->flush_pq_write(hdev, pi, pbd[0]); + hdev->asic_funcs->pqe_write(hdev, pi, &bd); hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi); } diff --git a/drivers/misc/habanalabs/include/goya/goya_packets.h b/drivers/misc/habanalabs/include/goya/goya_packets.h index a14407b975e4..ef54bad20509 100644 --- a/drivers/misc/habanalabs/include/goya/goya_packets.h +++ b/drivers/misc/habanalabs/include/goya/goya_packets.h @@ -52,6 +52,19 @@ enum goya_dma_direction { #define GOYA_PKT_CTL_MB_SHIFT 31 #define GOYA_PKT_CTL_MB_MASK 0x80000000 +/* All packets have, at least, an 8-byte header, which contains + * the packet type. The kernel driver uses the packet header for packet + * validation and to perform any necessary required preparation before + * sending them off to the hardware. + */ +struct goya_packet { + __le64 header; + /* The rest of the packet data follows. Use the corresponding + * packet_XXX struct to deference the data, based on packet type + */ + u8 contents[0]; +}; + struct packet_nop { __le32 reserved; __le32 ctl; diff --git a/drivers/misc/habanalabs/irq.c b/drivers/misc/habanalabs/irq.c index ea9f72ff456c..199791b57caf 100644 --- a/drivers/misc/habanalabs/irq.c +++ b/drivers/misc/habanalabs/irq.c @@ -80,8 +80,7 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg) struct hl_cs_job *job; bool shadow_index_valid; u16 shadow_index; - u32 *cq_entry; - u32 *cq_base; + struct hl_cq_entry *cq_entry, *cq_base; if (hdev->disabled) { dev_dbg(hdev->dev, @@ -90,29 +89,29 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg) return IRQ_HANDLED; } - cq_base = (u32 *) (uintptr_t) cq->kernel_address; + cq_base = (struct hl_cq_entry *) (uintptr_t) cq->kernel_address; while (1) { - bool entry_ready = ((cq_base[cq->ci] & CQ_ENTRY_READY_MASK) + bool entry_ready = ((le32_to_cpu(cq_base[cq->ci].data) & + CQ_ENTRY_READY_MASK) >> CQ_ENTRY_READY_SHIFT); if (!entry_ready) break; - cq_entry = (u32 *) &cq_base[cq->ci]; + cq_entry = (struct hl_cq_entry *) &cq_base[cq->ci]; - /* - * Make sure we read CQ entry contents after we've + /* Make sure we read CQ entry contents after we've * checked the ownership bit. */ dma_rmb(); - shadow_index_valid = - ((*cq_entry & CQ_ENTRY_SHADOW_INDEX_VALID_MASK) + shadow_index_valid = ((le32_to_cpu(cq_entry->data) & + CQ_ENTRY_SHADOW_INDEX_VALID_MASK) >> CQ_ENTRY_SHADOW_INDEX_VALID_SHIFT); - shadow_index = (u16) - ((*cq_entry & CQ_ENTRY_SHADOW_INDEX_MASK) + shadow_index = (u16) ((le32_to_cpu(cq_entry->data) & + CQ_ENTRY_SHADOW_INDEX_MASK) >> CQ_ENTRY_SHADOW_INDEX_SHIFT); queue = &hdev->kernel_queues[cq->hw_queue_id]; @@ -122,8 +121,7 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg) queue_work(hdev->cq_wq, &job->finish_work); } - /* - * Update ci of the context's queue. There is no + /* Update ci of the context's queue. There is no * need to protect it with spinlock because this update is * done only inside IRQ and there is a different IRQ per * queue @@ -131,7 +129,8 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg) queue->ci = hl_queue_inc_ptr(queue->ci); /* Clear CQ entry ready bit */ - cq_base[cq->ci] &= ~CQ_ENTRY_READY_MASK; + cq_entry->data = cpu_to_le32(le32_to_cpu(cq_entry->data) & + ~CQ_ENTRY_READY_MASK); cq->ci = hl_cq_inc_ptr(cq->ci); diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c index 42d237cae1dc..365fb0cb8dff 100644 --- a/drivers/misc/habanalabs/memory.c +++ b/drivers/misc/habanalabs/memory.c @@ -1629,6 +1629,8 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx) dev_dbg(hdev->dev, "page list 0x%p of asid %d is still alive\n", phys_pg_list, ctx->asid); + atomic64_sub(phys_pg_list->total_size, + &hdev->dram_used_mem); free_phys_pg_pack(hdev, phys_pg_list); idr_remove(&vm->phys_pg_pack_handles, i); } |