diff options
Diffstat (limited to 'drivers/gpu/drm/xe/xe_migrate.c')
-rw-r--r-- | drivers/gpu/drm/xe/xe_migrate.c | 391 |
1 files changed, 381 insertions, 10 deletions
diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index cfd31ae49cc1..8f8e9fdfb2a8 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -97,7 +97,7 @@ struct xe_exec_queue *xe_tile_migrate_exec_queue(struct xe_tile *tile) return tile->migrate->q; } -static void xe_migrate_fini(struct drm_device *dev, void *arg) +static void xe_migrate_fini(void *arg) { struct xe_migrate *m = arg; @@ -209,7 +209,7 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, num_entries * XE_PAGE_SIZE, ttm_bo_type_kernel, XE_BO_FLAG_VRAM_IF_DGFX(tile) | - XE_BO_FLAG_PINNED); + XE_BO_FLAG_PAGETABLE); if (IS_ERR(bo)) return PTR_ERR(bo); @@ -400,7 +400,7 @@ struct xe_migrate *xe_migrate_init(struct xe_tile *tile) struct xe_vm *vm; int err; - m = drmm_kzalloc(&xe->drm, sizeof(*m), GFP_KERNEL); + m = devm_kzalloc(xe->drm.dev, sizeof(*m), GFP_KERNEL); if (!m) return ERR_PTR(-ENOMEM); @@ -454,7 +454,7 @@ struct xe_migrate *xe_migrate_init(struct xe_tile *tile) might_lock(&m->job_mutex); fs_reclaim_release(GFP_KERNEL); - err = drmm_add_action_or_reset(&xe->drm, xe_migrate_fini, m); + err = devm_add_action_or_reset(xe->drm.dev, xe_migrate_fini, m); if (err) return ERR_PTR(err); @@ -669,6 +669,7 @@ static void emit_copy(struct xe_gt *gt, struct xe_bb *bb, u32 mocs = 0; u32 tile_y = 0; + xe_gt_assert(gt, !(pitch & 3)); xe_gt_assert(gt, size / pitch <= S16_MAX); xe_gt_assert(gt, pitch / 4 <= S16_MAX); xe_gt_assert(gt, pitch <= U16_MAX); @@ -778,10 +779,12 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m, bool dst_is_pltt = dst->mem_type == XE_PL_TT; bool src_is_vram = mem_type_is_vram(src->mem_type); bool dst_is_vram = mem_type_is_vram(dst->mem_type); + bool type_device = src_bo->ttm.type == ttm_bo_type_device; + bool needs_ccs_emit = type_device && xe_migrate_needs_ccs_emit(xe); bool copy_ccs = xe_device_has_flat_ccs(xe) && xe_bo_needs_ccs_pages(src_bo) && xe_bo_needs_ccs_pages(dst_bo); bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram); - bool use_comp_pat = xe_device_has_flat_ccs(xe) && + bool use_comp_pat = type_device && xe_device_has_flat_ccs(xe) && GRAPHICS_VER(xe) >= 20 && src_is_vram && !dst_is_vram; /* Copying CCS between two different BOs is not supported yet. */ @@ -838,6 +841,7 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m, avail_pts, avail_pts); if (copy_system_ccs) { + xe_assert(xe, type_device); ccs_size = xe_device_ccs_bytes(xe, src_L0); batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size, &ccs_ofs, &ccs_pt, 0, @@ -848,7 +852,7 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m, /* Add copy commands size here */ batch_size += ((copy_only_ccs) ? 0 : EMIT_COPY_DW) + - ((xe_migrate_needs_ccs_emit(xe) ? EMIT_COPY_CCS_DW : 0)); + ((needs_ccs_emit ? EMIT_COPY_CCS_DW : 0)); bb = xe_bb_new(gt, batch_size, usm); if (IS_ERR(bb)) { @@ -877,7 +881,7 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m, if (!copy_only_ccs) emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0, XE_PAGE_SIZE); - if (xe_migrate_needs_ccs_emit(xe)) + if (needs_ccs_emit) flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, IS_DGFX(xe) ? src_is_vram : src_is_pltt, dst_L0_ofs, @@ -1176,7 +1180,7 @@ err: err_sync: /* Sync partial copies if any. FIXME: job_mutex? */ if (fence) { - dma_fence_wait(m->fence, false); + dma_fence_wait(fence, false); dma_fence_put(fence); } @@ -1350,6 +1354,7 @@ __xe_migrate_update_pgtables(struct xe_migrate *m, /* For sysmem PTE's, need to map them in our hole.. */ if (!IS_DGFX(xe)) { + u16 pat_index = xe->pat.idx[XE_CACHE_WB]; u32 ptes, ofs; ppgtt_ofs = NUM_KERNEL_PDE - 1; @@ -1409,7 +1414,7 @@ __xe_migrate_update_pgtables(struct xe_migrate *m, pt_bo->update_index = current_update; addr = vm->pt_ops->pte_encode_bo(pt_bo, 0, - XE_CACHE_WB, 0); + pat_index, 0); bb->cs[bb->len++] = lower_32_bits(addr); bb->cs[bb->len++] = upper_32_bits(addr); } @@ -1504,7 +1509,7 @@ err_bb: * using the default engine for the updates, they will be performed in the * order they grab the job_mutex. If different engines are used, external * synchronization is needed for overlapping updates to maintain page-table - * consistency. Note that the meaing of "overlapping" is that the updates + * consistency. Note that the meaning of "overlapping" is that the updates * touch the same page-table, which might be a higher-level page-directory. * If no pipelining is needed, then updates may be performed by the cpu. * @@ -1542,6 +1547,372 @@ void xe_migrate_wait(struct xe_migrate *m) dma_fence_wait(m->fence, false); } +static u32 pte_update_cmd_size(u64 size) +{ + u32 num_dword; + u64 entries = DIV_U64_ROUND_UP(size, XE_PAGE_SIZE); + + XE_WARN_ON(size > MAX_PREEMPTDISABLE_TRANSFER); + /* + * MI_STORE_DATA_IMM command is used to update page table. Each + * instruction can update maximumly 0x1ff pte entries. To update + * n (n <= 0x1ff) pte entries, we need: + * 1 dword for the MI_STORE_DATA_IMM command header (opcode etc) + * 2 dword for the page table's physical location + * 2*n dword for value of pte to fill (each pte entry is 2 dwords) + */ + num_dword = (1 + 2) * DIV_U64_ROUND_UP(entries, 0x1ff); + num_dword += entries * 2; + + return num_dword; +} + +static void build_pt_update_batch_sram(struct xe_migrate *m, + struct xe_bb *bb, u32 pt_offset, + dma_addr_t *sram_addr, u32 size) +{ + u16 pat_index = tile_to_xe(m->tile)->pat.idx[XE_CACHE_WB]; + u32 ptes; + int i = 0; + + ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE); + while (ptes) { + u32 chunk = min(0x1ffU, ptes); + + bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk); + bb->cs[bb->len++] = pt_offset; + bb->cs[bb->len++] = 0; + + pt_offset += chunk * 8; + ptes -= chunk; + + while (chunk--) { + u64 addr = sram_addr[i++] & PAGE_MASK; + + xe_tile_assert(m->tile, addr); + addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe, + addr, pat_index, + 0, false, 0); + bb->cs[bb->len++] = lower_32_bits(addr); + bb->cs[bb->len++] = upper_32_bits(addr); + } + } +} + +enum xe_migrate_copy_dir { + XE_MIGRATE_COPY_TO_VRAM, + XE_MIGRATE_COPY_TO_SRAM, +}; + +#define XE_CACHELINE_BYTES 64ull +#define XE_CACHELINE_MASK (XE_CACHELINE_BYTES - 1) + +static struct dma_fence *xe_migrate_vram(struct xe_migrate *m, + unsigned long len, + unsigned long sram_offset, + dma_addr_t *sram_addr, u64 vram_addr, + const enum xe_migrate_copy_dir dir) +{ + struct xe_gt *gt = m->tile->primary_gt; + struct xe_device *xe = gt_to_xe(gt); + bool use_usm_batch = xe->info.has_usm; + struct dma_fence *fence = NULL; + u32 batch_size = 2; + u64 src_L0_ofs, dst_L0_ofs; + struct xe_sched_job *job; + struct xe_bb *bb; + u32 update_idx, pt_slot = 0; + unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE); + unsigned int pitch = len >= PAGE_SIZE && !(len & ~PAGE_MASK) ? + PAGE_SIZE : 4; + int err; + + if (drm_WARN_ON(&xe->drm, (len & XE_CACHELINE_MASK) || + (sram_offset | vram_addr) & XE_CACHELINE_MASK)) + return ERR_PTR(-EOPNOTSUPP); + + xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER); + + batch_size += pte_update_cmd_size(len); + batch_size += EMIT_COPY_DW; + + bb = xe_bb_new(gt, batch_size, use_usm_batch); + if (IS_ERR(bb)) { + err = PTR_ERR(bb); + return ERR_PTR(err); + } + + build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE, + sram_addr, len + sram_offset); + + if (dir == XE_MIGRATE_COPY_TO_VRAM) { + src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset; + dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false); + + } else { + src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false); + dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset; + } + + bb->cs[bb->len++] = MI_BATCH_BUFFER_END; + update_idx = bb->len; + + emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, len, pitch); + + job = xe_bb_create_migration_job(m->q, bb, + xe_migrate_batch_base(m, use_usm_batch), + update_idx); + if (IS_ERR(job)) { + err = PTR_ERR(job); + goto err; + } + + xe_sched_job_add_migrate_flush(job, 0); + + mutex_lock(&m->job_mutex); + xe_sched_job_arm(job); + fence = dma_fence_get(&job->drm.s_fence->finished); + xe_sched_job_push(job); + + dma_fence_put(m->fence); + m->fence = dma_fence_get(fence); + mutex_unlock(&m->job_mutex); + + xe_bb_free(bb, fence); + + return fence; + +err: + xe_bb_free(bb, NULL); + + return ERR_PTR(err); +} + +/** + * xe_migrate_to_vram() - Migrate to VRAM + * @m: The migration context. + * @npages: Number of pages to migrate. + * @src_addr: Array of dma addresses (source of migrate) + * @dst_addr: Device physical address of VRAM (destination of migrate) + * + * Copy from an array dma addresses to a VRAM device physical address + * + * Return: dma fence for migrate to signal completion on succees, ERR_PTR on + * failure + */ +struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m, + unsigned long npages, + dma_addr_t *src_addr, + u64 dst_addr) +{ + return xe_migrate_vram(m, npages * PAGE_SIZE, 0, src_addr, dst_addr, + XE_MIGRATE_COPY_TO_VRAM); +} + +/** + * xe_migrate_from_vram() - Migrate from VRAM + * @m: The migration context. + * @npages: Number of pages to migrate. + * @src_addr: Device physical address of VRAM (source of migrate) + * @dst_addr: Array of dma addresses (destination of migrate) + * + * Copy from a VRAM device physical address to an array dma addresses + * + * Return: dma fence for migrate to signal completion on succees, ERR_PTR on + * failure + */ +struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m, + unsigned long npages, + u64 src_addr, + dma_addr_t *dst_addr) +{ + return xe_migrate_vram(m, npages * PAGE_SIZE, 0, dst_addr, src_addr, + XE_MIGRATE_COPY_TO_SRAM); +} + +static void xe_migrate_dma_unmap(struct xe_device *xe, dma_addr_t *dma_addr, + int len, int write) +{ + unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE); + + for (i = 0; i < npages; ++i) { + if (!dma_addr[i]) + break; + + dma_unmap_page(xe->drm.dev, dma_addr[i], PAGE_SIZE, + write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + } + kfree(dma_addr); +} + +static dma_addr_t *xe_migrate_dma_map(struct xe_device *xe, + void *buf, int len, int write) +{ + dma_addr_t *dma_addr; + unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE); + + dma_addr = kcalloc(npages, sizeof(*dma_addr), GFP_KERNEL); + if (!dma_addr) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < npages; ++i) { + dma_addr_t addr; + struct page *page; + + if (is_vmalloc_addr(buf)) + page = vmalloc_to_page(buf); + else + page = virt_to_page(buf); + + addr = dma_map_page(xe->drm.dev, + page, 0, PAGE_SIZE, + write ? DMA_TO_DEVICE : + DMA_FROM_DEVICE); + if (dma_mapping_error(xe->drm.dev, addr)) + goto err_fault; + + dma_addr[i] = addr; + buf += PAGE_SIZE; + } + + return dma_addr; + +err_fault: + xe_migrate_dma_unmap(xe, dma_addr, len, write); + return ERR_PTR(-EFAULT); +} + +/** + * xe_migrate_access_memory - Access memory of a BO via GPU + * + * @m: The migration context. + * @bo: buffer object + * @offset: access offset into buffer object + * @buf: pointer to caller memory to read into or write from + * @len: length of access + * @write: write access + * + * Access memory of a BO via GPU either reading in or writing from a passed in + * pointer. Pointer is dma mapped for GPU access and GPU commands are issued to + * read to or write from pointer. + * + * Returns: + * 0 if successful, negative error code on failure. + */ +int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo, + unsigned long offset, void *buf, int len, + int write) +{ + struct xe_tile *tile = m->tile; + struct xe_device *xe = tile_to_xe(tile); + struct xe_res_cursor cursor; + struct dma_fence *fence = NULL; + dma_addr_t *dma_addr; + unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK; + int bytes_left = len, current_page = 0; + void *orig_buf = buf; + + xe_bo_assert_held(bo); + + /* Use bounce buffer for small access and unaligned access */ + if (len & XE_CACHELINE_MASK || + ((uintptr_t)buf | offset) & XE_CACHELINE_MASK) { + int buf_offset = 0; + + /* + * Less than ideal for large unaligned access but this should be + * fairly rare, can fixup if this becomes common. + */ + do { + u8 bounce[XE_CACHELINE_BYTES]; + void *ptr = (void *)bounce; + int err; + int copy_bytes = min_t(int, bytes_left, + XE_CACHELINE_BYTES - + (offset & XE_CACHELINE_MASK)); + int ptr_offset = offset & XE_CACHELINE_MASK; + + err = xe_migrate_access_memory(m, bo, + offset & + ~XE_CACHELINE_MASK, + (void *)ptr, + sizeof(bounce), 0); + if (err) + return err; + + if (write) { + memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes); + + err = xe_migrate_access_memory(m, bo, + offset & ~XE_CACHELINE_MASK, + (void *)ptr, + sizeof(bounce), 0); + if (err) + return err; + } else { + memcpy(buf + buf_offset, ptr + ptr_offset, + copy_bytes); + } + + bytes_left -= copy_bytes; + buf_offset += copy_bytes; + offset += copy_bytes; + } while (bytes_left); + + return 0; + } + + dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write); + if (IS_ERR(dma_addr)) + return PTR_ERR(dma_addr); + + xe_res_first(bo->ttm.resource, offset, bo->size - offset, &cursor); + + do { + struct dma_fence *__fence; + u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) + + cursor.start; + int current_bytes; + + if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER) + current_bytes = min_t(int, bytes_left, + MAX_PREEMPTDISABLE_TRANSFER); + else + current_bytes = min_t(int, bytes_left, cursor.size); + + if (fence) + dma_fence_put(fence); + + __fence = xe_migrate_vram(m, current_bytes, + (unsigned long)buf & ~PAGE_MASK, + dma_addr + current_page, + vram_addr, write ? + XE_MIGRATE_COPY_TO_VRAM : + XE_MIGRATE_COPY_TO_SRAM); + if (IS_ERR(__fence)) { + if (fence) + dma_fence_wait(fence, false); + fence = __fence; + goto out_err; + } + fence = __fence; + + buf += current_bytes; + offset += current_bytes; + current_page = (int)(buf - orig_buf) / PAGE_SIZE; + bytes_left -= current_bytes; + if (bytes_left) + xe_res_next(&cursor, current_bytes); + } while (bytes_left); + + dma_fence_wait(fence, false); + dma_fence_put(fence); + +out_err: + xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write); + return IS_ERR(fence) ? PTR_ERR(fence) : 0; +} + #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) #include "tests/xe_migrate.c" #endif |