Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h            | 174
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm    |   7
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c                  |  33
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_crat.c                     |  51
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_device.c                   |  29
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c     |  48
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h     |   8
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c  |   4
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c                 |  40
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c              |   4
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_iommu.c                    |  10
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_module.c                   |   1
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_priv.h                     |  59
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_process.c                  | 193
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c               | 105
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h               |   3
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_topology.c                 |  53
17 files changed, 582 insertions, 240 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index 577d901fdb63..affbca7c0050 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -911,7 +911,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0x705d0000, 0x807c817c,
 	0x8070ff70, 0x00000080,
 	0xbf0a7b7c, 0xbf85fff8,
-	0xbf82014f, 0xbef4037e,
+	0xbf820151, 0xbef4037e,
 	0x8775ff7f, 0x0000ffff,
 	0x8875ff75, 0x00040000,
 	0xbef60380, 0xbef703ff,
@@ -1024,61 +1024,62 @@ static const uint32_t cwsr_trap_nv1x_hex[] = {
 	0xbe883108, 0xbe8a310a,
 	0xbe8c310c, 0xbe8e310e,
 	0xbf06807c, 0xbf84fff0,
-	0xb9782a05, 0x80788178,
-	0xbf0d9972, 0xbf850002,
-	0x8f788978, 0xbf820001,
-	0x8f788a78, 0xb96e1e06,
-	0x8f6e8a6e, 0x80786e78,
-	0x8078ff78, 0x00000200,
-	0xbef603ff, 0x01000000,
-	0xf4211bfa, 0xf0000000,
-	0x80788478, 0xf4211b3a,
+	0xba80f801, 0x00000000,
+	0xbf8a0000, 0xb9782a05,
+	0x80788178, 0xbf0d9972,
+	0xbf850002, 0x8f788978,
+	0xbf820001, 0x8f788a78,
+	0xb96e1e06, 0x8f6e8a6e,
+	0x80786e78, 0x8078ff78,
+	0x00000200, 0xbef603ff,
+	0x01000000, 0xf4211bfa,
 	0xf0000000, 0x80788478,
-	0xf4211b7a, 0xf0000000,
-	0x80788478, 0xf4211c3a,
+	0xf4211b3a, 0xf0000000,
+	0x80788478, 0xf4211b7a,
 	0xf0000000, 0x80788478,
-	0xf4211c7a, 0xf0000000,
-	0x80788478, 0xf4211eba,
+	0xf4211c3a, 0xf0000000,
+	0x80788478, 0xf4211c7a,
 	0xf0000000, 0x80788478,
-	0xf4211efa, 0xf0000000,
-	0x80788478, 0xf4211e7a,
+	0xf4211eba, 0xf0000000,
+	0x80788478, 0xf4211efa,
 	0xf0000000, 0x80788478,
-	0xf4211cfa, 0xf0000000,
-	0x80788478, 0xf4211bba,
+	0xf4211e7a, 0xf0000000,
+	0x80788478, 0xf4211cfa,
 	0xf0000000, 0x80788478,
-	0xbf8cc07f, 0xb9eef814,
 	0xf4211bba, 0xf0000000,
 	0x80788478, 0xbf8cc07f,
-	0xb9eef815, 0xbefc036f,
-	0xbefe0370, 0xbeff0371,
-	0x876f7bff, 0x000003ff,
-	0xb9ef4803, 0xb9f9f816,
-	0x876f7bff, 0xfffff800,
-	0x906f8b6f, 0xb9efa2c3,
-	0xb9f3f801, 0xb96e2a05,
-	0x806e816e, 0xbf0d9972,
-	0xbf850002, 0x8f6e896e,
-	0xbf820001, 0x8f6e8a6e,
-	0x806eff6e, 0x00000200,
-	0x806e746e, 0x826f8075,
-	0x876fff6f, 0x0000ffff,
-	0xf4091c37, 0xfa000050,
-	0xf4091d37, 0xfa000060,
-	0xf4011e77, 0xfa000074,
-	0xbf8cc07f, 0x876fff6d,
-	0xfc000000, 0x906f9a6f,
-	0x8f6f906f, 0xbeee0380,
+	0xb9eef814, 0xf4211bba,
+	0xf0000000, 0x80788478,
+	0xbf8cc07f, 0xb9eef815,
+	0xbefc036f, 0xbefe0370,
+	0xbeff0371, 0x876f7bff,
+	0x000003ff, 0xb9ef4803,
+	0xb9f9f816, 0x876f7bff,
+	0xfffff800, 0x906f8b6f,
+	0xb9efa2c3, 0xb9f3f801,
+	0xb96e2a05, 0x806e816e,
+	0xbf0d9972, 0xbf850002,
+	0x8f6e896e, 0xbf820001,
+	0x8f6e8a6e, 0x806eff6e,
+	0x00000200, 0x806e746e,
+	0x826f8075, 0x876fff6f,
+	0x0000ffff, 0xf4091c37,
+	0xfa000050, 0xf4091d37,
+	0xfa000060, 0xf4011e77,
+	0xfa000074, 0xbf8cc07f,
+	0x876fff6d, 0xfc000000,
+	0x906f9a6f, 0x8f6f906f,
+	0xbeee0380, 0x886e6f6e,
+	0x876fff6d, 0x02000000,
+	0x906f996f, 0x8f6f8f6f,
 	0x886e6f6e, 0x876fff6d,
-	0x02000000, 0x906f996f,
-	0x8f6f8f6f, 0x886e6f6e,
-	0x876fff6d, 0x01000000,
-	0x906f986f, 0x8f6f996f,
-	0x886e6f6e, 0x876fff7a,
-	0x00800000, 0x906f976f,
-	0xb9eef807, 0x876dff6d,
-	0x0000ffff, 0x87fe7e7e,
-	0x87ea6a6a, 0xb9faf802,
-	0xbf8a0000, 0xbe80226c,
+	0x01000000, 0x906f986f,
+	0x8f6f996f, 0x886e6f6e,
+	0x876fff7a, 0x00800000,
+	0x906f976f, 0xb9eef807,
+	0x876dff6d, 0x0000ffff,
+	0x87fe7e7e, 0x87ea6a6a,
+	0xb9faf802, 0xbe80226c,
 	0xbf810000, 0xbf9f0000,
 	0xbf9f0000, 0xbf9f0000,
 	0xbf9f0000, 0xbf9f0000,
@@ -1807,7 +1808,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
 	0x705d0000, 0x807c817c,
 	0x8070ff70, 0x00000080,
 	0xbf0a7b7c, 0xbf85fff8,
-	0xbf82013a, 0xbef4037e,
+	0xbf82013c, 0xbef4037e,
 	0x8775ff7f, 0x0000ffff,
 	0x8875ff75, 0x00040000,
 	0xbef60380, 0xbef703ff,
@@ -1920,50 +1921,51 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
 	0xbe883108, 0xbe8a310a,
 	0xbe8c310c, 0xbe8e310e,
 	0xbf06807c, 0xbf84fff0,
-	0xb9782a05, 0x80788178,
-	0xbf0d9972, 0xbf850002,
-	0x8f788978, 0xbf820001,
-	0x8f788a78, 0xb96e1e06,
-	0x8f6e8a6e, 0x80786e78,
-	0x8078ff78, 0x00000200,
-	0xbef603ff, 0x01000000,
-	0xf4211bfa, 0xf0000000,
-	0x80788478, 0xf4211b3a,
+	0xba80f801, 0x00000000,
+	0xbf8a0000, 0xb9782a05,
+	0x80788178, 0xbf0d9972,
+	0xbf850002, 0x8f788978,
+	0xbf820001, 0x8f788a78,
+	0xb96e1e06, 0x8f6e8a6e,
+	0x80786e78, 0x8078ff78,
+	0x00000200, 0xbef603ff,
+	0x01000000, 0xf4211bfa,
 	0xf0000000, 0x80788478,
-	0xf4211b7a, 0xf0000000,
-	0x80788478, 0xf4211c3a,
+	0xf4211b3a, 0xf0000000,
+	0x80788478, 0xf4211b7a,
 	0xf0000000, 0x80788478,
-	0xf4211c7a, 0xf0000000,
-	0x80788478, 0xf4211eba,
+	0xf4211c3a, 0xf0000000,
+	0x80788478, 0xf4211c7a,
 	0xf0000000, 0x80788478,
-	0xf4211efa, 0xf0000000,
-	0x80788478, 0xf4211e7a,
+	0xf4211eba, 0xf0000000,
+	0x80788478, 0xf4211efa,
 	0xf0000000, 0x80788478,
-	0xf4211cfa, 0xf0000000,
-	0x80788478, 0xf4211bba,
+	0xf4211e7a, 0xf0000000,
+	0x80788478, 0xf4211cfa,
 	0xf0000000, 0x80788478,
-	0xbf8cc07f, 0xb9eef814,
 	0xf4211bba, 0xf0000000,
 	0x80788478, 0xbf8cc07f,
-	0xb9eef815, 0xbefc036f,
-	0xbefe0370, 0xbeff0371,
-	0x876f7bff, 0x000003ff,
-	0xb9ef4803, 0x876f7bff,
-	0xfffff800, 0x906f8b6f,
-	0xb9efa2c3, 0xb9f3f801,
-	0xb96e2a05, 0x806e816e,
-	0xbf0d9972, 0xbf850002,
-	0x8f6e896e, 0xbf820001,
-	0x8f6e8a6e, 0x806eff6e,
-	0x00000200, 0x806e746e,
-	0x826f8075, 0x876fff6f,
-	0x0000ffff, 0xf4091c37,
-	0xfa000050, 0xf4091d37,
-	0xfa000060, 0xf4011e77,
-	0xfa000074, 0xbf8cc07f,
-	0x876dff6d, 0x0000ffff,
-	0x87fe7e7e, 0x87ea6a6a,
-	0xb9faf802, 0xbf8a0000,
+	0xb9eef814, 0xf4211bba,
+	0xf0000000, 0x80788478,
+	0xbf8cc07f, 0xb9eef815,
+	0xbefc036f, 0xbefe0370,
+	0xbeff0371, 0x876f7bff,
+	0x000003ff, 0xb9ef4803,
+	0x876f7bff, 0xfffff800,
+	0x906f8b6f, 0xb9efa2c3,
+	0xb9f3f801, 0xb96e2a05,
+	0x806e816e, 0xbf0d9972,
+	0xbf850002, 0x8f6e896e,
+	0xbf820001, 0x8f6e8a6e,
+	0x806eff6e, 0x00000200,
+	0x806e746e, 0x826f8075,
+	0x876fff6f, 0x0000ffff,
+	0xf4091c37, 0xfa000050,
+	0xf4091d37, 0xfa000060,
+	0xf4011e77, 0xfa000074,
+	0xbf8cc07f, 0x876dff6d,
+	0x0000ffff, 0x87fe7e7e,
+	0x87ea6a6a, 0xb9faf802,
 	0xbe80226c, 0xbf810000,
 	0xbf9f0000, 0xbf9f0000,
 	0xbf9f0000, 0xbf9f0000,
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
index 5b220f2a7501..5081f91190b8 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
@@ -894,6 +894,11 @@ L_RESTORE_SGPR:
 	s_cmp_eq_u32	m0, 0		//scc = (m0 < s_sgpr_save_num) ? 1 : 0
 	s_cbranch_scc0	L_RESTORE_SGPR_LOOP
 
+	// s_barrier with MODE.DEBUG_EN=1, STATUS.PRIV=1 incorrectly asserts debug exception.
+	// Clear DEBUG_EN before and restore MODE after the barrier.
+	s_setreg_imm32_b32	hwreg(HW_REG_MODE), 0
+	s_barrier	//barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG
+
 /* restore HW registers */
 L_RESTORE_HWREG:
 	// HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
@@ -976,8 +981,6 @@ L_RESTORE_HWREG:
 	s_and_b64	vcc, vcc, vcc	// Restore STATUS.VCCZ, not writable by s_setreg_b32
 	s_setreg_b32	hwreg(HW_REG_STATUS), s_restore_status	// SCC is included, which is changed by previous salu
 
-	s_barrier	//barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG
-
 	s_rfe_b64	s_restore_pc_lo	//Return to the main shader program and resume execution
 
 L_END_PGM:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index e9b96ad3d9a5..222f1df1a6b6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -97,6 +97,7 @@ void kfd_chardev_exit(void)
 	device_destroy(kfd_class, MKDEV(kfd_char_dev_major, 0));
 	class_destroy(kfd_class);
 	unregister_chrdev(kfd_char_dev_major, kfd_dev_name);
+	kfd_device = NULL;
 }
 
 struct device *kfd_chardev(void)
@@ -1254,7 +1255,7 @@ bool kfd_dev_is_large_bar(struct kfd_dev *dev)
 		return true;
 	}
 
-	if (dev->device_info->needs_iommu_device)
+	if (dev->use_iommu_v2)
 		return false;
 
 	amdgpu_amdkfd_get_local_mem_info(dev->kgd, &mem_info);
@@ -1290,18 +1291,6 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
 		return -EINVAL;
 	}
 
-	if (flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) {
-		if (args->size != kfd_doorbell_process_slice(dev))
-			return -EINVAL;
-		offset = kfd_get_process_doorbells(dev, p);
-	} else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
-		if (args->size != PAGE_SIZE)
-			return -EINVAL;
-		offset = amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd);
-		if (!offset)
-			return -ENOMEM;
-	}
-
 	mutex_lock(&p->mutex);
 
 	pdd = kfd_bind_process_to_device(dev, p);
@@ -1310,6 +1299,24 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
 		goto err_unlock;
 	}
 
+	if (flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) {
+		if (args->size != kfd_doorbell_process_slice(dev)) {
+			err = -EINVAL;
+			goto err_unlock;
+		}
+		offset = kfd_get_process_doorbells(pdd);
+	} else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
+		if (args->size != PAGE_SIZE) {
+			err = -EINVAL;
+			goto err_unlock;
+		}
+		offset = amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd);
+		if (!offset) {
+			err = -ENOMEM;
+			goto err_unlock;
+		}
+	}
+
 	err = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
 		dev->kgd, args->va_addr, args->size,
 		pdd->vm, (struct kgd_mem **) &mem, &offset,
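Editorial note: the kfd_chardev.c hunk above moves the doorbell/MMIO size checks under p->mutex and converts their early returns into the usual kernel lock-then-goto-err_unlock cleanup pattern. A minimal standalone C sketch of that pattern, using a pthread mutex in place of the kernel mutex (alloc_resource and its size check are made-up stand-ins):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Every failure path taken while the lock is held funnels through
 * err_unlock, so the mutex is always released exactly once. */
static int alloc_resource(size_t size)
{
	int err = 0;

	pthread_mutex_lock(&lock);

	if (size == 0) {		/* stand-in for the size checks above */
		err = -EINVAL;
		goto err_unlock;
	}

	/* ... allocation work that must happen under the lock ... */

err_unlock:
	pthread_mutex_unlock(&lock);
	return err;
}

int main(void)
{
	printf("alloc_resource(0) = %d\n", alloc_resource(0));
	return 0;
}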
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 6a250f8fcfb8..d2981524dba0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -742,6 +742,22 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
 	return 0;
 }
 
+static bool kfd_ignore_crat(void)
+{
+	bool ret;
+
+	if (ignore_crat)
+		return true;
+
+#ifndef KFD_SUPPORT_IOMMU_V2
+	ret = true;
+#else
+	ret = false;
+#endif
+
+	return ret;
+}
+
 /*
  * kfd_create_crat_image_acpi - Allocates memory for CRAT image and
  * copies CRAT from ACPI (if available).
@@ -776,12 +792,13 @@ int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
 		return -EINVAL;
 	}
 
-	if (ignore_crat) {
+	if (kfd_ignore_crat()) {
 		pr_info("CRAT table disabled by module option\n");
 		return -ENODATA;
 	}
 
-	pcrat_image = kmemdup(crat_table, crat_table->length, GFP_KERNEL);
+	pcrat_image = kvmalloc(crat_table->length, GFP_KERNEL);
+	memcpy(pcrat_image, crat_table, crat_table->length);
 	if (!pcrat_image)
 		return -ENOMEM;
 
@@ -793,11 +810,10 @@ int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
 
 /* Memory required to create Virtual CRAT.
  * Since there is no easy way to predict the amount of memory required, the
- * following amount are allocated for CPU and GPU Virtual CRAT. This is
+ * following amount is allocated for GPU Virtual CRAT. This is
  * expected to cover all known conditions. But to be safe additional check
  * is put in the code to ensure we don't overwrite.
  */
-#define VCRAT_SIZE_FOR_CPU	(2 * PAGE_SIZE)
 #define VCRAT_SIZE_FOR_GPU	(4 * PAGE_SIZE)
 
 /* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node
@@ -948,7 +964,7 @@ static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size)
 #endif
 	int ret = 0;
 
-	if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_CPU)
+	if (!pcrat_image)
 		return -EINVAL;
 
 	/* Fill in CRAT Header.
@@ -1348,30 +1364,37 @@ int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
 				  uint32_t proximity_domain)
{
 	void *pcrat_image = NULL;
-	int ret = 0;
+	int ret = 0, num_nodes;
+	size_t dyn_size;
 
 	if (!crat_image)
 		return -EINVAL;
 
 	*crat_image = NULL;
 
-	/* Allocate one VCRAT_SIZE_FOR_CPU for CPU virtual CRAT image and
-	 * VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image. This should cover
-	 * all the current conditions. A check is put not to overwrite beyond
-	 * allocated size
+	/* Allocate the CPU Virtual CRAT size based on the number of online
+	 * nodes. Allocate VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image.
+	 * This should cover all the current conditions. A check is put not
+	 * to overwrite beyond allocated size for GPUs
 	 */
 	switch (flags) {
 	case COMPUTE_UNIT_CPU:
-		pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL);
+		num_nodes = num_online_nodes();
+		dyn_size = sizeof(struct crat_header) +
+			num_nodes * (sizeof(struct crat_subtype_computeunit) +
+			sizeof(struct crat_subtype_memory) +
+			(num_nodes - 1) * sizeof(struct crat_subtype_iolink));
+		pcrat_image = kvmalloc(dyn_size, GFP_KERNEL);
 		if (!pcrat_image)
 			return -ENOMEM;
-		*size = VCRAT_SIZE_FOR_CPU;
+		*size = dyn_size;
+		pr_debug("CRAT size is %ld", dyn_size);
 		ret = kfd_create_vcrat_image_cpu(pcrat_image, size);
 		break;
 	case COMPUTE_UNIT_GPU:
 		if (!kdev)
 			return -EINVAL;
-		pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL);
+		pcrat_image = kvmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL);
 		if (!pcrat_image)
 			return -ENOMEM;
 		*size = VCRAT_SIZE_FOR_GPU;
@@ -1390,7 +1413,7 @@ int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
 	if (!ret)
 		*crat_image = pcrat_image;
 	else
-		kfree(pcrat_image);
+		kvfree(pcrat_image);
 
 	return ret;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 4bfedaab183f..903170e59342 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -29,6 +29,7 @@
 #include "cwsr_trap_handler.h"
 #include "kfd_iommu.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 #define MQD_SIZE_ALIGNED 768
 
@@ -115,6 +116,7 @@ static const struct kfd_device_info carrizo_device_info = {
 	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
+#endif
 
 static const struct kfd_device_info raven_device_info = {
 	.asic_family = CHIP_RAVEN,
@@ -133,7 +135,6 @@ static const struct kfd_device_info raven_device_info = {
 	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
-#endif
 
 static const struct kfd_device_info hawaii_device_info = {
 	.asic_family = CHIP_HAWAII,
@@ -502,8 +503,8 @@ static const struct kfd_device_info *kfd_supported_devices[][2] = {
 #ifdef KFD_SUPPORT_IOMMU_V2
 	[CHIP_KAVERI] = {&kaveri_device_info, NULL},
 	[CHIP_CARRIZO] = {&carrizo_device_info, NULL},
-	[CHIP_RAVEN] = {&raven_device_info, NULL},
 #endif
+	[CHIP_RAVEN] = {&raven_device_info, NULL},
 	[CHIP_HAWAII] = {&hawaii_device_info, NULL},
 	[CHIP_TONGA] = {&tonga_device_info, NULL},
 	[CHIP_FIJI] = {&fiji_device_info, &fiji_vf_device_info},
@@ -582,6 +583,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
 
 	atomic_set(&kfd->sram_ecc_flag, 0);
 
+	ida_init(&kfd->doorbell_ida);
+
 	return kfd;
 }
 
@@ -711,11 +714,11 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 		goto kfd_doorbell_error;
 	}
 
-	if (kfd->kfd2kgd->get_hive_id)
-		kfd->hive_id = kfd->kfd2kgd->get_hive_id(kfd->kgd);
+	kfd->hive_id = amdgpu_amdkfd_get_hive_id(kfd->kgd);
+
+	kfd->unique_id = amdgpu_amdkfd_get_unique_id(kfd->kgd);
 
-	if (kfd->kfd2kgd->get_unique_id)
-		kfd->unique_id = kfd->kfd2kgd->get_unique_id(kfd->kgd);
+	kfd->noretry = amdgpu_amdkfd_get_noretry(kfd->kgd);
 
 	if (kfd_interrupt_init(kfd)) {
 		dev_err(kfd_device, "Error initializing interrupts\n");
@@ -737,6 +740,9 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 		goto gws_error;
 	}
 
+	/* If CRAT is broken, won't set iommu enabled */
+	kfd_double_confirm_iommu_support(kfd);
+
 	if (kfd_iommu_device_init(kfd)) {
 		dev_err(kfd_device, "Error initializing iommuv2\n");
 		goto device_iommu_error;
@@ -796,6 +802,7 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
 		kfd_interrupt_exit(kfd);
 		kfd_topology_remove_device(kfd);
 		kfd_doorbell_fini(kfd);
+		ida_destroy(&kfd->doorbell_ida);
 		kfd_gtt_sa_fini(kfd);
 		amdgpu_amdkfd_free_gtt_mem(kfd->kgd, kfd->gtt_mem);
 		if (kfd->gws)
@@ -810,6 +817,8 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
 	if (!kfd->init_complete)
 		return 0;
 
+	kfd_smi_event_update_gpu_reset(kfd, false);
+
 	kfd->dqm->ops.pre_reset(kfd->dqm);
 
 	kgd2kfd_suspend(kfd, false);
@@ -838,6 +847,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
 
 	atomic_set(&kfd->sram_ecc_flag, 0);
 
+	kfd_smi_event_update_gpu_reset(kfd, true);
+
 	return 0;
 }
 
@@ -1245,6 +1256,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd)
 	WARN_ONCE(count < 0, "Compute profile ref. count error");
 }
 
+void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
+{
+	if (kfd)
+		kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 /* This function will send a package to HIQ to hang the HWS
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index a8d316711625..c0ae04a08625 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -153,30 +153,6 @@ static void decrement_queue_count(struct device_queue_manager *dqm,
 		dqm->active_cp_queue_count--;
 }
 
-int read_sdma_queue_counter(uint64_t q_rptr, uint64_t *val)
-{
-	int ret;
-	uint64_t tmp = 0;
-
-	if (!val)
-		return -EINVAL;
-	/*
-	 * SDMA activity counter is stored at queue's RPTR + 0x8 location.
-	 */
-	if (!access_ok((const void __user *)(q_rptr +
-			sizeof(uint64_t)), sizeof(uint64_t))) {
-		pr_err("Can't access sdma queue activity counter\n");
-		return -EFAULT;
-	}
-
-	ret = get_user(tmp, (uint64_t *)(q_rptr + sizeof(uint64_t)));
-	if (!ret) {
-		*val = tmp;
-	}
-
-	return ret;
-}
-
 static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q)
 {
 	struct kfd_dev *dev = qpd->dqm->dev;
@@ -215,9 +191,8 @@ static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q)
 	}
 
 	q->properties.doorbell_off =
-		kfd_get_doorbell_dw_offset_in_bar(dev, q->process,
+		kfd_get_doorbell_dw_offset_in_bar(dev, qpd_to_pdd(qpd),
 					  q->doorbell_id);
-
 	return 0;
 }
 
@@ -552,7 +527,7 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
 	/* Get the SDMA queue stats */
 	if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
 	    (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
-		retval = read_sdma_queue_counter((uint64_t)q->properties.read_ptr,
+		retval = read_sdma_queue_counter((uint64_t __user *)q->properties.read_ptr,
 						 &sdma_val);
 		if (retval)
 			pr_err("Failed to read SDMA queue counter for queue: %d\n",
@@ -674,9 +649,10 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
 		goto out;
 
 	pdd = qpd_to_pdd(qpd);
-	pr_info_ratelimited("Evicting PASID 0x%x queues\n",
+	pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
 			    pdd->process->pasid);
 
+	pdd->last_evict_timestamp = get_jiffies_64();
 	/* Mark all queues as evicted. Deactivate all active queues on
 	 * the qpd.
 	 */
@@ -724,7 +700,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 		goto out;
 
 	pdd = qpd_to_pdd(qpd);
-	pr_info_ratelimited("Evicting PASID 0x%x queues\n",
+	pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
 			    pdd->process->pasid);
 
 	/* Mark all queues as evicted. Deactivate all active queues on
@@ -738,6 +714,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 		q->properties.is_active = false;
 		decrement_queue_count(dqm, q->properties.type);
 	}
+	pdd->last_evict_timestamp = get_jiffies_64();
 	retval = execute_queues_cpsch(dqm,
 				      qpd->is_debug ?
 				      KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
@@ -756,6 +733,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
 	struct mqd_manager *mqd_mgr;
 	struct kfd_process_device *pdd;
 	uint64_t pd_base;
+	uint64_t eviction_duration;
 	int retval, ret = 0;
 
 	pdd = qpd_to_pdd(qpd);
@@ -770,7 +748,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
 		goto out;
 	}
 
-	pr_info_ratelimited("Restoring PASID 0x%x queues\n",
+	pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
 			    pdd->process->pasid);
 
 	/* Update PD Base in QPD */
@@ -823,6 +801,8 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
 		ret = retval;
 	}
 	qpd->evicted = 0;
+	eviction_duration = get_jiffies_64() - pdd->last_evict_timestamp;
+	atomic64_add(eviction_duration, &pdd->evict_duration_counter);
 out:
 	if (mm)
 		mmput(mm);
@@ -836,6 +816,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 	struct queue *q;
 	struct kfd_process_device *pdd;
 	uint64_t pd_base;
+	uint64_t eviction_duration;
 	int retval = 0;
 
 	pdd = qpd_to_pdd(qpd);
@@ -850,7 +831,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 		goto out;
 	}
 
-	pr_info_ratelimited("Restoring PASID 0x%x queues\n",
+	pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
 			    pdd->process->pasid);
 
 	/* Update PD Base in QPD */
@@ -869,6 +850,8 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 	retval = execute_queues_cpsch(dqm,
 				      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
 	qpd->evicted = 0;
+	eviction_duration = get_jiffies_64() - pdd->last_evict_timestamp;
+	atomic64_add(eviction_duration, &pdd->evict_duration_counter);
 out:
 	dqm_unlock(dqm);
 	return retval;
@@ -1475,7 +1458,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 	/* Get the SDMA queue stats */
 	if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
 	    (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
-		retval = read_sdma_queue_counter((uint64_t)q->properties.read_ptr,
+		retval = read_sdma_queue_counter((uint64_t __user *)q->properties.read_ptr,
 						 &sdma_val);
 		if (retval)
 			pr_err("Failed to read SDMA queue counter for queue: %d\n",
@@ -1989,6 +1972,7 @@ int kfd_process_vm_fault(struct device_queue_manager *dqm, u32 pasid)
 	if (!p)
 		return -EINVAL;
 
+	WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
 	pdd = kfd_get_process_device_data(dqm->dev, p);
 	if (pdd)
 		ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
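Editorial note: the eviction-tracking changes above stamp pdd->last_evict_timestamp when queues are evicted and, on restore, add the elapsed jiffies into the atomic evict_duration_counter that the new evicted_ms sysfs file reports. A standalone userspace analogue of that accumulate-on-restore pattern, substituting CLOCK_MONOTONIC for jiffies (on_evict/on_restore are made-up stand-ins):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static _Atomic uint64_t evict_duration_ms;	/* like evict_duration_counter */
static uint64_t last_evict_ms;			/* like last_evict_timestamp */

static uint64_t now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
}

static void on_evict(void)   { last_evict_ms = now_ms(); }
static void on_restore(void) { atomic_fetch_add(&evict_duration_ms,
						now_ms() - last_evict_ms); }

int main(void)
{
	on_evict();
	usleep(50 * 1000);	/* queues stay evicted for ~50 ms */
	on_restore();
	printf("evicted for ~%llu ms total\n",
	       (unsigned long long)atomic_load(&evict_duration_ms));
	return 0;
}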
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 49d8e324c636..16262e5d93f5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -251,5 +251,11 @@ static inline void dqm_unlock(struct device_queue_manager *dqm)
 	mutex_unlock(&dqm->lock_hidden);
 }
 
-int read_sdma_queue_counter(uint64_t q_rptr, uint64_t *val);
+static inline int read_sdma_queue_counter(uint64_t __user *q_rptr, uint64_t *val)
+{
+	/*
+	 * SDMA activity counter is stored at queue's RPTR + 0x8 location.
+	 */
+	return get_user(*val, q_rptr + 1);
+}
 #endif /* KFD_DEVICE_QUEUE_MANAGER_H_ */
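Editorial note: the simplified helper relies on typed pointer arithmetic: since q_rptr is a uint64_t pointer, q_rptr + 1 advances by sizeof(uint64_t) == 8 bytes, i.e. exactly the RPTR + 0x8 location the comment describes, and get_user() performs the access check the old open-coded access_ok() did. A self-contained C demonstration of the pointer arithmetic alone:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Two consecutive 64-bit words, as in the SDMA queue layout:
	 * RPTR at offset 0x0, activity counter at offset 0x8. */
	uint64_t q[2] = { 0x1234, 0xdeadbeef };
	uint64_t *q_rptr = &q[0];

	/* q_rptr + 1 advances by sizeof(uint64_t) == 8 bytes. */
	printf("counter @ +0x%zx = 0x%llx\n",
	       (size_t)((char *)(q_rptr + 1) - (char *)q_rptr),
	       (unsigned long long)*(q_rptr + 1));
	return 0;
}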
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
index 95a82ac455f2..eca6331efa94 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
@@ -61,8 +61,8 @@ static int update_qpd_v9(struct device_queue_manager *dqm,
 		qpd->sh_mem_config = SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
 			SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;
 
-		if (amdgpu_noretry &&
-		    !dqm->dev->device_info->needs_iommu_device)
+		if (dqm->dev->noretry &&
+		    !dqm->dev->use_iommu_v2)
 			qpd->sh_mem_config |=
 				1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
index 8e0c00b9555e..768d153acff4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
@@ -31,9 +31,6 @@
  * kernel queues using the first doorbell page reserved for the kernel.
  */
 
-static DEFINE_IDA(doorbell_ida);
-static unsigned int max_doorbell_slices;
-
 /*
  * Each device exposes a doorbell aperture, a PCI MMIO aperture that
  * receives 32-bit writes that are passed to queues as wptr values.
@@ -84,9 +81,9 @@ int kfd_doorbell_init(struct kfd_dev *kfd)
 	else
 		return -ENOSPC;
 
-	if (!max_doorbell_slices ||
-	    doorbell_process_limit < max_doorbell_slices)
-		max_doorbell_slices = doorbell_process_limit;
+	if (!kfd->max_doorbell_slices ||
+	    doorbell_process_limit < kfd->max_doorbell_slices)
+		kfd->max_doorbell_slices = doorbell_process_limit;
 
 	kfd->doorbell_base = kfd->shared_resources.doorbell_physical_address +
 				doorbell_start_offset;
@@ -130,6 +127,7 @@ int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process,
 		      struct vm_area_struct *vma)
{
 	phys_addr_t address;
+	struct kfd_process_device *pdd;
 
 	/*
 	 * For simplicitly we only allow mapping of the entire doorbell
@@ -138,9 +136,12 @@ int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process,
 	if (vma->vm_end - vma->vm_start != kfd_doorbell_process_slice(dev))
 		return -EINVAL;
 
-	/* Calculate physical address of doorbell */
-	address = kfd_get_process_doorbells(dev, process);
+	pdd = kfd_get_process_device_data(dev, process);
+	if (!pdd)
+		return -EINVAL;
 
+	/* Calculate physical address of doorbell */
+	address = kfd_get_process_doorbells(pdd);
 	vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE |
 				VM_DONTDUMP | VM_PFNMAP;
 
@@ -226,7 +227,7 @@ void write_kernel_doorbell64(void __iomem *db, u64 value)
 }
 
 unsigned int kfd_get_doorbell_dw_offset_in_bar(struct kfd_dev *kfd,
-					struct kfd_process *process,
+					struct kfd_process_device *pdd,
 					unsigned int doorbell_id)
{
 	/*
@@ -236,7 +237,7 @@ unsigned int kfd_get_doorbell_dw_offset_in_bar(struct kfd_dev *kfd,
 	 * units regardless of the ASIC-dependent doorbell size.
 	 */
 	return kfd->doorbell_base_dw_offset +
-		process->doorbell_index
+		pdd->doorbell_index
 		* kfd_doorbell_process_slice(kfd) / sizeof(u32) +
 		doorbell_id * kfd->device_info->doorbell_size / sizeof(u32);
}
@@ -251,25 +252,24 @@ uint64_t kfd_get_number_elems(struct kfd_dev *kfd)
 
}
 
-phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev,
-					struct kfd_process *process)
+phys_addr_t kfd_get_process_doorbells(struct kfd_process_device *pdd)
{
-	return dev->doorbell_base +
-		process->doorbell_index * kfd_doorbell_process_slice(dev);
+	return pdd->dev->doorbell_base +
+		pdd->doorbell_index * kfd_doorbell_process_slice(pdd->dev);
}
 
-int kfd_alloc_process_doorbells(struct kfd_process *process)
+int kfd_alloc_process_doorbells(struct kfd_dev *kfd, unsigned int *doorbell_index)
{
-	int r = ida_simple_get(&doorbell_ida, 1, max_doorbell_slices,
+	int r = ida_simple_get(&kfd->doorbell_ida, 1, kfd->max_doorbell_slices,
 				GFP_KERNEL);
 	if (r > 0)
-		process->doorbell_index = r;
+		*doorbell_index = r;
 
 	return r;
}
 
-void kfd_free_process_doorbells(struct kfd_process *process)
+void kfd_free_process_doorbells(struct kfd_dev *kfd, unsigned int doorbell_index)
{
-	if (process->doorbell_index)
-		ida_simple_remove(&doorbell_ida, process->doorbell_index);
+	if (doorbell_index)
+		ida_simple_remove(&kfd->doorbell_ida, doorbell_index);
}
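Editorial note: with the doorbell IDA now per-device, each process gets a per-device slice index and each doorbell within the slice an id; the BAR offset is the sum of three dword-scaled terms, as in kfd_get_doorbell_dw_offset_in_bar() above. A standalone C sketch of that arithmetic (all three constants are invented example values, not real ASIC numbers):

#include <stdint.h>
#include <stdio.h>

#define DOORBELL_BASE_DW_OFFSET	1024U	/* like kfd->doorbell_base_dw_offset */
#define PROCESS_SLICE_BYTES	8192U	/* like kfd_doorbell_process_slice() */
#define DOORBELL_SIZE_BYTES	8U	/* like device_info->doorbell_size */

/* Offset of one doorbell in the BAR, in 32-bit dword units. */
static unsigned int dw_offset(unsigned int doorbell_index,
			      unsigned int doorbell_id)
{
	return DOORBELL_BASE_DW_OFFSET +
	       doorbell_index * PROCESS_SLICE_BYTES / sizeof(uint32_t) +
	       doorbell_id * DOORBELL_SIZE_BYTES / sizeof(uint32_t);
}

int main(void)
{
	/* Process slice 1 (index allocated from the per-device IDA),
	 * doorbell 3 inside that slice: 1024 + 2048 + 6 = 3078. */
	printf("dw offset = %u\n", dw_offset(1, 3));
	return 0;
}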
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
index c1166c40ac15..3c22909470f2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
@@ -321,7 +321,7 @@ static void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id)
 	pdd->lds_base = MAKE_LDS_APP_BASE_VI();
 	pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
 
-	if (!pdd->dev->device_info->needs_iommu_device) {
+	if (!pdd->dev->use_iommu_v2) {
 		/* dGPUs: SVM aperture starting at 0
 		 * with small reserved space for kernel.
 		 * Set them to CANONICAL addresses.
@@ -425,7 +425,7 @@ int kfd_init_apertures(struct kfd_process *process)
 			return -EINVAL;
 		}
 
-		if (!dev->device_info->needs_iommu_device) {
+		if (!dev->use_iommu_v2) {
 			/* dGPUs: the reserved space for kernel
 			 * before SVM
 			 */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c
index e8ef3886688b..66bbca61e3ef 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c
@@ -41,7 +41,7 @@ int kfd_iommu_check_device(struct kfd_dev *kfd)
 	struct amd_iommu_device_info iommu_info;
 	int err;
 
-	if (!kfd->device_info->needs_iommu_device)
+	if (!kfd->use_iommu_v2)
 		return -ENODEV;
 
 	iommu_info.flags = 0;
@@ -63,7 +63,7 @@ int kfd_iommu_device_init(struct kfd_dev *kfd)
 	unsigned int pasid_limit;
 	int err;
 
-	if (!kfd->device_info->needs_iommu_device)
+	if (!kfd->use_iommu_v2)
 		return 0;
 
 	iommu_info.flags = 0;
@@ -109,7 +109,7 @@ int kfd_iommu_bind_process_to_device(struct kfd_process_device *pdd)
 	struct kfd_process *p = pdd->process;
 	int err;
 
-	if (!dev->device_info->needs_iommu_device || pdd->bound == PDD_BOUND)
+	if (!dev->use_iommu_v2 || pdd->bound == PDD_BOUND)
 		return 0;
 
 	if (unlikely(pdd->bound == PDD_BOUND_SUSPENDED)) {
@@ -284,7 +284,7 @@ static void kfd_unbind_processes_from_device(struct kfd_dev *kfd)
  */
 void kfd_iommu_suspend(struct kfd_dev *kfd)
{
-	if (!kfd->device_info->needs_iommu_device)
+	if (!kfd->use_iommu_v2)
 		return;
 
 	kfd_unbind_processes_from_device(kfd);
@@ -304,7 +304,7 @@ int kfd_iommu_resume(struct kfd_dev *kfd)
 	unsigned int pasid_limit;
 	int err;
 
-	if (!kfd->device_info->needs_iommu_device)
+	if (!kfd->use_iommu_v2)
 		return 0;
 
 	pasid_limit = kfd_get_pasid_limit();
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
index f4b7f7e6c40e..5e90fe642192 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@@ -70,6 +70,7 @@ err_create_wq:
 err_topology:
 	kfd_chardev_exit();
 err_ioctl:
+	pr_err("KFD is disabled due to module initialization failure\n");
 	return err;
}
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 922ae138ab85..c77cf23032ac 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -297,6 +297,9 @@ struct kfd_dev {
 
 	bool pci_atomic_requested;
 
+	/* Use IOMMU v2 flag */
+	bool use_iommu_v2;
+
 	/* SRAM ECC flag */
 	atomic_t sram_ecc_flag;
 
@@ -309,6 +312,13 @@ struct kfd_dev {
 	/* Clients watching SMI events */
 	struct list_head smi_clients;
 	spinlock_t smi_lock;
+
+	uint32_t reset_seq_num;
+
+	struct ida doorbell_ida;
+	unsigned int max_doorbell_slices;
+
+	int noretry;
};
 
 enum kfd_mempool {
@@ -626,7 +636,7 @@ enum kfd_pdd_bound {
 	PDD_BOUND_SUSPENDED,
};
 
-#define MAX_SYSFS_FILENAME_LEN 11
+#define MAX_SYSFS_FILENAME_LEN 15
 
 /*
  * SDMA counter runs at 100MHz frequency.
@@ -687,6 +697,39 @@ struct kfd_process_device {
 	uint64_t sdma_past_activity_counter;
 	struct attribute attr_sdma;
 	char sdma_filename[MAX_SYSFS_FILENAME_LEN];
+
+	/* Eviction activity tracking */
+	uint64_t last_evict_timestamp;
+	atomic64_t evict_duration_counter;
+	struct attribute attr_evict;
+
+	struct kobject *kobj_stats;
+	unsigned int doorbell_index;
+
+	/*
+	 * @cu_occupancy: Reports occupancy of Compute Units (CU) of a process
+	 * that is associated with device encoded by "this" struct instance. The
+	 * value reflects CU usage by all of the waves launched by this process
+	 * on this device. A very important property of occupancy parameter is
+	 * that its value is a snapshot of current use.
+	 *
+	 * Following is to be noted regarding how this parameter is reported:
+	 *
+	 *  The number of waves that a CU can launch is limited by couple of
+	 *  parameters. These are encoded by struct amdgpu_cu_info instance
+	 *  that is part of every device definition. For GFX9 devices this
+	 *  translates to 40 waves (simd_per_cu * max_waves_per_simd) when waves
+	 *  do not use scratch memory and 32 waves (max_scratch_slots_per_cu)
+	 *  when they do use scratch memory. This could change for future
+	 *  devices and therefore this example should be considered as a guide.
+	 *
+	 *  All CU's of a device are available for the process. This may not be true
+	 *  under certain conditions - e.g. CU masking.
+	 *
+	 *  Finally number of CU's that are occupied by a process is affected by both
+	 *  number of CU's a device has along with number of other competing processes
+	 */
+	struct attribute attr_cu_occupancy;
};
 
 #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd)
@@ -724,7 +767,6 @@ struct kfd_process {
 	struct mmu_notifier mmu_notifier;
 
 	u32 pasid;
-	unsigned int doorbell_index;
 
 	/*
 	 * List of kfd_process_device structures,
@@ -857,13 +899,13 @@ u32 read_kernel_doorbell(u32 __iomem *db);
 void write_kernel_doorbell(void __iomem *db, u32 value);
 void write_kernel_doorbell64(void __iomem *db, u64 value);
 unsigned int kfd_get_doorbell_dw_offset_in_bar(struct kfd_dev *kfd,
-					struct kfd_process *process,
+					struct kfd_process_device *pdd,
 					unsigned int doorbell_id);
-phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev,
-					struct kfd_process *process);
-int kfd_alloc_process_doorbells(struct kfd_process *process);
-void kfd_free_process_doorbells(struct kfd_process *process);
-
+phys_addr_t kfd_get_process_doorbells(struct kfd_process_device *pdd);
+int kfd_alloc_process_doorbells(struct kfd_dev *kfd,
+				unsigned int *doorbell_index);
+void kfd_free_process_doorbells(struct kfd_dev *kfd,
+				unsigned int doorbell_index);
 /* GTT Sub-Allocator */
 
 int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size,
@@ -892,6 +934,7 @@ struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev);
 struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd);
 int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev);
 int kfd_numa_node_to_apic_id(int numa_node_id);
+void kfd_double_confirm_iommu_support(struct kfd_dev *gpu);
 
 /* Interrupts */
 int kfd_interrupt_init(struct kfd_dev *dev);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 627793029033..65803e153a22 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -87,7 +87,7 @@ struct kfd_sdma_activity_handler_workarea {
};
 
 struct temp_sdma_queue_list {
-	uint64_t rptr;
+	uint64_t __user *rptr;
 	uint64_t sdma_val;
 	unsigned int queue_id;
 	struct list_head list;
@@ -159,7 +159,7 @@ static void kfd_sdma_activity_worker(struct work_struct *work)
 		}
 
 		INIT_LIST_HEAD(&sdma_q->list);
-		sdma_q->rptr = (uint64_t)q->properties.read_ptr;
+		sdma_q->rptr = (uint64_t __user *)q->properties.read_ptr;
 		sdma_q->queue_id = q->properties.queue_id;
 		list_add_tail(&sdma_q->list, &sdma_q_list.list);
 	}
@@ -218,7 +218,7 @@ static void kfd_sdma_activity_worker(struct work_struct *work)
 			continue;
 
 		list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) {
-			if (((uint64_t)q->properties.read_ptr == sdma_q->rptr) &&
+			if (((uint64_t __user *)q->properties.read_ptr == sdma_q->rptr) &&
 			    (sdma_q->queue_id == q->properties.queue_id)) {
 				list_del(&sdma_q->list);
 				kfree(sdma_q);
@@ -249,6 +249,52 @@ cleanup:
 	}
}
 
+/**
+ * @kfd_get_cu_occupancy() - Collect number of waves in-flight on this device
+ * by current process. Translates acquired wave count into number of compute units
+ * that are occupied.
+ *
+ * @atr: Handle of attribute that allows reporting of wave count. The attribute
+ * handle encapsulates GPU device it is associated with, thereby allowing collection
+ * of waves in flight, etc
+ *
+ * @buffer: Handle of user provided buffer updated with wave count
+ *
+ * Return: Number of bytes written to user buffer or an error value
+ */
+static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
+{
+	int cu_cnt;
+	int wave_cnt;
+	int max_waves_per_cu;
+	struct kfd_dev *dev = NULL;
+	struct kfd_process *proc = NULL;
+	struct kfd_process_device *pdd = NULL;
+
+	pdd = container_of(attr, struct kfd_process_device, attr_cu_occupancy);
+	dev = pdd->dev;
+	if (dev->kfd2kgd->get_cu_occupancy == NULL)
+		return -EINVAL;
+
+	cu_cnt = 0;
+	proc = pdd->process;
+	if (pdd->qpd.queue_count == 0) {
+		pr_debug("Gpu-Id: %d has no active queues for process %d\n",
+			 dev->id, proc->pasid);
+		return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt);
+	}
+
+	/* Collect wave count from device if it supports */
+	wave_cnt = 0;
+	max_waves_per_cu = 0;
+	dev->kfd2kgd->get_cu_occupancy(dev->kgd, proc->pasid, &wave_cnt,
+			&max_waves_per_cu);
+
+	/* Translate wave count to number of compute units */
+	cu_cnt = (wave_cnt + (max_waves_per_cu - 1)) / max_waves_per_cu;
+	return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt);
+}
+
 static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr,
 			       char *buffer)
{
@@ -270,6 +316,7 @@ static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr,
 				kfd_sdma_activity_worker);
 
 		sdma_activity_work_handler.pdd = pdd;
+		sdma_activity_work_handler.sdma_activity_counter = 0;
 
 		schedule_work(&sdma_activity_work_handler.sdma_activity_work);
 
@@ -344,6 +391,32 @@ static ssize_t kfd_procfs_queue_show(struct kobject *kobj,
 	return 0;
}
 
+static ssize_t kfd_procfs_stats_show(struct kobject *kobj,
+				     struct attribute *attr, char *buffer)
+{
+	if (strcmp(attr->name, "evicted_ms") == 0) {
+		struct kfd_process_device *pdd = container_of(attr,
+				struct kfd_process_device,
+				attr_evict);
+		uint64_t evict_jiffies;
+
+		evict_jiffies = atomic64_read(&pdd->evict_duration_counter);
+
+		return snprintf(buffer,
+				PAGE_SIZE,
+				"%llu\n",
+				jiffies64_to_msecs(evict_jiffies));
+
+	/* Sysfs handle that gets CU occupancy is per device */
+	} else if (strcmp(attr->name, "cu_occupancy") == 0) {
+		return kfd_get_cu_occupancy(attr, buffer);
+	} else {
+		pr_err("Invalid attribute");
+	}
+
+	return 0;
+}
+
 static struct attribute attr_queue_size = {
 	.name = "size",
 	.mode = KFD_SYSFS_FILE_MODE
@@ -375,6 +448,19 @@ static struct kobj_type procfs_queue_type = {
 	.default_attrs = procfs_queue_attrs,
};
 
+static const struct sysfs_ops procfs_stats_ops = {
+	.show = kfd_procfs_stats_show,
+};
+
+static struct attribute *procfs_stats_attrs[] = {
+	NULL
+};
+
+static struct kobj_type procfs_stats_type = {
+	.sysfs_ops = &procfs_stats_ops,
+	.default_attrs = procfs_stats_attrs,
+};
+
 int kfd_procfs_add_queue(struct queue *q)
{
 	struct kfd_process *proc;
@@ -416,6 +502,72 @@ static int kfd_sysfs_create_file(struct kfd_process *p, struct attribute *attr,
 	return ret;
}
 
+static int kfd_procfs_add_sysfs_stats(struct kfd_process *p)
+{
+	int ret = 0;
+	struct kfd_process_device *pdd;
+	char stats_dir_filename[MAX_SYSFS_FILENAME_LEN];
+
+	if (!p)
+		return -EINVAL;
+
+	if (!p->kobj)
+		return -EFAULT;
+
+	/*
+	 * Create sysfs files for each GPU:
+	 * - proc/<pid>/stats_<gpuid>/
+	 * - proc/<pid>/stats_<gpuid>/evicted_ms
+	 * - proc/<pid>/stats_<gpuid>/cu_occupancy
+	 */
+	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
+		struct kobject *kobj_stats;
+
+		snprintf(stats_dir_filename, MAX_SYSFS_FILENAME_LEN,
+				"stats_%u", pdd->dev->id);
+		kobj_stats = kfd_alloc_struct(kobj_stats);
+		if (!kobj_stats)
+			return -ENOMEM;
+
+		ret = kobject_init_and_add(kobj_stats,
+						&procfs_stats_type,
+						p->kobj,
+						stats_dir_filename);
+
+		if (ret) {
+			pr_warn("Creating KFD proc/stats_%s folder failed",
+					stats_dir_filename);
+			kobject_put(kobj_stats);
+			goto err;
+		}
+
+		pdd->kobj_stats = kobj_stats;
+		pdd->attr_evict.name = "evicted_ms";
+		pdd->attr_evict.mode = KFD_SYSFS_FILE_MODE;
+		sysfs_attr_init(&pdd->attr_evict);
+		ret = sysfs_create_file(kobj_stats, &pdd->attr_evict);
+		if (ret)
+			pr_warn("Creating eviction stats for gpuid %d failed",
+					(int)pdd->dev->id);
+
+		/* Add sysfs file to report compute unit occupancy */
+		if (pdd->dev->kfd2kgd->get_cu_occupancy != NULL) {
+			pdd->attr_cu_occupancy.name = "cu_occupancy";
+			pdd->attr_cu_occupancy.mode = KFD_SYSFS_FILE_MODE;
+			sysfs_attr_init(&pdd->attr_cu_occupancy);
+			ret = sysfs_create_file(kobj_stats,
+						&pdd->attr_cu_occupancy);
+			if (ret)
+				pr_warn("Creating %s failed for gpuid: %d",
+					pdd->attr_cu_occupancy.name,
+					(int)pdd->dev->id);
+		}
+	}
+err:
+	return ret;
+}
+
+
 static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
{
 	int ret = 0;
@@ -451,7 +603,6 @@ static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
 	return ret;
}
 
-
 void kfd_procfs_del_queue(struct queue *q)
{
 	if (!q)
@@ -659,6 +810,11 @@ struct kfd_process *kfd_create_process(struct file *filep)
 		if (!process->kobj_queues)
 			pr_warn("Creating KFD proc/queues folder failed");
 
+		ret = kfd_procfs_add_sysfs_stats(process);
+		if (ret)
+			pr_warn("Creating sysfs stats dir for pid %d failed",
+				(int)process->lead_thread->pid);
+
 		ret = kfd_procfs_add_sysfs_files(process);
 		if (ret)
 			pr_warn("Creating sysfs usage file for pid %d failed",
@@ -780,6 +936,8 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
 		kfree(pdd->qpd.doorbell_bitmap);
 		idr_destroy(&pdd->alloc_idr);
 
+		kfd_free_process_doorbells(pdd->dev, pdd->doorbell_index);
+
 		/*
 		 * before destroying pdd, make sure to report availability
 		 * for auto suspend
@@ -815,6 +973,12 @@ static void kfd_process_wq_release(struct work_struct *work)
 		list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
 			sysfs_remove_file(p->kobj, &pdd->attr_vram);
 			sysfs_remove_file(p->kobj, &pdd->attr_sdma);
+			sysfs_remove_file(p->kobj, &pdd->attr_evict);
+			if (pdd->dev->kfd2kgd->get_cu_occupancy != NULL)
+				sysfs_remove_file(p->kobj, &pdd->attr_cu_occupancy);
+			kobject_del(pdd->kobj_stats);
+			kobject_put(pdd->kobj_stats);
+			pdd->kobj_stats = NULL;
 		}
 
 		kobject_del(p->kobj);
@@ -832,8 +996,6 @@ static void kfd_process_wq_release(struct work_struct *work)
 	kfd_event_free_process(p);
 
 	kfd_pasid_free(p->pasid);
-	kfd_free_process_doorbells(p);
-
 	mutex_destroy(&p->mutex);
 
 	put_task_struct(p->lead_thread);
@@ -1011,9 +1173,6 @@ static struct kfd_process *create_process(const struct task_struct *thread)
 	if (process->pasid == 0)
 		goto err_alloc_pasid;
 
-	if (kfd_alloc_process_doorbells(process) < 0)
-		goto err_alloc_doorbells;
-
 	err = pqm_init(&process->pqm, process);
 	if (err != 0)
 		goto err_process_pqm_init;
@@ -1041,8 +1200,6 @@ err_register_notifier:
err_init_apertures:
 	pqm_uninit(&process->pqm);
err_process_pqm_init:
-	kfd_free_process_doorbells(process);
-err_alloc_doorbells:
 	kfd_pasid_free(process->pasid);
err_alloc_pasid:
 	mutex_destroy(&process->mutex);
@@ -1105,10 +1262,14 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
 	if (!pdd)
 		return NULL;
 
+	if (kfd_alloc_process_doorbells(dev, &pdd->doorbell_index) < 0) {
+		pr_err("Failed to alloc doorbell for pdd\n");
+		goto err_free_pdd;
+	}
+
 	if (init_doorbell_bitmap(&pdd->qpd, dev)) {
 		pr_err("Failed to init doorbell for process\n");
-		kfree(pdd);
-		return NULL;
+		goto err_free_pdd;
 	}
 
 	pdd->dev = dev;
@@ -1124,12 +1285,17 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
 	pdd->runtime_inuse = false;
 	pdd->vram_usage = 0;
 	pdd->sdma_past_activity_counter = 0;
+	atomic64_set(&pdd->evict_duration_counter, 0);
 	list_add(&pdd->per_device_list, &p->per_device_data);
 
 	/* Init idr used for memory handle translation */
 	idr_init(&pdd->alloc_idr);
 
 	return pdd;
+
+err_free_pdd:
+	kfree(pdd);
+	return NULL;
}
 
/**
@@ -1487,6 +1653,7 @@ void kfd_suspend_all_processes(void)
 	unsigned int temp;
 	int idx = srcu_read_lock(&kfd_processes_srcu);
 
+	WARN(debug_evictions, "Evicting all processes");
 	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
 		cancel_delayed_work_sync(&p->eviction_work);
 		cancel_delayed_work_sync(&p->restore_work);
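Editorial note: kfd_get_cu_occupancy() above converts a wave count into occupied CUs with a ceiling division, so even a single wave on a CU counts that whole CU as occupied. A standalone C sketch of the translation; max_waves_per_cu = 40 follows the GFX9 example in the kfd_priv.h comment (simd_per_cu * max_waves_per_simd, assuming the usual 4 SIMDs times 10 waves), not a value read from hardware:

#include <stdio.h>

/* Ceiling division, as in kfd_get_cu_occupancy(). */
static int waves_to_cus(int wave_cnt, int max_waves_per_cu)
{
	return (wave_cnt + (max_waves_per_cu - 1)) / max_waves_per_cu;
}

int main(void)
{
	int max_waves_per_cu = 40;	/* assumed GFX9, no scratch */

	printf("%d waves -> %d CU(s)\n", 1, waves_to_cus(1, max_waves_per_cu));
	printf("%d waves -> %d CU(s)\n", 40, waves_to_cus(40, max_waves_per_cu));
	printf("%d waves -> %d CU(s)\n", 41, waves_to_cus(41, max_waves_per_cu));
	return 0;
}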
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 7b348bf9df21..17d1736367ea 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -24,6 +24,7 @@
 #include <linux/wait.h>
 #include <linux/anon_inodes.h>
 #include <uapi/linux/kfd_ioctl.h>
+#include "amdgpu.h"
 #include "amdgpu_vm.h"
 #include "kfd_priv.h"
 #include "kfd_smi_events.h"
@@ -148,15 +149,94 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
 	return 0;
}
 
+static void add_event_to_kfifo(struct kfd_dev *dev, unsigned int smi_event,
+			      char *event_msg, int len)
+{
+	struct kfd_smi_client *client;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
+		if (!(READ_ONCE(client->events) &
+				KFD_SMI_EVENT_MASK_FROM_INDEX(smi_event)))
+			continue;
+		spin_lock(&client->lock);
+		if (kfifo_avail(&client->fifo) >= len) {
+			kfifo_in(&client->fifo, event_msg, len);
+			wake_up_all(&client->wait_queue);
+		} else {
+			pr_debug("smi_event(EventID: %u): no space left\n",
+					smi_event);
+		}
+		spin_unlock(&client->lock);
+	}
+
+	rcu_read_unlock();
+}
+
+void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset)
+{
+	/*
+	 * GpuReset msg = Reset seq number (incremented for
+	 * every reset message sent before GPU reset).
+	 * 1 byte event + 1 byte space + 8 bytes seq num +
+	 * 1 byte \n + 1 byte \0 = 12
+	 */
+	char fifo_in[12];
+	int len;
+	unsigned int event;
+
+	if (list_empty(&dev->smi_clients))
+		return;
+
+	memset(fifo_in, 0x0, sizeof(fifo_in));
+
+	if (post_reset) {
+		event = KFD_SMI_EVENT_GPU_POST_RESET;
+	} else {
+		event = KFD_SMI_EVENT_GPU_PRE_RESET;
+		++(dev->reset_seq_num);
+	}
+
+	len = snprintf(fifo_in, sizeof(fifo_in), "%x %x\n", event,
+						dev->reset_seq_num);
+
+	add_event_to_kfifo(dev, event, fifo_in, len);
+}
+
+void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
+					     uint32_t throttle_bitmask)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
+	/*
+	 * ThermalThrottle msg = throttle_bitmask(8):
+	 *			 thermal_interrupt_count(16):
+	 * 1 byte event + 1 byte space + 8 byte throttle_bitmask +
+	 * 1 byte : + 16 byte thermal_interupt_counter + 1 byte \n +
+	 * 1 byte \0 = 29
+	 */
+	char fifo_in[29];
+	int len;
+
+	if (list_empty(&dev->smi_clients))
+		return;
+
+	len = snprintf(fifo_in, sizeof(fifo_in), "%x %x:%llx\n",
+		       KFD_SMI_EVENT_THERMAL_THROTTLE, throttle_bitmask,
+		       atomic64_read(&adev->smu.throttle_int_counter));
+
+	add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in, len);
+}
+
 void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
{
 	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
 	struct amdgpu_task_info task_info;
 	/* VmFault msg = (hex)uint32_pid(8) + :(1) + task name(16) = 25 */
-	/* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43
+	/* 1 byte event + 1 byte space + 25 bytes msg + 1 byte \n +
+	 * 1 byte \0 = 29
 	 */
-	char fifo_in[43];
-	struct kfd_smi_client *client;
+	char fifo_in[29];
 	int len;
 
 	if (list_empty(&dev->smi_clients))
@@ -168,25 +248,10 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
 	if (!task_info.pid)
 		return;
 
-	len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
+	len = snprintf(fifo_in, sizeof(fifo_in), "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
 		task_info.pid, task_info.task_name);
 
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
-		if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT))
-			continue;
-		spin_lock(&client->lock);
-		if (kfifo_avail(&client->fifo) >= len) {
-			kfifo_in(&client->fifo, fifo_in, len);
-			wake_up_all(&client->wait_queue);
-		}
-		else
-			pr_debug("smi_event(vmfault): no space left\n");
-		spin_unlock(&client->lock);
-	}
-
-	rcu_read_unlock();
+	add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
}
 
 int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
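Editorial note: the SMI message buffers above are sized from worst-case hex field widths, e.g. the thermal-throttle message budgets 1 byte event + 1 space + 8-digit bitmask + ':' + 16-digit counter + '\n' + '\0' = 29. The sizing can be re-run in userspace (standalone sketch; the format string and 29-byte budget come from the diff, the event id and counter values are hypothetical worst-case fillers):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	char fifo_in[29];
	unsigned int event = 2;			/* hypothetical event id */
	uint32_t throttle_bitmask = 0xdeadbeef;	/* 8 hex digits */
	uint64_t counter = UINT64_MAX;		/* 16 hex digits, worst case */
	int len;

	len = snprintf(fifo_in, sizeof(fifo_in), "%x %x:%llx\n",
		       event, throttle_bitmask,
		       (unsigned long long)counter);
	printf("msg = %slen = %d (fits in %zu)\n", fifo_in, len,
	       sizeof(fifo_in));
	return 0;
}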
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index a9cb218fef96..b9b0438202e2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -25,5 +25,8 @@
 
 int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
 void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
+void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
+					     uint32_t throttle_bitmask);
+void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index f185f6cbc05c..2b31c3066aaa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -446,7 +446,7 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
 	sysfs_show_32bit_prop(buffer, offs, "cpu_cores_count",
 			      dev->node_props.cpu_cores_count);
 	sysfs_show_32bit_prop(buffer, offs, "simd_count",
-			      dev->node_props.simd_count);
+			      dev->gpu ? dev->node_props.simd_count : 0);
 	sysfs_show_32bit_prop(buffer, offs, "mem_banks_count",
 			      dev->node_props.mem_banks_count);
 	sysfs_show_32bit_prop(buffer, offs, "caches_count",
@@ -1139,7 +1139,7 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu)
 		/* Discrete GPUs need their own topology device list
 		 * entries. Don't assign them to CPU/APU nodes.
 		 */
-		if (!gpu->device_info->needs_iommu_device &&
+		if (!gpu->use_iommu_v2 &&
 		    dev->node_props.cpu_cores_count)
 			continue;
 
@@ -1239,7 +1239,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
 	void *crat_image = NULL;
 	size_t image_size = 0;
 	int proximity_domain;
-	struct amdgpu_ras *ctx;
+	struct amdgpu_device *adev;
 
 	INIT_LIST_HEAD(&temp_topology_device_list);
 
@@ -1388,7 +1388,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
 	 * Overwrite ATS capability according to needs_iommu_device to fix
 	 * potential missing corresponding bit in CRAT of BIOS.
 	 */
-	if (dev->gpu->device_info->needs_iommu_device)
+	if (dev->gpu->use_iommu_v2)
 		dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
 	else
 		dev->node_props.capability &= ~HSA_CAP_ATS_PRESENT;
@@ -1404,19 +1404,17 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
 		dev->node_props.max_waves_per_simd = 10;
 	}
 
-	ctx = amdgpu_ras_get_context((struct amdgpu_device *)(dev->gpu->kgd));
-	if (ctx) {
-		/* kfd only concerns sram ecc on GFX/SDMA and HBM ecc on UMC */
-		dev->node_props.capability |=
-			(((ctx->features & BIT(AMDGPU_RAS_BLOCK__SDMA)) != 0) ||
-			 ((ctx->features & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0)) ?
-			HSA_CAP_SRAM_EDCSUPPORTED : 0;
-		dev->node_props.capability |= ((ctx->features & BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ?
-			HSA_CAP_MEM_EDCSUPPORTED : 0;
-
-		dev->node_props.capability |= (ctx->features != 0) ?
+	adev = (struct amdgpu_device *)(dev->gpu->kgd);
+	/* kfd only concerns sram ecc on GFX and HBM ecc on UMC */
+	dev->node_props.capability |=
+		((adev->ras_features & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ?
+		HSA_CAP_SRAM_EDCSUPPORTED : 0;
+	dev->node_props.capability |= ((adev->ras_features & BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ?
+		HSA_CAP_MEM_EDCSUPPORTED : 0;
+
+	if (adev->asic_type != CHIP_VEGA10)
+		dev->node_props.capability |= (adev->ras_features != 0) ?
 			HSA_CAP_RASEVENTNOTIFY : 0;
-	}
 
 	kfd_debug_print_topology();
 
@@ -1515,6 +1513,29 @@ int kfd_numa_node_to_apic_id(int numa_node_id)
 	return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id));
}
 
+void kfd_double_confirm_iommu_support(struct kfd_dev *gpu)
+{
+	struct kfd_topology_device *dev;
+
+	gpu->use_iommu_v2 = false;
+
+	if (!gpu->device_info->needs_iommu_device)
+		return;
+
+	down_read(&topology_lock);
+
+	/* Only use IOMMUv2 if there is an APU topology node with no GPU
+	 * assigned yet. This GPU will be assigned to it.
+	 */
+	list_for_each_entry(dev, &topology_device_list, list)
+		if (dev->node_props.cpu_cores_count &&
+		    dev->node_props.simd_count &&
+		    !dev->gpu)
+			gpu->use_iommu_v2 = true;
+
+	up_read(&topology_lock);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data)