summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_topology.c')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_topology.c371
1 files changed, 299 insertions, 72 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 3f0a4a415907..bceb1a5b2518 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -115,7 +115,7 @@ struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev)
down_read(&topology_lock);
list_for_each_entry(top_dev, &topology_device_list, list)
- if (top_dev->gpu && top_dev->gpu->pdev == pdev) {
+ if (top_dev->gpu && top_dev->gpu->adev->pdev == pdev) {
device = top_dev->gpu;
break;
}
@@ -364,7 +364,6 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr,
/* Making sure that the buffer is an empty string */
buffer[0] = 0;
-
cache = container_of(attr, struct kfd_cache_properties, attr);
if (cache->gpu && kfd_devcgroup_check_permission(cache->gpu))
return -EPERM;
@@ -379,12 +378,13 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr,
sysfs_show_32bit_prop(buffer, offs, "association", cache->cache_assoc);
sysfs_show_32bit_prop(buffer, offs, "latency", cache->cache_latency);
sysfs_show_32bit_prop(buffer, offs, "type", cache->cache_type);
+
offs += snprintf(buffer+offs, PAGE_SIZE-offs, "sibling_map ");
- for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++)
+ for (i = 0; i < cache->sibling_map_size; i++)
for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++)
/* Check each bit */
offs += snprintf(buffer+offs, PAGE_SIZE-offs, "%d,",
- (cache->sibling_map[i] >> j) & 1);
+ (cache->sibling_map[i] >> j) & 1);
/* Replace the last "," with end of line */
buffer[offs-1] = '\n';
@@ -1169,13 +1169,12 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu)
local_mem_size = gpu->local_mem_info.local_mem_size_private +
gpu->local_mem_info.local_mem_size_public;
-
- buf[0] = gpu->pdev->devfn;
- buf[1] = gpu->pdev->subsystem_vendor |
- (gpu->pdev->subsystem_device << 16);
- buf[2] = pci_domain_nr(gpu->pdev->bus);
- buf[3] = gpu->pdev->device;
- buf[4] = gpu->pdev->bus->number;
+ buf[0] = gpu->adev->pdev->devfn;
+ buf[1] = gpu->adev->pdev->subsystem_vendor |
+ (gpu->adev->pdev->subsystem_device << 16);
+ buf[2] = pci_domain_nr(gpu->adev->pdev->bus);
+ buf[3] = gpu->adev->pdev->device;
+ buf[4] = gpu->adev->pdev->bus->number;
buf[5] = lower_32_bits(local_mem_size);
buf[6] = upper_32_bits(local_mem_size);
@@ -1198,7 +1197,6 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu)
struct kfd_iolink_properties *iolink;
struct kfd_iolink_properties *p2plink;
- down_write(&topology_lock);
list_for_each_entry(dev, &topology_device_list, list) {
/* Discrete GPUs need their own topology device list
* entries. Don't assign them to CPU/APU nodes.
@@ -1222,7 +1220,6 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu)
break;
}
}
- up_write(&topology_lock);
return out_dev;
}
@@ -1269,7 +1266,7 @@ static void kfd_set_iolink_no_atomics(struct kfd_topology_device *dev,
if (target_gpu_dev) {
uint32_t cap;
- pcie_capability_read_dword(target_gpu_dev->gpu->pdev,
+ pcie_capability_read_dword(target_gpu_dev->gpu->adev->pdev,
PCI_EXP_DEVCAP2, &cap);
if (!(cap & (PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
@@ -1593,21 +1590,290 @@ out:
return ret;
}
+
+/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
+static int fill_in_l1_pcache(struct kfd_cache_properties **props_ext,
+ struct kfd_gpu_cache_info *pcache_info,
+ struct kfd_cu_info *cu_info,
+ int cu_bitmask,
+ int cache_type, unsigned int cu_processor_id,
+ int cu_block)
+{
+ unsigned int cu_sibling_map_mask;
+ int first_active_cu;
+ struct kfd_cache_properties *pcache = NULL;
+
+ cu_sibling_map_mask = cu_bitmask;
+ cu_sibling_map_mask >>= cu_block;
+ cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);
+ first_active_cu = ffs(cu_sibling_map_mask);
+
+ /* CU could be inactive. In case of shared cache find the first active
+ * CU. and incase of non-shared cache check if the CU is inactive. If
+ * inactive active skip it
+ */
+ if (first_active_cu) {
+ pcache = kfd_alloc_struct(pcache);
+ if (!pcache)
+ return -ENOMEM;
+
+ memset(pcache, 0, sizeof(struct kfd_cache_properties));
+ pcache->processor_id_low = cu_processor_id + (first_active_cu - 1);
+ pcache->cache_level = pcache_info[cache_type].cache_level;
+ pcache->cache_size = pcache_info[cache_type].cache_size;
+
+ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_DATA_CACHE)
+ pcache->cache_type |= HSA_CACHE_TYPE_DATA;
+ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_INST_CACHE)
+ pcache->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
+ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_CPU_CACHE)
+ pcache->cache_type |= HSA_CACHE_TYPE_CPU;
+ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
+ pcache->cache_type |= HSA_CACHE_TYPE_HSACU;
+
+ /* Sibling map is w.r.t processor_id_low, so shift out
+ * inactive CU
+ */
+ cu_sibling_map_mask =
+ cu_sibling_map_mask >> (first_active_cu - 1);
+
+ pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
+ pcache->sibling_map[1] =
+ (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
+ pcache->sibling_map[2] =
+ (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
+ pcache->sibling_map[3] =
+ (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
+
+ pcache->sibling_map_size = 4;
+ *props_ext = pcache;
+
+ return 0;
+ }
+ return 1;
+}
+
+/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
+static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
+ struct kfd_gpu_cache_info *pcache_info,
+ struct kfd_cu_info *cu_info,
+ int cache_type, unsigned int cu_processor_id)
+{
+ unsigned int cu_sibling_map_mask;
+ int first_active_cu;
+ int i, j, k;
+ struct kfd_cache_properties *pcache = NULL;
+
+ cu_sibling_map_mask = cu_info->cu_bitmap[0][0];
+ cu_sibling_map_mask &=
+ ((1 << pcache_info[cache_type].num_cu_shared) - 1);
+ first_active_cu = ffs(cu_sibling_map_mask);
+
+ /* CU could be inactive. In case of shared cache find the first active
+ * CU. and incase of non-shared cache check if the CU is inactive. If
+ * inactive active skip it
+ */
+ if (first_active_cu) {
+ pcache = kfd_alloc_struct(pcache);
+ if (!pcache)
+ return -ENOMEM;
+
+ memset(pcache, 0, sizeof(struct kfd_cache_properties));
+ pcache->processor_id_low = cu_processor_id
+ + (first_active_cu - 1);
+ pcache->cache_level = pcache_info[cache_type].cache_level;
+ pcache->cache_size = pcache_info[cache_type].cache_size;
+
+ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_DATA_CACHE)
+ pcache->cache_type |= HSA_CACHE_TYPE_DATA;
+ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_INST_CACHE)
+ pcache->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
+ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_CPU_CACHE)
+ pcache->cache_type |= HSA_CACHE_TYPE_CPU;
+ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
+ pcache->cache_type |= HSA_CACHE_TYPE_HSACU;
+
+ /* Sibling map is w.r.t processor_id_low, so shift out
+ * inactive CU
+ */
+ cu_sibling_map_mask = cu_sibling_map_mask >> (first_active_cu - 1);
+ k = 0;
+
+ for (i = 0; i < cu_info->num_shader_engines; i++) {
+ for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) {
+ pcache->sibling_map[k] = (uint8_t)(cu_sibling_map_mask & 0xFF);
+ pcache->sibling_map[k+1] = (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
+ pcache->sibling_map[k+2] = (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
+ pcache->sibling_map[k+3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
+ k += 4;
+
+ cu_sibling_map_mask = cu_info->cu_bitmap[i % 4][j + i / 4];
+ cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);
+ }
+ }
+ pcache->sibling_map_size = k;
+ *props_ext = pcache;
+ return 0;
+ }
+ return 1;
+}
+
+#define KFD_MAX_CACHE_TYPES 6
+
+/* kfd_fill_cache_non_crat_info - Fill GPU cache info using kfd_gpu_cache_info
+ * tables
+ */
+static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct kfd_dev *kdev)
+{
+ struct kfd_gpu_cache_info *pcache_info = NULL;
+ int i, j, k;
+ int ct = 0;
+ unsigned int cu_processor_id;
+ int ret;
+ unsigned int num_cu_shared;
+ struct kfd_cu_info cu_info;
+ struct kfd_cu_info *pcu_info;
+ int gpu_processor_id;
+ struct kfd_cache_properties *props_ext;
+ int num_of_entries = 0;
+ int num_of_cache_types = 0;
+ struct kfd_gpu_cache_info cache_info[KFD_MAX_CACHE_TYPES];
+
+ amdgpu_amdkfd_get_cu_info(kdev->adev, &cu_info);
+ pcu_info = &cu_info;
+
+ gpu_processor_id = dev->node_props.simd_id_base;
+
+ pcache_info = cache_info;
+ num_of_cache_types = kfd_get_gpu_cache_info(kdev, &pcache_info);
+ if (!num_of_cache_types) {
+ pr_warn("no cache info found\n");
+ return;
+ }
+
+ /* For each type of cache listed in the kfd_gpu_cache_info table,
+ * go through all available Compute Units.
+ * The [i,j,k] loop will
+ * if kfd_gpu_cache_info.num_cu_shared = 1
+ * will parse through all available CU
+ * If (kfd_gpu_cache_info.num_cu_shared != 1)
+ * then it will consider only one CU from
+ * the shared unit
+ */
+ for (ct = 0; ct < num_of_cache_types; ct++) {
+ cu_processor_id = gpu_processor_id;
+ if (pcache_info[ct].cache_level == 1) {
+ for (i = 0; i < pcu_info->num_shader_engines; i++) {
+ for (j = 0; j < pcu_info->num_shader_arrays_per_engine; j++) {
+ for (k = 0; k < pcu_info->num_cu_per_sh; k += pcache_info[ct].num_cu_shared) {
+
+ ret = fill_in_l1_pcache(&props_ext, pcache_info, pcu_info,
+ pcu_info->cu_bitmap[i % 4][j + i / 4], ct,
+ cu_processor_id, k);
+
+ if (ret < 0)
+ break;
+
+ if (!ret) {
+ num_of_entries++;
+ list_add_tail(&props_ext->list, &dev->cache_props);
+ }
+
+ /* Move to next CU block */
+ num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <=
+ pcu_info->num_cu_per_sh) ?
+ pcache_info[ct].num_cu_shared :
+ (pcu_info->num_cu_per_sh - k);
+ cu_processor_id += num_cu_shared;
+ }
+ }
+ }
+ } else {
+ ret = fill_in_l2_l3_pcache(&props_ext, pcache_info,
+ pcu_info, ct, cu_processor_id);
+
+ if (ret < 0)
+ break;
+
+ if (!ret) {
+ num_of_entries++;
+ list_add_tail(&props_ext->list, &dev->cache_props);
+ }
+ }
+ }
+ dev->node_props.caches_count += num_of_entries;
+ pr_debug("Added [%d] GPU cache entries\n", num_of_entries);
+}
+
+static int kfd_topology_add_device_locked(struct kfd_dev *gpu, uint32_t gpu_id,
+ struct kfd_topology_device **dev)
+{
+ int proximity_domain = ++topology_crat_proximity_domain;
+ struct list_head temp_topology_device_list;
+ void *crat_image = NULL;
+ size_t image_size = 0;
+ int res;
+
+ res = kfd_create_crat_image_virtual(&crat_image, &image_size,
+ COMPUTE_UNIT_GPU, gpu,
+ proximity_domain);
+ if (res) {
+ pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
+ gpu_id);
+ topology_crat_proximity_domain--;
+ goto err;
+ }
+
+ INIT_LIST_HEAD(&temp_topology_device_list);
+
+ res = kfd_parse_crat_table(crat_image,
+ &temp_topology_device_list,
+ proximity_domain);
+ if (res) {
+ pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
+ gpu_id);
+ topology_crat_proximity_domain--;
+ goto err;
+ }
+
+ kfd_topology_update_device_list(&temp_topology_device_list,
+ &topology_device_list);
+
+ *dev = kfd_assign_gpu(gpu);
+ if (WARN_ON(!*dev)) {
+ res = -ENODEV;
+ goto err;
+ }
+
+ /* Fill the cache affinity information here for the GPUs
+ * using VCRAT
+ */
+ kfd_fill_cache_non_crat_info(*dev, gpu);
+
+ /* Update the SYSFS tree, since we added another topology
+ * device
+ */
+ res = kfd_topology_update_sysfs();
+ if (!res)
+ sys_props.generation_count++;
+ else
+ pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
+ gpu_id, res);
+
+err:
+ kfd_destroy_crat_image(crat_image);
+ return res;
+}
+
int kfd_topology_add_device(struct kfd_dev *gpu)
{
uint32_t gpu_id;
struct kfd_topology_device *dev;
struct kfd_cu_info cu_info;
int res = 0;
- struct list_head temp_topology_device_list;
- void *crat_image = NULL;
- size_t image_size = 0;
- int proximity_domain;
int i;
const char *asic_name = amdgpu_asic_name[gpu->adev->asic_type];
- INIT_LIST_HEAD(&temp_topology_device_list);
-
gpu_id = kfd_generate_gpu_id(gpu);
pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
@@ -1617,50 +1883,13 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
* CRAT to create a new topology device. Once created assign the gpu to
* that topology device
*/
+ down_write(&topology_lock);
dev = kfd_assign_gpu(gpu);
- if (!dev) {
- down_write(&topology_lock);
- proximity_domain = ++topology_crat_proximity_domain;
-
- res = kfd_create_crat_image_virtual(&crat_image, &image_size,
- COMPUTE_UNIT_GPU, gpu,
- proximity_domain);
- if (res) {
- pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
- gpu_id);
- topology_crat_proximity_domain--;
- return res;
- }
- res = kfd_parse_crat_table(crat_image,
- &temp_topology_device_list,
- proximity_domain);
- if (res) {
- pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
- gpu_id);
- topology_crat_proximity_domain--;
- goto err;
- }
-
- kfd_topology_update_device_list(&temp_topology_device_list,
- &topology_device_list);
-
- /* Update the SYSFS tree, since we added another topology
- * device
- */
- res = kfd_topology_update_sysfs();
- up_write(&topology_lock);
-
- if (!res)
- sys_props.generation_count++;
- else
- pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
- gpu_id, res);
- dev = kfd_assign_gpu(gpu);
- if (WARN_ON(!dev)) {
- res = -ENODEV;
- goto err;
- }
- }
+ if (!dev)
+ res = kfd_topology_add_device_locked(gpu, gpu_id, &dev);
+ up_write(&topology_lock);
+ if (res)
+ return res;
dev->gpu_id = gpu_id;
gpu->id = gpu_id;
@@ -1688,13 +1917,13 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
cu_info.num_shader_arrays_per_engine;
dev->node_props.gfx_target_version = gpu->device_info.gfx_target_version;
- dev->node_props.vendor_id = gpu->pdev->vendor;
- dev->node_props.device_id = gpu->pdev->device;
+ dev->node_props.vendor_id = gpu->adev->pdev->vendor;
+ dev->node_props.device_id = gpu->adev->pdev->device;
dev->node_props.capability |=
((dev->gpu->adev->rev_id << HSA_CAP_ASIC_REVISION_SHIFT) &
HSA_CAP_ASIC_REVISION_MASK);
- dev->node_props.location_id = pci_dev_id(gpu->pdev);
- dev->node_props.domain = pci_domain_nr(gpu->pdev->bus);
+ dev->node_props.location_id = pci_dev_id(gpu->adev->pdev);
+ dev->node_props.domain = pci_domain_nr(gpu->adev->pdev->bus);
dev->node_props.max_engine_clk_fcompute =
amdgpu_amdkfd_get_max_engine_clock_in_mhz(dev->gpu->adev);
dev->node_props.max_engine_clk_ccompute =
@@ -1783,11 +2012,9 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
kfd_debug_print_topology();
- if (!res)
- kfd_notify_gpu_change(gpu_id, 1);
-err:
- kfd_destroy_crat_image(crat_image);
- return res;
+ kfd_notify_gpu_change(gpu_id, 1);
+
+ return 0;
}
/**