diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 221 |
1 files changed, 119 insertions, 102 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index e3a3755cb999..1162913c8bf4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -35,11 +35,9 @@ static DEFINE_MUTEX(xgmi_mutex); -#define AMDGPU_MAX_XGMI_HIVE 8 #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4 -static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE]; -static unsigned hive_count = 0; +static LIST_HEAD(xgmi_hive_list); static const int xgmi_pcs_err_status_reg_vg20[] = { smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS, @@ -171,65 +169,53 @@ static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = { * */ +static struct attribute amdgpu_xgmi_hive_id = { + .name = "xgmi_hive_id", + .mode = S_IRUGO +}; -static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct amdgpu_hive_info *hive = - container_of(attr, struct amdgpu_hive_info, dev_attr); - - return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id); -} +static struct attribute *amdgpu_xgmi_hive_attrs[] = { + &amdgpu_xgmi_hive_id, + NULL +}; -static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev, - struct amdgpu_hive_info *hive) +static ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj, + struct attribute *attr, char *buf) { - int ret = 0; + struct amdgpu_hive_info *hive = container_of( + kobj, struct amdgpu_hive_info, kobj); - if (WARN_ON(hive->kobj)) - return -EINVAL; - - hive->kobj = kobject_create_and_add("xgmi_hive_info", &adev->dev->kobj); - if (!hive->kobj) { - dev_err(adev->dev, "XGMI: Failed to allocate sysfs entry!\n"); - return -EINVAL; - } - - hive->dev_attr = (struct device_attribute) { - .attr = { - .name = "xgmi_hive_id", - .mode = S_IRUGO, - - }, - .show = amdgpu_xgmi_show_hive_id, - }; + if (attr == &amdgpu_xgmi_hive_id) + return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id); - ret = sysfs_create_file(hive->kobj, &hive->dev_attr.attr); - if (ret) { - dev_err(adev->dev, "XGMI: Failed to create device file xgmi_hive_id\n"); - kobject_del(hive->kobj); - kobject_put(hive->kobj); - hive->kobj = NULL; - } - - return ret; + return 0; } -static void amdgpu_xgmi_sysfs_destroy(struct amdgpu_device *adev, - struct amdgpu_hive_info *hive) +static void amdgpu_xgmi_hive_release(struct kobject *kobj) { - sysfs_remove_file(hive->kobj, &hive->dev_attr.attr); - kobject_del(hive->kobj); - kobject_put(hive->kobj); - hive->kobj = NULL; + struct amdgpu_hive_info *hive = container_of( + kobj, struct amdgpu_hive_info, kobj); + + mutex_destroy(&hive->hive_lock); + kfree(hive); } +static const struct sysfs_ops amdgpu_xgmi_hive_ops = { + .show = amdgpu_xgmi_show_attrs, +}; + +struct kobj_type amdgpu_xgmi_hive_type = { + .release = amdgpu_xgmi_hive_release, + .sysfs_ops = &amdgpu_xgmi_hive_ops, + .default_attrs = amdgpu_xgmi_hive_attrs, +}; + static ssize_t amdgpu_xgmi_show_device_id(struct device *dev, struct device_attribute *attr, char *buf) { struct drm_device *ddev = dev_get_drvdata(dev); - struct amdgpu_device *adev = ddev->dev_private; + struct amdgpu_device *adev = drm_to_adev(ddev); return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.xgmi.node_id); @@ -241,7 +227,7 @@ static ssize_t amdgpu_xgmi_show_error(struct device *dev, char *buf) { struct drm_device *ddev = dev_get_drvdata(dev); - struct amdgpu_device *adev = ddev->dev_private; + struct amdgpu_device *adev = drm_to_adev(ddev); uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in; uint64_t fica_out; unsigned int error_count = 0; @@ -287,8 +273,8 @@ static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, /* Create sysfs link to hive info folder on the first device */ - if (adev != hive->adev) { - ret = sysfs_create_link(&adev->dev->kobj, hive->kobj, + if (hive->kobj.parent != (&adev->dev->kobj)) { + ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj, "xgmi_hive_info"); if (ret) { dev_err(adev->dev, "XGMI: Failed to create link to hive info"); @@ -296,9 +282,9 @@ static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, } } - sprintf(node, "node%d", hive->number_devices); + sprintf(node, "node%d", atomic_read(&hive->number_devices)); /* Create sysfs link form the hive folder to yourself */ - ret = sysfs_create_link(hive->kobj, &adev->dev->kobj, node); + ret = sysfs_create_link(&hive->kobj, &adev->dev->kobj, node); if (ret) { dev_err(adev->dev, "XGMI: Failed to create link from hive info"); goto remove_link; @@ -308,7 +294,7 @@ static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, remove_link: - sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique); + sysfs_remove_link(&adev->dev->kobj, adev_to_drm(adev)->unique); remove_file: device_remove_file(adev->dev, &dev_attr_xgmi_device_id); @@ -326,78 +312,96 @@ static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev, device_remove_file(adev->dev, &dev_attr_xgmi_device_id); device_remove_file(adev->dev, &dev_attr_xgmi_error); - if (adev != hive->adev) + if (hive->kobj.parent != (&adev->dev->kobj)) sysfs_remove_link(&adev->dev->kobj,"xgmi_hive_info"); - sprintf(node, "node%d", hive->number_devices); - sysfs_remove_link(hive->kobj, node); + sprintf(node, "node%d", atomic_read(&hive->number_devices)); + sysfs_remove_link(&hive->kobj, node); } -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock) +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) { - int i; - struct amdgpu_hive_info *tmp; + struct amdgpu_hive_info *hive = NULL, *tmp = NULL; + int ret; if (!adev->gmc.xgmi.hive_id) return NULL; + if (adev->hive) { + kobject_get(&adev->hive->kobj); + return adev->hive; + } + mutex_lock(&xgmi_mutex); - for (i = 0 ; i < hive_count; ++i) { - tmp = &xgmi_hives[i]; - if (tmp->hive_id == adev->gmc.xgmi.hive_id) { - if (lock) - mutex_lock(&tmp->hive_lock); - mutex_unlock(&xgmi_mutex); - return tmp; + if (!list_empty(&xgmi_hive_list)) { + list_for_each_entry_safe(hive, tmp, &xgmi_hive_list, node) { + if (hive->hive_id == adev->gmc.xgmi.hive_id) + goto pro_end; } } - if (i >= AMDGPU_MAX_XGMI_HIVE) { - mutex_unlock(&xgmi_mutex); - return NULL; + + hive = kzalloc(sizeof(*hive), GFP_KERNEL); + if (!hive) { + dev_err(adev->dev, "XGMI: allocation failed\n"); + hive = NULL; + goto pro_end; } /* initialize new hive if not exist */ - tmp = &xgmi_hives[hive_count++]; - - if (amdgpu_xgmi_sysfs_create(adev, tmp)) { - mutex_unlock(&xgmi_mutex); - return NULL; + ret = kobject_init_and_add(&hive->kobj, + &amdgpu_xgmi_hive_type, + &adev->dev->kobj, + "%s", "xgmi_hive_info"); + if (ret) { + dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n"); + kfree(hive); + hive = NULL; + goto pro_end; } - tmp->adev = adev; - tmp->hive_id = adev->gmc.xgmi.hive_id; - INIT_LIST_HEAD(&tmp->device_list); - mutex_init(&tmp->hive_lock); - mutex_init(&tmp->reset_lock); - task_barrier_init(&tmp->tb); - - if (lock) - mutex_lock(&tmp->hive_lock); - tmp->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; - tmp->hi_req_gpu = NULL; + hive->hive_id = adev->gmc.xgmi.hive_id; + INIT_LIST_HEAD(&hive->device_list); + INIT_LIST_HEAD(&hive->node); + mutex_init(&hive->hive_lock); + atomic_set(&hive->in_reset, 0); + atomic_set(&hive->number_devices, 0); + task_barrier_init(&hive->tb); + hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; + hive->hi_req_gpu = NULL; /* * hive pstate on boot is high in vega20 so we have to go to low * pstate on after boot. */ - tmp->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE; + hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE; + list_add_tail(&hive->node, &xgmi_hive_list); + +pro_end: + if (hive) + kobject_get(&hive->kobj); mutex_unlock(&xgmi_mutex); + return hive; +} - return tmp; +void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive) +{ + if (hive) + kobject_put(&hive->kobj); } int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate) { int ret = 0; - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); struct amdgpu_device *request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev; bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20; bool init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN; + amdgpu_put_xgmi_hive(hive); /* fw bug so temporarily disable pstate switching */ return 0; @@ -449,7 +453,7 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev /* Each psp need to set the latest topology */ ret = psp_xgmi_set_topology_info(&adev->psp, - hive->number_devices, + atomic_read(&hive->number_devices), &adev->psp.xgmi_context.top_info); if (ret) dev_err(adev->dev, @@ -511,7 +515,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16; } - hive = amdgpu_get_xgmi_hive(adev, 1); + hive = amdgpu_get_xgmi_hive(adev); if (!hive) { ret = -EINVAL; dev_err(adev->dev, @@ -519,6 +523,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id); goto exit; } + mutex_lock(&hive->hive_lock); top_info = &adev->psp.xgmi_context.top_info; @@ -526,7 +531,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) list_for_each_entry(entry, &hive->device_list, head) top_info->nodes[count++].node_id = entry->node_id; top_info->num_nodes = count; - hive->number_devices = count; + atomic_set(&hive->number_devices, count); task_barrier_add_task(&hive->tb); @@ -541,7 +546,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) } ret = amdgpu_xgmi_update_topology(hive, tmp_adev); if (ret) - goto exit; + goto exit_unlock; } /* get latest topology info for each device from psp */ @@ -554,7 +559,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) tmp_adev->gmc.xgmi.node_id, tmp_adev->gmc.xgmi.hive_id, ret); /* To do : continue with some node failed or disable the whole hive */ - goto exit; + goto exit_unlock; } } } @@ -562,39 +567,51 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) if (!ret) ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive); - +exit_unlock: mutex_unlock(&hive->hive_lock); exit: - if (!ret) + if (!ret) { + adev->hive = hive; dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n", adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id); - else + } else { + amdgpu_put_xgmi_hive(hive); dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n", adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id, ret); + } return ret; } int amdgpu_xgmi_remove_device(struct amdgpu_device *adev) { - struct amdgpu_hive_info *hive; + struct amdgpu_hive_info *hive = adev->hive; if (!adev->gmc.xgmi.supported) return -EINVAL; - hive = amdgpu_get_xgmi_hive(adev, 1); if (!hive) return -EINVAL; + mutex_lock(&hive->hive_lock); task_barrier_rem_task(&hive->tb); amdgpu_xgmi_sysfs_rem_dev_info(adev, hive); + if (hive->hi_req_gpu == adev) + hive->hi_req_gpu = NULL; + list_del(&adev->gmc.xgmi.head); mutex_unlock(&hive->hive_lock); - if(!(--hive->number_devices)){ - amdgpu_xgmi_sysfs_destroy(adev, hive); - mutex_destroy(&hive->hive_lock); - mutex_destroy(&hive->reset_lock); + amdgpu_put_xgmi_hive(hive); + adev->hive = NULL; + + if (atomic_dec_return(&hive->number_devices) == 0) { + /* Remove the hive from global hive list */ + mutex_lock(&xgmi_mutex); + list_del(&hive->node); + mutex_unlock(&xgmi_mutex); + + amdgpu_put_xgmi_hive(hive); } return psp_xgmi_terminate(&adev->psp); |