diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 191 |
1 files changed, 53 insertions, 138 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 4e36551ab50b..c136bd449744 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -80,6 +80,8 @@ enum amdgpu_ras_retire_page_reservation { atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); +static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, + uint64_t addr); static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, uint64_t addr); @@ -516,9 +518,9 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, /* obj end */ static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev, - const char* invoke_type, - const char* block_name, - enum ta_ras_status ret) + const char* invoke_type, + const char* block_name, + enum ta_ras_status ret) { switch (ret) { case TA_RAS_STATUS__SUCCESS: @@ -607,7 +609,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, if (!con) return -EINVAL; - info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL); + info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL); if (!info) return -ENOMEM; @@ -903,13 +905,6 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, return ret; } -int amdgpu_ras_error_cure(struct amdgpu_device *adev, - struct ras_cure_if *info) -{ - /* psp fw has no cure interface for now. */ - return 0; -} - /* get the total error counts on all IPs */ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev, bool is_ce) @@ -953,7 +948,7 @@ static char *amdgpu_ras_badpage_flags_str(unsigned int flags) case AMDGPU_RAS_RETIRE_PAGE_FAULT: default: return "F"; - }; + } } /** @@ -1172,7 +1167,7 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) con->dir, &con->disable_ras_err_cnt_harvest); } -void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, +static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, struct ras_fs_if *head) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1194,7 +1189,6 @@ void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) { -#if defined(CONFIG_DEBUG_FS) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj; struct ras_fs_if fs_info; @@ -1203,7 +1197,7 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) * it won't be called in resume path, no need to check * suspend and gpu reset status */ - if (!con) + if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con) return; amdgpu_ras_debugfs_create_ctrl_node(adev); @@ -1217,10 +1211,9 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) amdgpu_ras_debugfs_create(adev, &fs_info); } } -#endif } -void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev, +static void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev, struct ras_common_if *head) { struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); @@ -1234,7 +1227,6 @@ void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev, static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev) { -#if defined(CONFIG_DEBUG_FS) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj, *tmp; @@ -1243,7 +1235,6 @@ static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev) } con->dir = NULL; -#endif } /* debugfs end */ @@ -1291,7 +1282,8 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev) static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) { - amdgpu_ras_debugfs_remove_all(adev); + if (IS_ENABLED(CONFIG_DEBUG_FS)) + amdgpu_ras_debugfs_remove_all(adev); amdgpu_ras_sysfs_remove_all(adev); return 0; } @@ -1477,8 +1469,8 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) } /* Parse RdRspStatus and WrRspStatus */ -void amdgpu_ras_error_status_query(struct amdgpu_device *adev, - struct ras_query_if *info) +static void amdgpu_ras_error_status_query(struct amdgpu_device *adev, + struct ras_query_if *info) { /* * Only two block need to query read/write @@ -1551,10 +1543,12 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, .size = AMDGPU_GPU_PAGE_SIZE, .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED, }; - - if (data->last_reserved <= i) + ret = amdgpu_vram_mgr_query_page_status( + ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM), + data->bps[i].retired_page); + if (ret == -EBUSY) (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING; - else if (data->bps_bo[i] == NULL) + else if (ret == -ENOENT) (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT; } @@ -1606,12 +1600,9 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, unsigned int new_space = old_space + pages; unsigned int align_space = ALIGN(new_space, 512); void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); - struct amdgpu_bo **bps_bo = - kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL); - if (!bps || !bps_bo) { + if (!bps) { kfree(bps); - kfree(bps_bo); return -ENOMEM; } @@ -1620,14 +1611,8 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, data->count * sizeof(*data->bps)); kfree(data->bps); } - if (data->bps_bo) { - memcpy(bps_bo, data->bps_bo, - data->count * sizeof(*data->bps_bo)); - kfree(data->bps_bo); - } data->bps = bps; - data->bps_bo = bps_bo; data->space_left += align_space - old_space; return 0; } @@ -1639,6 +1624,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data; int ret = 0; + uint32_t i; if (!con || !con->eh_data || !bps || pages <= 0) return 0; @@ -1648,16 +1634,26 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, if (!data) goto out; - if (data->space_left <= pages) - if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) { + for (i = 0; i < pages; i++) { + if (amdgpu_ras_check_bad_page_unlock(con, + bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) + continue; + + if (!data->space_left && + amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { ret = -ENOMEM; goto out; } - memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps)); - data->count += pages; - data->space_left -= pages; + amdgpu_vram_mgr_reserve_range( + ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM), + bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT, + AMDGPU_GPU_PAGE_SIZE); + memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps)); + data->count++; + data->space_left--; + } out: mutex_unlock(&con->recovery_lock); @@ -1668,7 +1664,7 @@ out: * write error record array to eeprom, the function should be * protected by recovery_lock */ -static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) +int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data; @@ -1730,6 +1726,20 @@ out: return ret; } +static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, + uint64_t addr) +{ + struct ras_err_handler_data *data = con->eh_data; + int i; + + addr >>= AMDGPU_GPU_PAGE_SHIFT; + for (i = 0; i < data->count; i++) + if (addr == data->bps[i].retired_page) + return true; + + return false; +} + /* * check if an address belongs to bad page * @@ -1739,26 +1749,13 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, uint64_t addr) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - struct ras_err_handler_data *data; - int i; bool ret = false; if (!con || !con->eh_data) return ret; mutex_lock(&con->recovery_lock); - data = con->eh_data; - if (!data) - goto out; - - addr >>= AMDGPU_GPU_PAGE_SHIFT; - for (i = 0; i < data->count; i++) - if (addr == data->bps[i].retired_page) { - ret = true; - goto out; - } - -out: + ret = amdgpu_ras_check_bad_page_unlock(con, addr); mutex_unlock(&con->recovery_lock); return ret; } @@ -1804,80 +1801,6 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, } } -/* called in gpu recovery/init */ -int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) -{ - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - struct ras_err_handler_data *data; - uint64_t bp; - struct amdgpu_bo *bo = NULL; - int i, ret = 0; - - /* Not reserve bad page when amdgpu_bad_page_threshold == 0. */ - if (!con || !con->eh_data || (amdgpu_bad_page_threshold == 0)) - return 0; - - mutex_lock(&con->recovery_lock); - data = con->eh_data; - if (!data) - goto out; - /* reserve vram at driver post stage. */ - for (i = data->last_reserved; i < data->count; i++) { - bp = data->bps[i].retired_page; - - /* There are two cases of reserve error should be ignored: - * 1) a ras bad page has been allocated (used by someone); - * 2) a ras bad page has been reserved (duplicate error injection - * for one page); - */ - if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT, - AMDGPU_GPU_PAGE_SIZE, - AMDGPU_GEM_DOMAIN_VRAM, - &bo, NULL)) - dev_warn(adev->dev, "RAS WARN: reserve vram for " - "retired page %llx fail\n", bp); - - data->bps_bo[i] = bo; - data->last_reserved = i + 1; - bo = NULL; - } - - /* continue to save bad pages to eeprom even reesrve_vram fails */ - ret = amdgpu_ras_save_bad_pages(adev); -out: - mutex_unlock(&con->recovery_lock); - return ret; -} - -/* called when driver unload */ -static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev) -{ - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - struct ras_err_handler_data *data; - struct amdgpu_bo *bo; - int i; - - if (!con || !con->eh_data) - return 0; - - mutex_lock(&con->recovery_lock); - data = con->eh_data; - if (!data) - goto out; - - for (i = data->last_reserved - 1; i >= 0; i--) { - bo = data->bps_bo[i]; - - amdgpu_bo_free_kernel(&bo, NULL, NULL); - - data->bps_bo[i] = bo; - data->last_reserved = i; - } -out: - mutex_unlock(&con->recovery_lock); - return 0; -} - int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1917,18 +1840,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) ret = amdgpu_ras_load_bad_pages(adev); if (ret) goto free; - ret = amdgpu_ras_reserve_bad_pages(adev); - if (ret) - goto release; } return 0; -release: - amdgpu_ras_release_bad_pages(adev); free: kfree((*data)->bps); - kfree((*data)->bps_bo); kfree(*data); con->eh_data = NULL; out: @@ -1956,12 +1873,10 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) return 0; cancel_work_sync(&con->recovery_work); - amdgpu_ras_release_bad_pages(adev); mutex_lock(&con->recovery_lock); con->eh_data = NULL; kfree(data->bps); - kfree(data->bps_bo); kfree(data); mutex_unlock(&con->recovery_lock); @@ -2156,7 +2071,7 @@ void amdgpu_ras_late_fini(struct amdgpu_device *adev, amdgpu_ras_sysfs_remove(adev, ras_block); if (ih_info->cb) - amdgpu_ras_interrupt_remove_handler(adev, ih_info); + amdgpu_ras_interrupt_remove_handler(adev, ih_info); amdgpu_ras_feature_enable(adev, ras_block, 0); } |