Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--	drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c	485
1 file changed, 336 insertions(+), 149 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 1fb2a91ad30a..b0d2fc9454ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -99,6 +99,49 @@ static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
 	return false;
 }
 
+static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
+{
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	struct eeprom_table_record err_rec;
+
+	if ((address >= adev->gmc.mc_vram_size) ||
+	    (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
+		dev_warn(adev->dev,
+		         "RAS WARN: input address 0x%llx is invalid.\n",
+		         address);
+		return -EINVAL;
+	}
+
+	if (amdgpu_ras_check_bad_page(adev, address)) {
+		dev_warn(adev->dev,
+			 "RAS WARN: 0x%llx has already been marked as bad page!\n",
+			 address);
+		return 0;
+	}
+
+	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
+
+	err_rec.address = address;
+	err_rec.retired_page = address >> AMDGPU_GPU_PAGE_SHIFT;
+	err_rec.ts = (uint64_t)ktime_get_real_seconds();
+	err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
+
+	err_data.err_addr = &err_rec;
+	err_data.err_addr_cnt = 1;
+
+	if (amdgpu_bad_page_threshold != 0) {
+		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+					 err_data.err_addr_cnt);
+		amdgpu_ras_save_bad_pages(adev);
+	}
+
+	dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
+	dev_warn(adev->dev, "Clear EEPROM:\n");
+	dev_warn(adev->dev, "    echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");
+
+	return 0;
+}
+
 static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
 					size_t size, loff_t *pos)
 {
@@ -109,7 +152,7 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
 	ssize_t s;
 	char val[128];
 
-	if (amdgpu_ras_error_query(obj->adev, &info))
+	if (amdgpu_ras_query_error_status(obj->adev, &info))
 		return -EINVAL;
 
 	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
@@ -178,11 +221,24 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 		op = 1;
 	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
 		op = 2;
+	else if (strstr(str, "retire_page") != NULL)
+		op = 3;
 	else if (str[0] && str[1] && str[2] && str[3])
 		/* ascii string, but commands are not matched. */
 		return -EINVAL;
 
 	if (op != -1) {
+		if (op == 3) {
+			if (sscanf(str, "%*s 0x%llx", &address) != 1 &&
+			    sscanf(str, "%*s %llu", &address) != 1)
+				return -EINVAL;
+
+			data->op = op;
+			data->inject.address = address;
+
+			return 0;
+		}
+
 		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
 			return -EINVAL;
 
@@ -198,11 +254,11 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 		data->op = op;
 
 		if (op == 2) {
-			if (sscanf(str, "%*s %*s %*s %u %llu %llu",
-						&sub_block, &address, &value) != 3)
-				if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
-							&sub_block, &address, &value) != 3)
-					return -EINVAL;
+			if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
+				   &sub_block, &address, &value) != 3 &&
+			    sscanf(str, "%*s %*s %*s %u %llu %llu",
+				   &sub_block, &address, &value) != 3)
+				return -EINVAL;
 			data->head.sub_block_index = sub_block;
 			data->inject.address = address;
 			data->inject.value = value;
@@ -221,7 +277,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 /**
  * DOC: AMDGPU RAS debugfs control interface
  *
- * It accepts struct ras_debug_if who has two members.
+ * The control interface accepts struct ras_debug_if which has two members.
  *
  * First member: ras_debug_if::head or ras_debug_if::inject.
  *
@@ -246,32 +302,33 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
  *
  * How to use the interface?
  *
- * Programs
+ * In a program
  *
- * Copy the struct ras_debug_if in your codes and initialize it.
- * Write the struct to the control node.
+ * Copy the struct ras_debug_if in your code and initialize it.
+ * Write the struct to the control interface.
  *
- * Shells
+ * From shell
  *
  * .. code-block:: bash
  *
- *	echo op block [error [sub_block address value]] > .../ras/ras_ctrl
+ *	echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
+ *	echo "enable  <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
+ *	echo "inject  <block> <error> <sub-block> <address> <value>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
 *
- * Parameters:
+ * Where N is the card you want to affect.
 *
- * op: disable, enable, inject
- *	disable: only block is needed
- *	enable: block and error are needed
- *	inject: error, address, value are needed
- * block: umc, sdma, gfx, .........
+ * "disable" requires only the block.
+ * "enable" requires the block and error type.
+ * "inject" requires the block, error type, address, and value.
+ * The block is one of: umc, sdma, gfx, etc.
  *	see ras_block_string[] for details
- * error: ue, ce
- *	ue: multi_uncorrectable
- *	ce: single_correctable
- * sub_block:
- *	sub block index, pass 0 if there is no sub block
+ * The error type is one of: ue, ce, where,
+ *	ue is multi-uncorrectable
+ *	ce is single-correctable
+ * The sub-block is the sub-block index, pass 0 if there is no sub-block.
+ * The address and value are hexadecimal numbers; the leading 0x is optional.
 *
- * here are some examples for bash commands:
+ * For instance,
 *
 * .. code-block:: bash
 *
@@ -279,17 +336,17 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 *	echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
- * How to check the result?
+ * How to check the result of the operation?
 *
- * For disable/enable, please check ras features at
+ * To check disable/enable, see "ras" features at,
 * /sys/class/drm/card[0/1/2...]/device/ras/features
 *
- * For inject, please check corresponding err count at
- * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
+ * To check inject, see the corresponding error count at,
+ * /sys/class/drm/card[0/1/2...]/device/ras/[gfx|sdma|umc|...]_err_count
 *
 * .. note::
 *	Operations are only allowed on blocks which are supported.
- *	Please check ras mask at /sys/module/amdgpu/parameters/ras_mask
+ *	Check the "ras" mask at /sys/module/amdgpu/parameters/ras_mask
 *	to see which blocks support RAS on a particular asic.
 *
 */
@@ -310,6 +367,14 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
 	if (ret)
 		return -EINVAL;
 
+	if (data.op == 3) {
+		ret = amdgpu_reserve_page_direct(adev, data.inject.address);
+		if (!ret)
+			return size;
+		else
+			return ret;
+	}
+
 	if (!amdgpu_ras_is_supported(adev, data.head.block))
 		return -EINVAL;
 
@@ -431,15 +496,19 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
 	};
 
 	if (!amdgpu_ras_get_error_query_ready(obj->adev))
-		return snprintf(buf, PAGE_SIZE,
-				"Query currently inaccessible\n");
+		return sysfs_emit(buf, "Query currently inaccessible\n");
 
-	if (amdgpu_ras_error_query(obj->adev, &info))
+	if (amdgpu_ras_query_error_status(obj->adev, &info))
 		return -EINVAL;
 
-	return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
-			"ue", info.ue_count,
-			"ce", info.ce_count);
+
+	if (obj->adev->asic_type == CHIP_ALDEBARAN) {
+		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
+			DRM_WARN("Failed to reset error counter and error status");
+	}
+
+	return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
+			  "ce", info.ce_count);
 }
 
 /* obj begin */
@@ -449,11 +518,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
 
 static inline void put_obj(struct ras_manager *obj)
 {
-	if (obj && --obj->use == 0)
+	if (obj && (--obj->use == 0))
 		list_del(&obj->node);
-	if (obj && obj->use < 0) {
-		 DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name);
-	}
+	if (obj && (obj->use < 0))
+		DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name);
 }
 
 /* make one obj and return it. */
@@ -463,7 +531,7 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_manager *obj;
 
-	if (!con)
+	if (!adev->ras_features || !con)
 		return NULL;
 
 	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
@@ -490,7 +558,7 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
 	struct ras_manager *obj;
 	int i;
 
-	if (!con)
+	if (!adev->ras_features || !con)
 		return NULL;
 
 	if (head) {
@@ -590,7 +658,11 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 		con->features |= BIT(head->block);
 	} else {
 		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
-			con->features &= ~BIT(head->block);
+			/* skip cleaning the gfx ras context feature for VEGA20 Gaming;
+			 * it will be cleaned up later
+			 */
+			if (!(!adev->ras_features && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)))
+				con->features &= ~BIT(head->block);
 			put_obj(obj);
 		}
 	}
@@ -693,6 +765,10 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
 			if (ret)
 				return ret;
 
+			/* gfx block ras disable cmd must be sent to ras-ta */
+			if (head->block == AMDGPU_RAS_BLOCK__GFX)
+				con->features |= BIT(head->block);
+
 			ret = amdgpu_ras_feature_enable(adev, head, 0);
 		}
 	} else
@@ -757,8 +833,8 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
 /* feature ctl end */
 
 /* query/inject/cure begin */
-int amdgpu_ras_error_query(struct amdgpu_device *adev,
-		struct ras_query_if *info)
+int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
+	struct ras_query_if *info)
 {
 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
 	struct ras_err_data err_data = {0, 0, 0, NULL};
@@ -769,13 +845,15 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
 
 	switch (info->head.block) {
 	case AMDGPU_RAS_BLOCK__UMC:
-		if (adev->umc.funcs->query_ras_error_count)
-			adev->umc.funcs->query_ras_error_count(adev, &err_data);
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->query_ras_error_count)
+			adev->umc.ras_funcs->query_ras_error_count(adev, &err_data);
 		/* umc query_ras_error_address is also responsible for clearing
 		 * error status
 		 */
-		if (adev->umc.funcs->query_ras_error_address)
-			adev->umc.funcs->query_ras_error_address(adev, &err_data);
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->query_ras_error_address)
+			adev->umc.ras_funcs->query_ras_error_address(adev, &err_data);
 		break;
 	case AMDGPU_RAS_BLOCK__SDMA:
 		if (adev->sdma.funcs->query_ras_error_count) {
@@ -785,19 +863,32 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
 		}
 		break;
 	case AMDGPU_RAS_BLOCK__GFX:
-		if (adev->gfx.funcs->query_ras_error_count)
-			adev->gfx.funcs->query_ras_error_count(adev, &err_data);
+		if (adev->gfx.ras_funcs &&
+		    adev->gfx.ras_funcs->query_ras_error_count)
+			adev->gfx.ras_funcs->query_ras_error_count(adev, &err_data);
+
+		if (adev->gfx.ras_funcs &&
+		    adev->gfx.ras_funcs->query_ras_error_status)
+			adev->gfx.ras_funcs->query_ras_error_status(adev);
 		break;
 	case AMDGPU_RAS_BLOCK__MMHUB:
-		if (adev->mmhub.funcs->query_ras_error_count)
-			adev->mmhub.funcs->query_ras_error_count(adev, &err_data);
+		if (adev->mmhub.ras_funcs &&
+		    adev->mmhub.ras_funcs->query_ras_error_count)
+			adev->mmhub.ras_funcs->query_ras_error_count(adev, &err_data);
+
+		if (adev->mmhub.ras_funcs &&
+		    adev->mmhub.ras_funcs->query_ras_error_status)
+			adev->mmhub.ras_funcs->query_ras_error_status(adev);
 		break;
 	case AMDGPU_RAS_BLOCK__PCIE_BIF:
-		if (adev->nbio.funcs->query_ras_error_count)
-			adev->nbio.funcs->query_ras_error_count(adev, &err_data);
+		if (adev->nbio.ras_funcs &&
+		    adev->nbio.ras_funcs->query_ras_error_count)
+			adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data);
 		break;
 	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
-		amdgpu_xgmi_query_ras_error_count(adev, &err_data);
+		if (adev->gmc.xgmi.ras_funcs &&
+		    adev->gmc.xgmi.ras_funcs->query_ras_error_count)
+			adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, &err_data);
 		break;
 	default:
 		break;
@@ -826,6 +917,38 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
 	return 0;
 }
 
+int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
+		enum amdgpu_ras_block block)
+{
+	if (!amdgpu_ras_is_supported(adev, block))
+		return -EINVAL;
+
+	switch (block) {
+	case AMDGPU_RAS_BLOCK__GFX:
+		if (adev->gfx.ras_funcs &&
+		    adev->gfx.ras_funcs->reset_ras_error_count)
+			adev->gfx.ras_funcs->reset_ras_error_count(adev);
+
+		if (adev->gfx.ras_funcs &&
+		    adev->gfx.ras_funcs->reset_ras_error_status)
+			adev->gfx.ras_funcs->reset_ras_error_status(adev);
+		break;
+	case AMDGPU_RAS_BLOCK__MMHUB:
+		if (adev->mmhub.ras_funcs &&
+		    adev->mmhub.ras_funcs->reset_ras_error_count)
+			adev->mmhub.ras_funcs->reset_ras_error_count(adev);
+		break;
+	case AMDGPU_RAS_BLOCK__SDMA:
+		if (adev->sdma.funcs->reset_ras_error_count)
+			adev->sdma.funcs->reset_ras_error_count(adev);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
 /* Trigger XGMI/WAFL error */
 static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
 				 struct ta_ras_trigger_error_input *block_info)
@@ -878,12 +1001,14 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 
 	switch (info->head.block) {
 	case AMDGPU_RAS_BLOCK__GFX:
-		if (adev->gfx.funcs->ras_error_inject)
-			ret = adev->gfx.funcs->ras_error_inject(adev, info);
+		if (adev->gfx.ras_funcs &&
+		    adev->gfx.ras_funcs->ras_error_inject)
+			ret = adev->gfx.ras_funcs->ras_error_inject(adev, info);
 		else
 			ret = -EINVAL;
 		break;
 	case AMDGPU_RAS_BLOCK__UMC:
+	case AMDGPU_RAS_BLOCK__SDMA:
 	case AMDGPU_RAS_BLOCK__MMHUB:
 	case AMDGPU_RAS_BLOCK__PCIE_BIF:
 		ret = psp_ras_trigger_error(&adev->psp, &block_info);
@@ -913,7 +1038,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 	struct ras_manager *obj;
 	struct ras_err_data data = {0, 0};
 
-	if (!con)
+	if (!adev->ras_features || !con)
 		return 0;
 
 	list_for_each_entry(obj, &con->head, node) {
@@ -921,7 +1046,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 			.head = obj->head,
 		};
 
-		if (amdgpu_ras_error_query(adev, &info))
+		if (amdgpu_ras_query_error_status(adev, &info))
 			return 0;
 
 		data.ce_count += info.ce_count;
@@ -1137,16 +1262,19 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
 *
 */
 /* debugfs begin */
-static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
+static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct dentry *dir;
 	struct drm_minor *minor = adev_to_drm(adev)->primary;
 
-	con->dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
-	debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
-				adev, &amdgpu_ras_debugfs_ctrl_ops);
-	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
-				adev, &amdgpu_ras_debugfs_eeprom_ops);
+	dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
+	debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev,
+			    &amdgpu_ras_debugfs_ctrl_ops);
+	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev,
+			    &amdgpu_ras_debugfs_eeprom_ops);
+	debugfs_create_u32("bad_page_cnt_threshold", 0444, dir,
+			   &con->bad_page_cnt_threshold);
 
 	/*
 	 * After one uncorrectable error happens, usually GPU recovery will
@@ -1156,24 +1284,24 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
 	 * ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery routine
 	 * will never be called.
 	 */
-	debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
-				&con->reboot);
+	debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot);
 
 	/*
 	 * User could set this not to clean up hardware's error count register
 	 * of RAS IPs during ras recovery.
 	 */
-	debugfs_create_bool("disable_ras_err_cnt_harvest", 0644,
-			con->dir, &con->disable_ras_err_cnt_harvest);
+	debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir,
+			    &con->disable_ras_err_cnt_harvest);
+	return dir;
 }
 
 static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
-		struct ras_fs_if *head)
+				      struct ras_fs_if *head,
+				      struct dentry *dir)
 {
-	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
 
-	if (!obj || obj->ent)
+	if (!obj || !dir)
 		return;
 
 	get_obj(obj);
@@ -1182,14 +1310,14 @@ static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
 			head->debugfs_name,
 			sizeof(obj->fs_data.debugfs_name));
 
-	obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
-				       S_IWUGO | S_IRUGO, con->dir, obj,
-				       &amdgpu_ras_debugfs_ops);
+	debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir,
+			    obj, &amdgpu_ras_debugfs_ops);
 }
 
 void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct dentry *dir;
 	struct ras_manager *obj;
 	struct ras_fs_if fs_info;
 
@@ -1200,7 +1328,7 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
 	if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con)
 		return;
 
-	amdgpu_ras_debugfs_create_ctrl_node(adev);
+	dir = amdgpu_ras_debugfs_create_ctrl_node(adev);
 
 	list_for_each_entry(obj, &con->head, node) {
 		if (amdgpu_ras_is_supported(adev, obj->head.block) &&
@@ -1208,34 +1336,11 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
 			sprintf(fs_info.debugfs_name, "%s_err_inject",
 					ras_block_str(obj->head.block));
 			fs_info.head = obj->head;
-			amdgpu_ras_debugfs_create(adev, &fs_info);
+			amdgpu_ras_debugfs_create(adev, &fs_info, dir);
 		}
 	}
 }
 
-static void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
-		struct ras_common_if *head)
-{
-	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
-
-	if (!obj || !obj->ent)
-		return;
-
-	obj->ent = NULL;
-	put_obj(obj);
-}
-
-static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
-{
-	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	struct ras_manager *obj, *tmp;
-
-	list_for_each_entry_safe(obj, tmp, &con->head, node) {
-		amdgpu_ras_debugfs_remove(adev, &obj->head);
-	}
-
-	con->dir = NULL;
-}
 /* debugfs end */
 
 /* ras fs */
@@ -1282,8 +1387,17 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
 
 static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
 {
-	if (IS_ENABLED(CONFIG_DEBUG_FS))
-		amdgpu_ras_debugfs_remove_all(adev);
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_manager *con_obj, *ip_obj, *tmp;
+
+	if (IS_ENABLED(CONFIG_DEBUG_FS)) {
+		list_for_each_entry_safe(con_obj, tmp, &con->head, node) {
+			ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head);
+			if (ip_obj)
+				put_obj(ip_obj);
+		}
+	}
+
 	amdgpu_ras_sysfs_remove_all(adev);
 	return 0;
 }
@@ -1447,7 +1561,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_manager *obj;
 
-	if (!con)
+	if (!adev->ras_features || !con)
 		return;
 
 	list_for_each_entry(obj, &con->head, node) {
@@ -1464,7 +1578,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
 		if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
 			continue;
 
-		amdgpu_ras_error_query(adev, &info);
+		amdgpu_ras_query_error_status(adev, &info);
 	}
 }
 
@@ -1478,12 +1592,14 @@ static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
 	 */
 	switch (info->head.block) {
 	case AMDGPU_RAS_BLOCK__GFX:
-		if (adev->gfx.funcs->query_ras_error_status)
-			adev->gfx.funcs->query_ras_error_status(adev);
+		if (adev->gfx.ras_funcs &&
+		    adev->gfx.ras_funcs->query_ras_error_status)
+			adev->gfx.ras_funcs->query_ras_error_status(adev);
 		break;
 	case AMDGPU_RAS_BLOCK__MMHUB:
-		if (adev->mmhub.funcs->query_ras_error_status)
-			adev->mmhub.funcs->query_ras_error_status(adev);
+		if (adev->mmhub.ras_funcs &&
+		    adev->mmhub.ras_funcs->query_ras_error_status)
+			adev->mmhub.ras_funcs->query_ras_error_status(adev);
 		break;
 	default:
 		break;
@@ -1495,7 +1611,7 @@ static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_manager *obj;
 
-	if (!con)
+	if (!adev->ras_features || !con)
 		return;
 
 	list_for_each_entry(obj, &con->head, node) {
@@ -1809,7 +1925,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	bool exc_err_limit = false;
 	int ret;
 
-	if (con)
+	if (adev->ras_features && con)
 		data = &con->eh_data;
 	else
 		return 0;
@@ -1828,6 +1944,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
 	amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
 
+	/* Todo: during tests the SMU might fail to read the EEPROM through I2C
+	 * when the GPU is pending an XGMI reset at probe time
+	 * (mostly after the second bus reset); skip it for now
+	 */
+	if (adev->gmc.xgmi.pending_reset)
+		return 0;
 	ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
 	/*
 	 * This calling fails when exc_err_limit is true or
@@ -1897,15 +2019,13 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
 	return 0;
 }
 
-static int amdgpu_ras_check_asic_type(struct amdgpu_device *adev)
+static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
 {
-	if (adev->asic_type != CHIP_VEGA10 &&
-		adev->asic_type != CHIP_VEGA20 &&
-		adev->asic_type != CHIP_ARCTURUS &&
-		adev->asic_type != CHIP_SIENNA_CICHLID)
-		return 1;
-	else
-		return 0;
+	return adev->asic_type == CHIP_VEGA10 ||
+		adev->asic_type == CHIP_VEGA20 ||
+		adev->asic_type == CHIP_ARCTURUS ||
+		adev->asic_type == CHIP_ALDEBARAN ||
+		adev->asic_type == CHIP_SIENNA_CICHLID;
 }
 
 /*
@@ -1924,22 +2044,32 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
 	*supported = 0;
 
 	if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
-		amdgpu_ras_check_asic_type(adev))
+	    !amdgpu_ras_asic_supported(adev))
 		return;
 
-	if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
-		dev_info(adev->dev, "HBM ECC is active.\n");
-		*hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC |
-				1 << AMDGPU_RAS_BLOCK__DF);
-	} else
-		dev_info(adev->dev, "HBM ECC is not presented.\n");
+	if (!adev->gmc.xgmi.connected_to_cpu) {
+		if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
+			dev_info(adev->dev, "MEM ECC is active.\n");
+			*hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC |
+					1 << AMDGPU_RAS_BLOCK__DF);
+		} else {
+			dev_info(adev->dev, "MEM ECC is not present.\n");
+		}
 
-	if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
-		dev_info(adev->dev, "SRAM ECC is active.\n");
-		*hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
-				1 << AMDGPU_RAS_BLOCK__DF);
-	} else
-		dev_info(adev->dev, "SRAM ECC is not presented.\n");
+		if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
+			dev_info(adev->dev, "SRAM ECC is active.\n");
+			*hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
+					1 << AMDGPU_RAS_BLOCK__DF);
+		} else {
+			dev_info(adev->dev, "SRAM ECC is not present.\n");
+		}
+	} else {
+		/* driver only manages a few IP blocks' RAS features
+		 * when the GPU is connected to the CPU through XGMI */
+		*hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX |
+				1 << AMDGPU_RAS_BLOCK__SDMA |
+				1 << AMDGPU_RAS_BLOCK__MMHUB);
+	}
 
 	/* hw_supported needs to be aligned with RAS block mask. */
 	*hw_supported &= AMDGPU_RAS_BLOCK_MASK;
@@ -1970,6 +2100,15 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	amdgpu_ras_check_supported(adev, &con->hw_supported,
 			&con->supported);
 	if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) {
+		/* set gfx block ras context feature for VEGA20 Gaming;
+		 * send ras disable cmd to ras-ta during ras late init.
+		 */
+		if (!adev->ras_features && adev->asic_type == CHIP_VEGA20) {
+			con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);
+
+			return 0;
+		}
+
 		r = 0;
 		goto release_con;
 	}
@@ -1979,14 +2118,31 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	/* Might need get this flag from vbios. */
 	con->flags = RAS_DEFAULT_FLAGS;
 
-	if (adev->nbio.funcs->init_ras_controller_interrupt) {
-		r = adev->nbio.funcs->init_ras_controller_interrupt(adev);
+	/* initialize nbio ras function ahead of any other
+	 * ras functions so hardware fatal error interrupt
+	 * can be enabled as early as possible */
+	switch (adev->asic_type) {
+	case CHIP_VEGA20:
+	case CHIP_ARCTURUS:
+	case CHIP_ALDEBARAN:
+		if (!adev->gmc.xgmi.connected_to_cpu)
+			adev->nbio.ras_funcs = &nbio_v7_4_ras_funcs;
+		break;
+	default:
+		/* nbio ras is not available */
+		break;
+	}
+
+	if (adev->nbio.ras_funcs &&
+	    adev->nbio.ras_funcs->init_ras_controller_interrupt) {
+		r = adev->nbio.ras_funcs->init_ras_controller_interrupt(adev);
 		if (r)
 			goto release_con;
 	}
 
-	if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) {
-		r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev);
+	if (adev->nbio.ras_funcs &&
+	    adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) {
+		r = adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt(adev);
 		if (r)
 			goto release_con;
 	}
@@ -2007,6 +2163,32 @@ release_con:
 	return r;
 }
 
+static int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
+{
+	if (adev->gmc.xgmi.connected_to_cpu)
+		return 1;
+	return 0;
+}
+
+static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
+					struct ras_common_if *ras_block)
+{
+	struct ras_query_if info = {
+		.head = *ras_block,
+	};
+
+	if (!amdgpu_persistent_edc_harvesting_supported(adev))
+		return 0;
+
+	if (amdgpu_ras_query_error_status(adev, &info) != 0)
+		DRM_WARN("RAS init harvest failure");
+
+	if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
+		DRM_WARN("RAS init harvest reset failure");
+
+	return 0;
+}
+
 /* helper function to handle common stuff in ip late init phase */
 int amdgpu_ras_late_init(struct amdgpu_device *adev,
 			 struct ras_common_if *ras_block,
@@ -2036,6 +2218,9 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
 			return r;
 	}
 
+	/* check for errors on warm reset on EDC persistent supported ASICs */
+	amdgpu_persistent_edc_harvesting(adev, ras_block);
+
 	/* in resume phase, no need to create ras fs node */
 	if (adev->in_suspend || amdgpu_in_reset(adev))
 		return 0;
@@ -2083,8 +2268,12 @@ void amdgpu_ras_resume(struct amdgpu_device *adev)
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_manager *obj, *tmp;
 
-	if (!con)
+	if (!adev->ras_features || !con) {
+		/* clean the ras context for VEGA20 Gaming after sending the ras disable cmd */
+		amdgpu_release_ras_context(adev);
+
 		return;
+	}
 
 	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
 		/* Set up all other IPs which are not implemented. There is a
@@ -2125,7 +2314,7 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
-	if (!con)
+	if (!adev->ras_features || !con)
 		return;
 
 	amdgpu_ras_disable_all_features(adev, 0);
@@ -2139,7 +2328,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
-	if (!con)
+	if (!adev->ras_features || !con)
 		return 0;
 
 	/* Need disable ras on all IPs here before ip [hw/sw]fini */
@@ -2152,7 +2341,7 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
-	if (!con)
+	if (!adev->ras_features || !con)
 		return 0;
 
 	amdgpu_ras_fs_fini(adev);
@@ -2196,18 +2385,16 @@ bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
 	return false;
 }
 
-bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev)
+void amdgpu_release_ras_context(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	bool exc_err_limit = false;
 
-	if (con && (amdgpu_bad_page_threshold != 0))
-		amdgpu_ras_eeprom_check_err_threshold(&con->eeprom_control,
-						&exc_err_limit);
+	if (!con)
+		return;
 
-	/*
-	 * We are only interested in variable exc_err_limit,
-	 * as it says if GPU is in bad state or not.
-	 */
-	return exc_err_limit;
+	if (!adev->ras_features && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) {
+		con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX);
+		amdgpu_ras_set_context(adev, NULL);
+		kfree(con);
+	}
+}
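
Usage note (not part of the patch): the new "retire_page" operation parsed above (op == 3) takes a single address, hex or decimal, and the new "bad_page_cnt_threshold" entry is read-only. A minimal sketch, assuming card 0 and a made-up address value:

.. code-block:: bash

	# Retire one VRAM page by address; 0x7ffd000 is only an example value.
	echo "retire_page 0x7ffd000" > /sys/kernel/debug/dri/0/ras/ras_ctrl

	# Read back the threshold exposed by the new read-only debugfs entry.
	cat /sys/kernel/debug/dri/0/ras/bad_page_cnt_threshold

	# As the new code itself warns, this is test-only and corrupts the
	# RAS EEPROM; clear it afterwards:
	echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset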
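The recurring pattern in this change, replacing each IP block's "funcs" table with an optional "ras_funcs" table, means every call site must guard both the table pointer and the individual hook. A minimal standalone C sketch of that dispatch pattern, using illustrative stand-in types (none of these names are from the driver):

.. code-block:: c

	#include <stdio.h>

	struct err_data { unsigned long ue_count, ce_count; };

	struct ras_funcs {
		void (*query_ras_error_count)(struct err_data *data); /* may be NULL */
		void (*reset_ras_error_count)(void);                  /* may be NULL */
	};

	struct ip_block {
		const struct ras_funcs *ras_funcs; /* NULL when the IP has no RAS support */
	};

	static void gfx_query(struct err_data *data) { data->ce_count += 1; }

	static const struct ras_funcs gfx_ras_funcs = {
		.query_ras_error_count = gfx_query,
		/* .reset_ras_error_count intentionally left NULL */
	};

	int main(void)
	{
		struct ip_block gfx = { .ras_funcs = &gfx_ras_funcs };
		struct err_data data = { 0, 0 };

		/* Both checks are required: the table may be absent entirely,
		 * and a present table may omit individual hooks. */
		if (gfx.ras_funcs && gfx.ras_funcs->query_ras_error_count)
			gfx.ras_funcs->query_ras_error_count(&data);

		if (gfx.ras_funcs && gfx.ras_funcs->reset_ras_error_count)
			gfx.ras_funcs->reset_ras_error_count(); /* skipped: hook is NULL */

		printf("ce=%lu ue=%lu\n", data.ce_count, data.ue_count);
		return 0;
	}

The optional table is what lets amdgpu_ras_init() install nbio_v7_4_ras_funcs only on the ASICs that have nbio RAS, while every consumer degrades gracefully elsewhere.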
