summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
authorDave Airlie <airlied@redhat.com>2025-02-26 09:41:44 +0300
committerDave Airlie <airlied@redhat.com>2025-02-26 09:41:44 +0300
commit425b8481750abce45fa4aeecf6c32152cadbfa15 (patch)
treed97104ee978be5c85923cdf3071f54f63b04b805 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parentfb51bf02551958265b7116f6ba92752295c83c26 (diff)
parent3521276ad14fe47ce1c4382749f3c95762629375 (diff)
downloadlinux-425b8481750abce45fa4aeecf6c32152cadbfa15.tar.xz
Merge tag 'amd-drm-next-6.15-2025-02-21' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.15-2025-02-21: amdgpu: - Add OEM i2c support for RGB lights, etc. - Add support for GC 11.5.3 - Add support for GC 11.5.2 - Add support for SDMA 6.1.3 - Add support for NBIO 7.11.2 - Add support for NBIO 7.9.1 - Add support for MMHUB 3.3.2 - Add support for MMHUB 1.8.1 - Add support for SMU 14.0.5 - Add support for SMUIO 13.0.11 - Add support for PSP 14.0.5 - Add support for UMC 12.5.0 - Add support for DCN 3.6.0 - JPEG 4.0.3 updates - Add dynamic workload profile switching for GC 10-12 - support larger vbios sizes - GC 9.5.0 updates - SMU 13.0.12 updates - SMU 13.0.6 updates - IP discovery updates - GC 10 queue reset updates - DCN 4.0.1 updates - UHBR link rate fixes - Aborted suspend fix - Mark gttsize parameter as deprecated - GC 10 cleaner shader updates - PSR-SU fixes - Clean up PM4 headers - Cursor fixes - Enable devcoredump for JPEG - Misc cleanups - Runpm cleanups - MES updates - GC 9 gfxoff fixes - Vbios fetching cleanups - Documentation updates - Update secondary plane handling - DML2 updates - SDMA fixes for MI - Cleaner shader fixes for GC 11/12 - ACA updates - Initial JPEG queue reset support - RAS updates - Initial RAS CPER support - DCN/DCE panic screen handling cleanup - BT2020 fixes - SR-IOV fixes amdkfd: - synchronize pasid values between KGD and KFD - Misc cleanups - Improve GTT/VRAM handling for APUs - Topology updates - Fix user queue validation on GC 7/8 UAPI: - Enable "Broadcast RGB" drm property - Add INFO IOCTL query for virtualization mode Proposed userspace: https://github.com/ROCm/amdsmi/commit/e663bed7d6b3df79f5959e73981749b1f22ec698 From: Alex Deucher <alexander.deucher@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20250221213651.4176031-1-alexander.deucher@amd.com Signed-off-by: Dave Airlie <airlied@redhat.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c48
1 files changed, 26 insertions, 22 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index f0924aa3f4e4..3c3312bbfee8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1864,6 +1864,9 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
if (!obj || obj->attr_inuse)
return -EINVAL;
+ if (amdgpu_sriov_vf(adev) && !amdgpu_virt_ras_telemetry_block_en(adev, head->block))
+ return 0;
+
get_obj(obj);
snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name),
@@ -3080,31 +3083,29 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
/*
- * Justification of value bad_page_cnt_threshold in ras structure
- *
- * Generally, 0 <= amdgpu_bad_page_threshold <= max record length
- * in eeprom or amdgpu_bad_page_threshold == -2, introduce two
- * scenarios accordingly.
- *
- * Bad page retirement enablement:
- * - If amdgpu_bad_page_threshold = -2,
- * bad_page_cnt_threshold = typical value by formula.
- *
- * - When the value from user is 0 < amdgpu_bad_page_threshold <
- * max record length in eeprom, use it directly.
- *
- * Bad page retirement disablement:
- * - If amdgpu_bad_page_threshold = 0, bad page retirement
- * functionality is disabled, and bad_page_cnt_threshold will
- * take no effect.
+ * amdgpu_bad_page_threshold is used to config
+ * the threshold for the number of bad pages.
+ * -1: Threshold is set to default value
+ * Driver will issue a warning message when threshold is reached
+ * and continue runtime services.
+ * 0: Disable bad page retirement
+ * Driver will not retire bad pages
+ * which is intended for debugging purpose.
+ * -2: Threshold is determined by a formula
+ * that assumes 1 bad page per 100M of local memory.
+ * Driver will continue runtime services when threhold is reached.
+ * 0 < threshold < max number of bad page records in EEPROM,
+ * A user-defined threshold is set
+ * Driver will halt runtime services when this custom threshold is reached.
*/
-
- if (amdgpu_bad_page_threshold < 0) {
+ if (amdgpu_bad_page_threshold == -2) {
u64 val = adev->gmc.mc_vram_size;
do_div(val, RAS_BAD_PAGE_COVER);
con->bad_page_cnt_threshold = min(lower_32_bits(val),
max_count);
+ } else if (amdgpu_bad_page_threshold == -1) {
+ con->bad_page_cnt_threshold = ((con->reserved_pages_in_bytes) >> 21) << 4;
} else {
con->bad_page_cnt_threshold = min_t(int, max_count,
amdgpu_bad_page_threshold);
@@ -3759,8 +3760,9 @@ init_ras_enabled_flag:
adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
adev->ras_hw_enabled & amdgpu_ras_mask;
- /* aca is disabled by default */
- adev->aca.is_enabled = false;
+ /* aca is disabled by default except for psp v13_0_12 */
+ adev->aca.is_enabled =
+ (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12));
/* bad page feature is not applicable to specific app platform */
if (adev->gmc.is_app_apu &&
@@ -3848,8 +3850,10 @@ static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
case IP_VERSION(13, 0, 2):
case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 12):
+ con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT;
+ break;
case IP_VERSION(13, 0, 14):
- con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE;
+ con->reserved_pages_in_bytes = (AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT << 1);
break;
default:
break;