From d35617ef465b935503e4616c1ce35775a6bc1b71 Mon Sep 17 00:00:00 2001 From: Ce Sun Date: Mon, 18 May 2026 16:44:06 +0800 Subject: drm/amdgpu: Fix user-triggerable BUG()/BUG_ON() calls Replace BUG()/BUG_ON() with error logs and safe returns in several places where they can be triggered by invalid userspace input, preventing DoS via kernel panic. Signed-off-by: Ce Sun Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 ++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_reg_access.c | 14 ++++++++++---- drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 2 +- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 2 +- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d2d70c4b2ac5..1dddfde91c49 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -717,7 +717,12 @@ void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, if (!drm_dev_enter(adev_to_drm(adev), &idx)) return; - BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); + if (!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)) { + dev_err(adev->dev, "unaligned pos/size (pos=0x%llx, size=0x%zx)\n", + pos, size); + drm_dev_exit(idx); + return; + } spin_lock_irqsave(&adev->mmio_idx_lock, flags); for (last = pos + size; pos < last; pos += 4) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reg_access.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reg_access.c index daefbeeee4d2..7468855c16a2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reg_access.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reg_access.c @@ -406,7 +406,10 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) if (offset < adev->rmmio_size) return (readb(adev->rmmio + offset)); - BUG(); + + dev_err(adev->dev, "invalid MMIO read offset 0x%x (rmmio size 0x%x)\n", + offset, (unsigned int)adev->rmmio_size); + return 0; } /** @@ -469,10 +472,13 @@ void 
amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) if (amdgpu_device_skip_hw_access(adev)) return; - if (offset < adev->rmmio_size) + if (offset < adev->rmmio_size) { writeb(value, adev->rmmio + offset); - else - BUG(); + } else { + dev_err(adev->dev, "invalid MMIO write offset 0x%x (rmmio size 0x%x)\n", + offset, (unsigned int)adev->rmmio_size); + return; + } } /** diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c index e1d63bed84bf..c3293e5a658c 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c @@ -308,7 +308,7 @@ void xgpu_vi_init_golden_registers(struct amdgpu_device *adev) xgpu_tonga_golden_common_all)); break; default: - BUG_ON("Doesn't support chip type.\n"); + dev_err(adev->dev, "Doesn't support chip type %d\n", adev->asic_type); break; } } diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c index b9a3e842626e..f257ea91a34d 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c @@ -968,7 +968,7 @@ bool dm_helpers_is_dp_sink_present(struct dc_link *link) struct amdgpu_dm_connector *aconnector = link->priv; if (!aconnector) { - BUG_ON("Failed to find connector for link!"); + DRM_ERROR("Failed to find connector for link!"); return true; } -- cgit v1.2.3 From fa2886555bdcf6869848e1ef5d4ba19d8b43b95c Mon Sep 17 00:00:00 2001 From: Matthew Stewart Date: Wed, 27 May 2026 10:07:02 -0400 Subject: drm/amd/display: Fix DCN42B version detection In resource_parse_asic_id, the check for GC_11_0_4 was unbounded, which caused it to override the detection of DCN42B. 
Signed-off-by: Matthew Stewart Reviewed-by: Roman Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/include/dal_asic_id.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/include/dal_asic_id.h b/drivers/gpu/drm/amd/display/include/dal_asic_id.h index 7d8944d27d92..ca77d29ebacc 100644 --- a/drivers/gpu/drm/amd/display/include/dal_asic_id.h +++ b/drivers/gpu/drm/amd/display/include/dal_asic_id.h @@ -261,8 +261,8 @@ enum { #define ASICREV_IS_GC_11_0_0(eChipRev) (eChipRev < GC_11_0_2_A0) #define ASICREV_IS_GC_11_0_2(eChipRev) (eChipRev >= GC_11_0_2_A0 && eChipRev < GC_11_0_3_A0) -#define ASICREV_IS_GC_11_0_3(eChipRev) (eChipRev >= GC_11_0_3_A0 && eChipRev < GC_11_UNKNOWN) -#define ASICREV_IS_GC_11_0_4(eChipRev) (eChipRev >= GC_11_0_4_A0 && eChipRev < GC_11_UNKNOWN) +#define ASICREV_IS_GC_11_0_3(eChipRev) (eChipRev >= GC_11_0_3_A0 && eChipRev < GC_11_0_4_A0) +#define ASICREV_IS_GC_11_0_4(eChipRev) (eChipRev >= GC_11_0_4_A0 && eChipRev < DCN4A_SOC_VAR_B_A0) #define ASICREV_IS_DCN36(eChipRev) ((eChipRev) >= 0x50 && (eChipRev) < 0xC0) #define AMDGPU_FAMILY_GC_12_0_0 152 /* GC 12.0.0 */ -- cgit v1.2.3 From badcb0896bdb4c6148564331c4b24ba7635efa94 Mon Sep 17 00:00:00 2001 From: Matthew Stewart Date: Thu, 28 May 2026 18:21:54 -0400 Subject: drm/amd/display: Add DCN42B to dml21_translation_helper Needed for DML to function with DCN42B. 
Signed-off-by: Matthew Stewart Reviewed-by: Roman Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dml2_0/dml21/dml21_translation_helper.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/dml21_translation_helper.c b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/dml21_translation_helper.c index c6ff7a290c7f..c1a3e2496983 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/dml21_translation_helper.c +++ b/drivers/gpu/drm/amd/display/dc/dml2_0/dml21/dml21_translation_helper.c @@ -48,6 +48,7 @@ static enum dml2_project_id dml21_dcn_revision_to_dml2_project_id(enum dce_versi project_id = dml2_project_dcn4x_stage2_auto_drr_svp; break; case DCN_VERSION_4_2: + case DCN_VERSION_4_2B: project_id = dml2_project_dcn42; break; default: -- cgit v1.2.3 From 92a8dba246d371fe268280e5fd74b0955688e6df Mon Sep 17 00:00:00 2001 From: Yongqiang Sun Date: Wed, 27 May 2026 09:50:47 -0400 Subject: drm/amdkfd: fix SMI event cross-process information leak kfd_smi_ev_enabled() skips the suser privilege check when pid=0. PROCESS_START, PROCESS_END, and VMFAULT events are emitted with pid=0 while carrying another process's PID and command name, so any /dev/kfd user in the render group can monitor all GPU workloads. Pass the target process PID into kfd_smi_event_add() for these events so the existing per-client filter restricts delivery to the owning process or CAP_SYS_ADMIN subscribers. 
Signed-off-by: Yongqiang Sun Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 15975c23a88e..dfbde5a571f6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -254,8 +254,10 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) if (task_info) { /* Report VM faults from user applications, not retry from kernel */ if (task_info->task.pid) - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, KFD_EVENT_FMT_VMFAULT( - task_info->task.pid, task_info->task.comm)); + kfd_smi_event_add(task_info->tgid, dev, + KFD_SMI_EVENT_VMFAULT, + KFD_EVENT_FMT_VMFAULT(task_info->task.pid, + task_info->task.comm)); amdgpu_vm_put_task_info(task_info); } } @@ -356,7 +358,7 @@ void kfd_smi_event_process(struct kfd_process_device *pdd, bool start) task_info = amdgpu_vm_get_task_info_vm(avm); if (task_info) { - kfd_smi_event_add(0, pdd->dev, + kfd_smi_event_add(task_info->tgid, pdd->dev, start ? KFD_SMI_EVENT_PROCESS_START : KFD_SMI_EVENT_PROCESS_END, KFD_EVENT_FMT_PROCESS(task_info->task.pid, -- cgit v1.2.3 From 1d0f5838b1268de871e762bbeb408989dd96b449 Mon Sep 17 00:00:00 2001 From: Vitaly Prosyak Date: Wed, 13 May 2026 16:08:30 -0400 Subject: drm/amdgpu: Add lockdep annotations for lock ordering validation Add lockdep annotations to teach lockdep the correct lock hierarchy and catch ordering violations during development. This follows the pattern established by dma-resv in drivers/dma-buf/dma-resv.c. Lock ordering hierarchy (outermost to innermost): 1. userq_sch_mutex - Global userq scheduler (enforce_isolation) 2. userq_mutex - Per-context userq (held across queue create/destroy) 3. notifier_lock - MMU notifier synchronization 4. vram_lock - VRAM memory allocator 5. 
reset_domain->sem - GPU reset synchronization 6. reset_lock - Reset control mutex 7. srbm_mutex - SRBM register access 8. grbm_idx_mutex - GRBM index register access 9. mmio_idx_lock - MMIO index access (spinlock) The implementation provides: - Lock ordering training at module init (amdgpu_lockdep_init) - Lock class association for real driver locks (amdgpu_lockdep_set_class) Dummy locks are associated with the same class keys as real driver locks via lockdep_set_class(), ensuring lockdep connects the training ordering with actual runtime locks. Testing: Build the kernel with CONFIG_PROVE_LOCKING=y (enables CONFIG_LOCKDEP): scripts/config --enable PROVE_LOCKING scripts/config --enable DEBUG_LOCKDEP make -j$(nproc) On boot, dmesg should show: AMDGPU: Lockdep annotations initialized (9 lock levels) The companion IGT test (tests/amdgpu/amd_lockdep) exercises lock-heavy GPU code paths concurrently to trigger lockdep warnings on violations: sudo ./build/tests/amdgpu/amd_lockdep sudo dmesg | grep -A 50 "circular locking dependency" IGT subtests: concurrent-reset-and-submit - reset_sem vs submission locks concurrent-mmap-and-evict - mmap_lock vs vram_lock concurrent-userptr-and-reset - notifier_lock vs reset_sem stress-all-paths - all of the above simultaneously A clean dmesg (no "circular locking dependency" or "possible recursive locking detected" messages) confirms no lock ordering violations. For CI integration, the test should be run on kernels compiled with CONFIG_LOCKDEP=y; dmesg is scanned post-run for lockdep splats. v2: (Christian) - Move notifier_lock and vram_lock before reset locks in hierarchy. HMM invalidation holds notifier_lock and can wait for GPU reset completion, so notifier_lock must be outer to reset_domain->sem. - Associate dummy locks with lock class keys via lockdep_set_class() so lockdep connects training with real driver locks. - Update commit message to list all 9 lock levels. Requires CONFIG_PROVE_LOCKING=y to activate. 
Cc: Christian Konig Cc: Alex Deucher Signed-off-by: Vitaly Prosyak Reviewed-by: Christian Konig Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/Makefile | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 + drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c | 195 ++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.h | 39 ++++++ 6 files changed, 242 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.h diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index ee3574797bc2..ba80542ead9d 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -69,7 +69,7 @@ amdgpu-y += amdgpu_device.o amdgpu_reg_access.o amdgpu_doorbell_mgr.o amdgpu_kms amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \ amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \ amdgpu_fw_attestation.o amdgpu_securedisplay.o \ - amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \ + amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o amdgpu_lockdep.o \ amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o amdgpu_dev_coredump.o \ amdgpu_cper.o amdgpu_userq_fence.o amdgpu_eviction_fence.o amdgpu_ip.o diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 5d7bfa59424a..7b09410d6d8f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -105,6 +105,7 @@ #include "amdgpu_mca.h" #include "amdgpu_aca.h" #include "amdgpu_ras.h" +#include "amdgpu_lockdep.h" #include "amdgpu_cper.h" #include "amdgpu_xcp.h" #include "amdgpu_seq64.h" diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1dddfde91c49..f18e46502829 100644 --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3757,6 +3757,9 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->pm.stable_pstate_ctx_lock); mutex_init(&adev->benchmark_mutex); mutex_init(&adev->gfx.reset_sem_mutex); + + /* Associate locks with lockdep classes for ordering validation */ + amdgpu_lockdep_set_class(adev); /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ mutex_init(&adev->enforce_isolation_mutex); for (i = 0; i < MAX_XCP; ++i) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 1781c0c3d010..bf4260269681 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -3158,6 +3158,9 @@ static int __init amdgpu_init(void) { int r; + /* Train lockdep on correct lock ordering */ + amdgpu_lockdep_init(); + r = amdgpu_sync_init(); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c new file mode 100644 index 000000000000..d5d71fd7c70d --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2024 Advanced Micro Devices, Inc. + * + * Lockdep annotation for AMDGPU lock ordering + * + * This module teaches lockdep the correct lock ordering to catch + * potential deadlocks at development time rather than runtime. 
+ * + * Based on dma-resv lockdep approach from: + * drivers/dma-buf/dma-resv.c:dma_resv_lockdep() + */ + +#include "amdgpu.h" +#include "amdgpu_reset.h" + +#ifdef CONFIG_LOCKDEP + +/* Lock class keys for associating with real driver locks */ +static struct lock_class_key amdgpu_userq_sch_mutex_key; +static struct lock_class_key amdgpu_userq_mutex_key; +static struct lock_class_key amdgpu_notifier_lock_key; +static struct lock_class_key amdgpu_vram_lock_key; +static struct lock_class_key amdgpu_reset_sem_key; +static struct lock_class_key amdgpu_reset_lock_key; +static struct lock_class_key amdgpu_srbm_lock_key; +static struct lock_class_key amdgpu_grbm_lock_key; +static struct lock_class_key amdgpu_mmio_lock_key; + +/** + * amdgpu_lockdep_set_class - Associate lock class keys with real locks + * @adev: AMDGPU device + * + * Call during device init to associate lock classes with actual locks + * so lockdep can track them properly. + */ +void amdgpu_lockdep_set_class(struct amdgpu_device *adev) +{ + lockdep_set_class(&adev->gfx.userq_sch_mutex, + &amdgpu_userq_sch_mutex_key); + lockdep_set_class(&adev->notifier_lock, &amdgpu_notifier_lock_key); + lockdep_set_class(&adev->srbm_mutex, &amdgpu_srbm_lock_key); + lockdep_set_class(&adev->grbm_idx_mutex, &amdgpu_grbm_lock_key); + lockdep_set_class(&adev->mmio_idx_lock, &amdgpu_mmio_lock_key); + + if (adev->reset_domain) + lockdep_set_class(&adev->reset_domain->sem, + &amdgpu_reset_sem_key); +} + +/** + * amdgpu_lockdep_init - Teach lockdep the correct lock ordering + * + * Instantiates dummy objects and takes locks in the correct order to + * train lockdep. This helps catch lock ordering violations during + * development. + * + * Lock ordering hierarchy (outermost to innermost): + * + * 1. userq_sch_mutex - Global userq scheduler (enforce_isolation) + * 2. userq_mutex - Per-context userq (held across queue create/destroy) + * 3. notifier_lock - MMU notifier lock + * 4. vram_lock - VRAM allocator lock + * 5. 
reset_domain->sem - GPU reset synchronization + * 6. reset_lock - Reset control lock + * 7. srbm_mutex - SRBM register access + * 8. grbm_idx_mutex - GRBM index access + * 9. mmio_idx_lock - MMIO index access (spinlock) + * + * Evidence: + * - userq_sch_mutex -> userq_mutex: amdgpu_gfx_kfd_sch_ctrl() calls + * amdgpu_userq_stop_sched_for_enforce_isolation() which takes userq_mutex + * - userq_mutex -> notifier_lock: userq paths may trigger MMU notifier + * invalidation which acquires notifier_lock + * - notifier_lock -> reset_domain->sem: HMM invalidation callback holds + * notifier_lock and can wait for GPU reset completion, so notifier_lock + * must be outer to reset_domain->sem + * - vram_lock -> reset_domain->sem: VRAM management paths may need to + * wait for ongoing reset to complete + * + * Note: mmap_lock ordering relative to GPU locks is already taught + * by dma-resv (drivers/dma-buf/dma-resv.c). + */ +int amdgpu_lockdep_init(void) +{ + struct amdgpu_reset_domain *reset_domain = NULL; + struct amdgpu_reset_control reset_ctl; + struct mutex userq_sch_mutex; + struct mutex userq_mutex; + struct mutex notifier_lock; + struct mutex vram_lock; + struct mutex srbm_mutex; + struct mutex grbm_idx_mutex; + spinlock_t mmio_idx_lock; + unsigned long flags; + + /* + * Initialize dummy reset domain + */ + reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, + "lockdep_test"); + if (!reset_domain) + return -ENOMEM; + + /* Initialize dummy locks */ + mutex_init(&userq_sch_mutex); + mutex_init(&userq_mutex); + mutex_init(&notifier_lock); + mutex_init(&vram_lock); + mutex_init(&reset_ctl.reset_lock); + mutex_init(&srbm_mutex); + mutex_init(&grbm_idx_mutex); + spin_lock_init(&mmio_idx_lock); + + /* + * Associate dummy locks with the same class keys used for real + * driver locks. This ensures lockdep connects the ordering learned + * here with the actual locks used at runtime. 
+ */ + lockdep_set_class(&userq_sch_mutex, &amdgpu_userq_sch_mutex_key); + lockdep_set_class(&userq_mutex, &amdgpu_userq_mutex_key); + lockdep_set_class(&notifier_lock, &amdgpu_notifier_lock_key); + lockdep_set_class(&vram_lock, &amdgpu_vram_lock_key); + lockdep_set_class(&reset_domain->sem, &amdgpu_reset_sem_key); + lockdep_set_class(&reset_ctl.reset_lock, &amdgpu_reset_lock_key); + lockdep_set_class(&srbm_mutex, &amdgpu_srbm_lock_key); + lockdep_set_class(&grbm_idx_mutex, &amdgpu_grbm_lock_key); + lockdep_set_class(&mmio_idx_lock, &amdgpu_mmio_lock_key); + + /* + * Take locks in the correct order to train lockdep. + * This establishes the dependency chain. + */ + + /* Level 1: Global userq scheduler mutex (outermost) */ + mutex_lock(&userq_sch_mutex); + + /* Level 2: Per-context userq mutex */ + mutex_lock(&userq_mutex); + + /* Level 3: MMU notifier lock */ + mutex_lock(&notifier_lock); + + /* Level 4: VRAM allocator lock */ + mutex_lock(&vram_lock); + + /* Level 5: Reset domain semaphore */ + down_read(&reset_domain->sem); + + /* Level 6: Reset control lock */ + mutex_lock(&reset_ctl.reset_lock); + + /* + * Mark potential memory reclaim boundary. + * GPU operations might trigger memory allocation/reclaim. + */ + fs_reclaim_acquire(GFP_KERNEL); + + /* Level 7: SRBM register access */ + mutex_lock(&srbm_mutex); + + /* Level 8: GRBM index access */ + mutex_lock(&grbm_idx_mutex); + + /* Level 9: MMIO index access (innermost lock, spinlock) */ + spin_lock_irqsave(&mmio_idx_lock, flags); + + /* + * All locks acquired in order. + * Lockdep has now learned the valid dependency chain. 
+ */ + + /* Release in reverse order */ + spin_unlock_irqrestore(&mmio_idx_lock, flags); + mutex_unlock(&grbm_idx_mutex); + mutex_unlock(&srbm_mutex); + + fs_reclaim_release(GFP_KERNEL); + + mutex_unlock(&reset_ctl.reset_lock); + up_read(&reset_domain->sem); + mutex_unlock(&vram_lock); + mutex_unlock(&notifier_lock); + mutex_unlock(&userq_mutex); + mutex_unlock(&userq_sch_mutex); + + /* Cleanup */ + amdgpu_reset_put_reset_domain(reset_domain); + + pr_info("AMDGPU: Lockdep annotations initialized (9 lock levels)\n"); + + return 0; +} + +#endif /* CONFIG_LOCKDEP */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.h new file mode 100644 index 000000000000..04adb58665bf --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2024 Advanced Micro Devices, Inc. + * + * Lockdep annotation interface for AMDGPU + */ + +#ifndef __AMDGPU_LOCKDEP_H__ +#define __AMDGPU_LOCKDEP_H__ + +#include + +struct amdgpu_device; + +#ifdef CONFIG_LOCKDEP + +/** + * amdgpu_lockdep_init - Train lockdep on correct lock ordering + * + * Call once during module init to establish the lock dependency chain. + */ +int amdgpu_lockdep_init(void); + +/** + * amdgpu_lockdep_set_class - Associate lock class keys with real locks + * @adev: AMDGPU device + * + * Call during device init to associate lock classes with actual locks. 
+ */ +void amdgpu_lockdep_set_class(struct amdgpu_device *adev); + +#else /* !CONFIG_LOCKDEP */ + +static inline int amdgpu_lockdep_init(void) { return 0; } +static inline void amdgpu_lockdep_set_class(struct amdgpu_device *adev) {} + +#endif /* CONFIG_LOCKDEP */ + +#endif /* __AMDGPU_LOCKDEP_H__ */ -- cgit v1.2.3 From 182bdd59be41595e211ac98406d3637fc6141017 Mon Sep 17 00:00:00 2001 From: Christian König Date: Tue, 5 May 2026 15:40:04 +0200 Subject: drm/amdgpu: deprecate guilty handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The guilty handling tried to establish a second way of signaling problems with the GPU back to userspace. This caused quite a bunch of issues we had to work around, especially lifetime issues with the drm_sched_entity. Just drop the handling altogether and use the dma_fence based approach instead. v2: fix reversed condition in entity check (Alex) Reviewed-by: Alex Deucher Signed-off-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 5 ----- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 25 +++++++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +---- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 32af8cce3df8..c42ae3e6fdd1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -60,11 +60,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, if (!p->ctx) return -EINVAL; - if (atomic_read(&p->ctx->guilty)) { - amdgpu_ctx_put(p->ctx); - return -ECANCELED; - } - amdgpu_sync_create(&p->sync); drm_exec_init(&p->exec, DRM_EXEC_INTERRUPTIBLE_WAIT | DRM_EXEC_IGNORE_DUPLICATES, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 7af86a32c0c5..0d7f6cd74f79 100644 --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -255,7 +255,7 @@ static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, u32 hw_ip, } r = drm_sched_entity_init(&entity->entity, drm_prio, scheds, num_scheds, - &ctx->guilty); + NULL); if (r) goto error_free_entity; @@ -579,6 +579,27 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev, #define AMDGPU_RAS_COUNTE_DELAY_MS 3000 +static bool amdgpu_ctx_guilty(struct amdgpu_ctx *ctx) +{ + int i, j, r; + + for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) { + for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) { + struct amdgpu_ctx_entity *ctx_entity; + + ctx_entity = ctx->entities[i][j]; + if (!ctx_entity) + continue; + + r = drm_sched_entity_error(&ctx_entity->entity); + if (r == -ETIME) + return true; + } + } + + return false; +} + static int amdgpu_ctx_query2(struct amdgpu_device *adev, struct amdgpu_fpriv *fpriv, uint32_t id, union drm_amdgpu_ctx_out *out) @@ -607,7 +628,7 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, if (ctx->generation != amdgpu_vm_generation(adev, &fpriv->vm)) out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST; - if (atomic_read(&ctx->guilty)) + if (amdgpu_ctx_guilty(ctx)) out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY; if (amdgpu_in_reset(adev)) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h index cf8d700a22fe..e444b2088d40 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h @@ -50,7 +50,6 @@ struct amdgpu_ctx { int32_t init_priority; int32_t override_priority; uint32_t stable_pstate; - atomic_t guilty; bool preamble_presented; uint64_t generation; unsigned long ras_counter_ce; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f18e46502829..942f0251c748 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5112,12 +5112,12 @@ 
link_reset_failed: int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, struct amdgpu_reset_context *reset_context) { - int i, r = 0; struct amdgpu_job *job = NULL; struct dma_fence *fence = NULL; struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; bool need_full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); + int i, r; if (reset_context->reset_req_dev == adev) job = reset_context->job; @@ -5143,9 +5143,6 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, amdgpu_fence_driver_isr_toggle(adev, false); - if (job && job->vm) - drm_sched_increase_karma(&job->base); - r = amdgpu_reset_prepare_hwcontext(adev, reset_context); /* If reset handler not implemented, continue; otherwise return */ if (r == -EOPNOTSUPP) -- cgit v1.2.3 From 869de64649cf54d55e196597f819b8c8befe39d0 Mon Sep 17 00:00:00 2001 From: Jeevana Muthyala Date: Thu, 14 May 2026 16:26:17 +0530 Subject: drm/amdgpu/vcn5.0.0: enable secure submission on unified ring for VCN 5.3.0 Enable secure submission support on the unified ring for VCN IP version 5.3.0 by setting `secure_submission_supported = true` in vcn_v5_0_0_unified_ring_vm_funcs. Secure IB submission is supported on VCN 5.3.0 hardware/firmware, allowing protected decode workloads to bypass the common IB gate. Without this, secure playback submissions can be blocked and fail. Other VCN 5.x variants using the same vcn_v5_0_0_ip_block (e.g. IP_VERSION(5, 0, 0)) do not support secure submission on the unified ring and therefore continue using non-secure paths. This change only advertises existing hardware/firmware capability; non-secure decode paths remain unaffected. 
Signed-off-by: Jeevana Muthyala Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c | 39 ++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c index d5f49fa33bee..45580e9c4e0c 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c @@ -1234,6 +1234,38 @@ static const struct amdgpu_ring_funcs vcn_v5_0_0_unified_ring_vm_funcs = { .reset = vcn_v5_0_0_ring_reset, }; +static const struct amdgpu_ring_funcs vcn_v5_0_0_unified_ring_vm_funcs_secure = { + .type = AMDGPU_RING_TYPE_VCN_ENC, + .align_mask = 0x3f, + .nop = VCN_ENC_CMD_NO_OP, + .secure_submission_supported = true, + .no_user_fence = true, + .get_rptr = vcn_v5_0_0_unified_ring_get_rptr, + .get_wptr = vcn_v5_0_0_unified_ring_get_wptr, + .set_wptr = vcn_v5_0_0_unified_ring_set_wptr, + .emit_frame_size = + SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 + + SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 4 + + 4 + /* vcn_v2_0_enc_ring_emit_vm_flush */ + 5 + 5 + /* vcn_v2_0_enc_ring_emit_fence x2 vm fence */ + 1, /* vcn_v2_0_enc_ring_insert_end */ + .emit_ib_size = 5, /* vcn_v2_0_enc_ring_emit_ib */ + .emit_ib = vcn_v2_0_enc_ring_emit_ib, + .emit_fence = vcn_v2_0_enc_ring_emit_fence, + .emit_vm_flush = vcn_v2_0_enc_ring_emit_vm_flush, + .test_ring = amdgpu_vcn_enc_ring_test_ring, + .test_ib = amdgpu_vcn_unified_ring_test_ib, + .insert_nop = amdgpu_ring_insert_nop, + .insert_end = vcn_v2_0_enc_ring_insert_end, + .pad_ib = amdgpu_ring_generic_pad_ib, + .begin_use = amdgpu_vcn_ring_begin_use, + .end_use = amdgpu_vcn_ring_end_use, + .emit_wreg = vcn_v2_0_enc_ring_emit_wreg, + .emit_reg_wait = vcn_v2_0_enc_ring_emit_reg_wait, + .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper, + .reset = vcn_v5_0_0_ring_reset, +}; + /** * vcn_v5_0_0_set_unified_ring_funcs - set unified ring functions * @@ -1249,7 +1281,12 @@ static void 
vcn_v5_0_0_set_unified_ring_funcs(struct amdgpu_device *adev) if (adev->vcn.harvest_config & (1 << i)) continue; - adev->vcn.inst[i].ring_enc[0].funcs = &vcn_v5_0_0_unified_ring_vm_funcs; + if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(5, 3, 0)) + adev->vcn.inst[i].ring_enc[0].funcs = + &vcn_v5_0_0_unified_ring_vm_funcs_secure; + else + adev->vcn.inst[i].ring_enc[0].funcs = + &vcn_v5_0_0_unified_ring_vm_funcs; adev->vcn.inst[i].ring_enc[0].me = i; } } -- cgit v1.2.3 From 0ac98160dfb6ab3c6d7b38e0ff9687780beed9cb Mon Sep 17 00:00:00 2001 From: David Rosca Date: Sat, 13 Sep 2025 16:51:02 +0200 Subject: drm/amdgpu/userq: Fix reading timeline points in wait ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use correct u64 type. Signed-off-by: David Rosca Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index a41fb72dba94..f74ad378e407 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -593,7 +593,7 @@ free_syncobj_handles: static int amdgpu_userq_wait_count_fences(struct drm_file *filp, struct drm_amdgpu_userq_wait *wait_info, - u32 *syncobj_handles, u32 *timeline_points, + u32 *syncobj_handles, u64 *timeline_points, u32 *timeline_handles, struct drm_gem_object **gobj_write, struct drm_gem_object **gobj_read) @@ -703,7 +703,7 @@ amdgpu_userq_wait_add_fence(struct drm_amdgpu_userq_wait *wait_info, static int amdgpu_userq_wait_return_fence_info(struct drm_file *filp, struct drm_amdgpu_userq_wait *wait_info, - u32 *syncobj_handles, u32 *timeline_points, + u32 *syncobj_handles, u64 *timeline_points, u32 *timeline_handles, struct drm_gem_object **gobj_write, struct drm_gem_object **gobj_read) @@ -906,7 +906,8 @@ int 
amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { int num_points, num_syncobj, num_read_bo_handles, num_write_bo_handles; - u32 *syncobj_handles, *timeline_points, *timeline_handles; + u32 *syncobj_handles, *timeline_handles; + u64 *timeline_points; struct drm_amdgpu_userq_wait *wait_info = data; struct drm_gem_object **gobj_write; struct drm_gem_object **gobj_read; @@ -935,7 +936,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, } ptr = u64_to_user_ptr(wait_info->syncobj_timeline_points); - timeline_points = memdup_array_user(ptr, num_points, sizeof(u32)); + timeline_points = memdup_array_user(ptr, num_points, sizeof(u64)); if (IS_ERR(timeline_points)) { r = PTR_ERR(timeline_points); goto free_timeline_handles; -- cgit v1.2.3 From 1e13b7eb67f9118130571958fbf94944c71c32d1 Mon Sep 17 00:00:00 2001 From: Aurabindo Pillai Date: Tue, 2 Jun 2026 15:16:16 -0400 Subject: drm/amd/display: widen dc_hdmi_frl_flags.force_frl_rate to unsigned int dc_hdmi_frl_flags.force_frl_rate mirrors dc_debug_options.force_frl_rate, which was just widened to unsigned int. Match the type here too so the assignment in link_hdmi_frl.c does not narrow from unsigned to signed. All call sites in link_hdmi_frl.c only compare the value against 0, 0xF, or an hdmi_frl_link_rate enum whose values are non-negative, so the change is behaviour-preserving and does not introduce sign-compare warnings. 
Signed-off-by: Aurabindo Pillai Reviewed-by: Alex Hung Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dc_hdmi_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc_hdmi_types.h b/drivers/gpu/drm/amd/display/dc/dc_hdmi_types.h index 0da03eb794aa..eb6e7f4043fd 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_hdmi_types.h +++ b/drivers/gpu/drm/amd/display/dc/dc_hdmi_types.h @@ -266,7 +266,7 @@ struct dc_hdmi_frl_link_settings { }; struct dc_hdmi_frl_flags { - int force_frl_rate; + unsigned int force_frl_rate; bool ignore_ffe; int select_ffe; int limit_ffe; -- cgit v1.2.3 From 1e815068fba5ea2684c146b445a3b1f6da7eddee Mon Sep 17 00:00:00 2001 From: Aurabindo Pillai Date: Tue, 2 Jun 2026 15:17:06 -0400 Subject: drm/amd/display: use unsigned types for local pipe and REG_GET counters Two small type fixes that match how the values are actually consumed: - decide_zstate_support() iterates from 0 to pipe_count, which is unsigned. Make the loop index unsigned int. - hpo_enc401_read_state() reads HDMI_PIXEL_ENCODING and HDMI_DEEP_COLOR_DEPTH via REG_GET_2(), which internally casts the output pointer to (uint32_t *). Passing the address of an int is a strict-aliasing wart even when the sizes match. Declare the locals as uint32_t. No behavioural change since the values are only compared against small non-negative constants. 
Signed-off-by: Aurabindo Pillai Reviewed-by: Alex Hung Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dml/dcn20/dcn20_fpu.c | 2 +- .../gpu/drm/amd/display/dc/hpo/dcn401/dcn401_hpo_frl_stream_encoder.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn20/dcn20_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn20/dcn20_fpu.c index 5f088d113b9f..38c79239004c 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn20/dcn20_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn20/dcn20_fpu.c @@ -1060,7 +1060,7 @@ static bool is_dtbclk_required(struct dc *dc, struct dc_state *context) static enum dcn_zstate_support_state decide_zstate_support(struct dc *dc, struct dc_state *context) { int plane_count; - int i; + unsigned int i; plane_count = 0; for (i = 0; i < dc->res_pool->pipe_count; i++) { diff --git a/drivers/gpu/drm/amd/display/dc/hpo/dcn401/dcn401_hpo_frl_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/hpo/dcn401/dcn401_hpo_frl_stream_encoder.c index 28cb14dc87b0..85b7a44c0a11 100644 --- a/drivers/gpu/drm/amd/display/dc/hpo/dcn401/dcn401_hpo_frl_stream_encoder.c +++ b/drivers/gpu/drm/amd/display/dc/hpo/dcn401/dcn401_hpo_frl_stream_encoder.c @@ -143,8 +143,8 @@ void hpo_enc401_read_state( struct hpo_frl_stream_encoder *enc, struct hpo_frl_stream_encoder_state *state) { - int pixel_encoding; - int color_depth; + uint32_t pixel_encoding; + uint32_t color_depth; // int odm_combine; struct dcn401_hpo_frl_stream_encoder *enc401 = DCN401_HPO_FRL_STRENC_FROM_HPO_FRL_STRENC(enc); -- cgit v1.2.3 From 59720bfd8c6dbebeb8d5a7ab64241b007efd9213 Mon Sep 17 00:00:00 2001 From: Christian König Date: Wed, 25 Feb 2026 15:12:02 +0100 Subject: drm/amdgpu: restart the CS if some parts of the VM are still invalidated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure that we only submit work with full up to date VM page tables. 
Signed-off-by: Christian König Reviewed-by: Vitaly Prosyak Tested-by: Vitaly Prosyak Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index c42ae3e6fdd1..fc071efd4c25 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1274,6 +1274,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, { struct amdgpu_fpriv *fpriv = p->filp->driver_priv; struct amdgpu_job *leader = p->gang_leader; + struct amdgpu_vm *vm = &fpriv->vm; struct amdgpu_bo_list_entry *e; struct drm_gem_object *gobj; unsigned long index; @@ -1319,7 +1320,8 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, amdgpu_hmm_range_free(e->range); e->range = NULL; } - if (r) { + + if (r || !list_empty(&vm->individual.moved)) { r = -EAGAIN; mutex_unlock(&p->adev->notifier_lock); return r; -- cgit v1.2.3 From 39eb6da7acee8d0cc12a8959235b590f295d7b4c Mon Sep 17 00:00:00 2001 From: Sunday Clement Date: Tue, 19 May 2026 10:02:30 -0400 Subject: drm/amdkfd: Add bounds check for AMDKFD_IOC_WAIT_EVENTS The kfd_wait_on_events ioctl passes a user-supplied num_events parameter directly to alloc_event_waiters() which calls kcalloc() without validation. This allows unprivileged users with /dev/kfd access to trigger large kernel memory allocations, potentially causing memory exhaustion and denial of service via the OOM killer. Add a check to reject num_events values exceeding KFD_SIGNAL_EVENT_LIMIT (4096), which is the maximum number of events a single process can create. 
Signed-off-by: Sunday Clement Reviewed-by: Harish Kasiviswanathan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index a11c4ab3aafd..81900b49d9d5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -800,6 +800,8 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) struct kfd_event_waiter *event_waiters; uint32_t i; + if (num_events > KFD_SIGNAL_EVENT_LIMIT) + return NULL; event_waiters = kzalloc_objs(struct kfd_event_waiter, num_events); if (!event_waiters) return NULL; -- cgit v1.2.3 From dd7bd8e0f0c47361a3a513d6aa8ea2b36dd70deb Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Mon, 1 Jun 2026 20:11:17 +0530 Subject: drm/amdgpu: compare MES firmware version ucode for gfx11 MES firmware should report the same version whether read from the register or from the firmware ucode binary. This is not always the case, so add a log when they mismatch. 
Signed-off-by: Sunil Khatri Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 12 ++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 + drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 1 + 3 files changed, 14 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index c9467b26e42c..e3972673fd64 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -781,6 +781,18 @@ out: return r; } +void amdgpu_mes_validate_fw_version(struct amdgpu_device *adev) +{ + u32 fw_from_ucode = adev->mes.fw_version[AMDGPU_MES_SCHED_PIPE]; + u32 fw_from_reg = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK; + + if (fw_from_ucode != fw_from_reg) + dev_info(adev->dev, + "MES firmware reports incorrect version in ucode binary (0x%x vs 0x%x)\n", + fw_from_ucode, fw_from_reg); +} + + bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev) { uint32_t mes_rev = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 93990d4990f2..fdd06a17520a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -441,6 +441,7 @@ struct amdgpu_mes_funcs { (adev)->mes.kiq_hw_fini((adev), (xcc_id)) int amdgpu_mes_init_microcode(struct amdgpu_device *adev, int pipe); +void amdgpu_mes_validate_fw_version(struct amdgpu_device *adev); int amdgpu_mes_init(struct amdgpu_device *adev); void amdgpu_mes_fini(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 147ba2942690..ac6d4f277336 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -1688,6 +1688,7 @@ static int mes_v11_0_hw_init(struct amdgpu_ip_block *ip_block) if (r) goto failure; + amdgpu_mes_validate_fw_version(adev); out: /* * Disable KIQ 
ring usage from the driver once MES is enabled. -- cgit v1.2.3 From 1720970f607d17b43274a06a7fd919e37a429281 Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Mon, 1 Jun 2026 20:14:50 +0530 Subject: drm/amdgpu: validate the mes firmware version for gfx12 MES firmware should report the same version whether read from the register or from the firmware ucode binary. This is not always the case, so add a log when they mismatch. Signed-off-by: Sunil Khatri Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 023c7345ea54..7453fb11289e 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -1871,6 +1871,7 @@ static int mes_v12_0_hw_init(struct amdgpu_ip_block *ip_block) if (r) goto failure; + amdgpu_mes_validate_fw_version(adev); out: /* * Disable KIQ ring usage from the driver once MES is enabled. -- cgit v1.2.3 From bfc6042540b7795d2f96a6ddc71442f74438dc73 Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Mon, 1 Jun 2026 20:15:34 +0530 Subject: drm/amdgpu: validate the mes firmware version for gfx12.1 MES firmware should report the same version whether read from the register or from the firmware ucode binary. This is not always the case, so add a log when they mismatch. 
Signed-off-by: Sunil Khatri Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/mes_v12_1.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c index b169e577e583..8a90ad5a51b8 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_1.c @@ -1917,6 +1917,7 @@ static int mes_v12_1_xcc_hw_init(struct amdgpu_ip_block *ip_block, int xcc_id) goto failure; } + amdgpu_mes_validate_fw_version(adev); out: /* * Disable KIQ ring usage from the driver once MES is enabled. -- cgit v1.2.3 From 01112e241e37f9ac98b6f418d93ce2e0b87b7ee0 Mon Sep 17 00:00:00 2001 From: Yongqiang Sun Date: Tue, 2 Jun 2026 09:59:44 -0400 Subject: drm/amdkfd: Unwind debug trap enable on copy_to_user failure If kfd_dbg_trap_enable() fails while copying runtime_info to userspace, it had already activated the trap, set debug_trap_enabled, taken an extra process reference, and opened the debug event file. It then returned -EFAULT without unwinding that state, leaving inconsistent trap state and a refcount imbalance that could break later DISABLE/ENABLE. On copy_to_user failure, deactivate the trap and undo the rest of the enable setup before returning. 
Signed-off-by: Yongqiang Sun Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c index 0f7aa51b629e..0dd1fd448059 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c @@ -832,6 +832,12 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd, if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) { kfd_dbg_trap_deactivate(target, false, 0); + fput(target->dbg_ev_file); + target->dbg_ev_file = NULL; + if (target->debugger_process) + atomic_dec(&target->debugger_process->debugged_process_count); + target->debug_trap_enabled = false; + kfd_unref_process(target); r = -EFAULT; } -- cgit v1.2.3 From d2be142124f40be1e5b6c7d25e9f9069bc874c1f Mon Sep 17 00:00:00 2001 From: Asad Kamal Date: Wed, 3 Jun 2026 15:11:33 +0800 Subject: drm/amd/pm: Stop pp_od_clk_voltage emit at PAGE_SIZE Stop appending OD sections in amdgpu_get_pp_od_clk_voltage() once the sysfs page is full, instead of checking every sysfs_emit_at() in SMU helpers. This is purely defensive hardening. 
v2: Drop the prior series that checked sysfs_emit_at() return values in every SMU *_emit_clk_levels() helper and smu_cmn_print_*().(Kevin) v3: Update description, remove all clamping Signed-off-by: Asad Kamal Reviewed-by: Yang Wang Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/amdgpu_pm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index 52e5cbcac352..714c702b4f8c 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -874,6 +874,8 @@ static ssize_t amdgpu_get_pp_od_clk_voltage(struct device *dev, for (clk_index = 0 ; clk_index < ARRAY_SIZE(od_clocks) ; clk_index++) { amdgpu_dpm_emit_clock_levels(adev, od_clocks[clk_index], buf, &size); + if (unlikely(size >= (PAGE_SIZE - 1))) + break; } if (size == 0) -- cgit v1.2.3 From 9ab125397e9f461f171be383af57886c4eeb8d42 Mon Sep 17 00:00:00 2001 From: Candice Li Date: Wed, 20 May 2026 12:33:18 +0800 Subject: drm/amd/pm: bound OD parameter parsing to stack array size Reject inputs once parameter_size reaches the array limit, and pass ARRAY_SIZE(parameter) into parse_input_od_command_lines() for defense in depth. 
Signed-off-by: Candice Li Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/amdgpu_pm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index 714c702b4f8c..a21d1506e6ab 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -806,6 +806,8 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev, while ((sub_str = strsep(&tmp_str, delimiter)) != NULL) { if (strlen(sub_str) == 0) continue; + if (parameter_size >= ARRAY_SIZE(parameter)) + return -EINVAL; ret = kstrtol(sub_str, 0, &parameter[parameter_size]); if (ret) return -EINVAL; @@ -3957,6 +3959,7 @@ static int parse_input_od_command_lines(const char *buf, size_t count, u32 *type, long *params, + size_t params_max, uint32_t *num_of_params) { const char delimiter[3] = {' ', '\n', '\0'}; @@ -3992,6 +3995,9 @@ static int parse_input_od_command_lines(const char *buf, if (strlen(sub_str) == 0) continue; + if (parameter_size >= params_max) + return -EINVAL; + ret = kstrtol(sub_str, 0, &params[parameter_size]); if (ret) return -EINVAL; @@ -4023,6 +4029,7 @@ amdgpu_distribute_custom_od_settings(struct amdgpu_device *adev, count, &cmd_type, parameter, + ARRAY_SIZE(parameter), &parameter_size); if (ret) return ret; -- cgit v1.2.3 From 342981fff32802a819d6fc7cf3c9fedf9f3d9d60 Mon Sep 17 00:00:00 2001 From: Honglei Huang Date: Fri, 29 May 2026 10:23:17 +0800 Subject: drm/amdgpu: drop retry loop in amdgpu_hmm_range_get_pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit c08972f55594 ("drm/amdgpu: fix amdgpu_hmm_range_get_pages") moved mmu_interval_read_begin() out of the per-chunk loop, the captured notifier_seq is no longer refreshed across retries. 
As a result, the existing -EBUSY retry path can never make progress: hmm_range_fault() returns -EBUSY only when mmu_interval_check_retry(notifier, notifier_seq) reports that the sequence is stale. Once the sequence has advanced, the stored seq will never match again, so every subsequent call within the same invocation returns -EBUSY immediately. The "goto retry" therefore degenerates into a busy spin that simply burns CPU for the full HMM_RANGE_DEFAULT_TIMEOUT (~1s) window before finally bailing out with -EAGAIN. This is pure latency with no chance of recovery, and it actively hurts the KFD userptr stack: the caller ends up blocked for a second while holding mmap_lock, only to return -EAGAIN to the restore worker (or to userspace) which would have re-driven the operation immediately anyway. Drop the retry/timeout entirely and let -EBUSY propagate straight to out_free_pfns, where it is already translated to -EAGAIN. Recovery is handled at a higher level: the KFD restore_userptr_worker reschedules itself, and the userptr ioctl path returns -EAGAIN to userspace. No functional regression: the previous behaviour on -EBUSY was already to fail with -EAGAIN after a 1s stall; we just skip the stall. 
Reviewed-by: Christian König Signed-off-by: Honglei Huang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c index e452444b33b0..99bc9ad67d5b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c @@ -174,7 +174,6 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier, const u64 max_bytes = SZ_2G; struct hmm_range *hmm_range = &range->hmm_range; - unsigned long timeout; unsigned long *pfns; unsigned long end; int r; @@ -201,15 +200,9 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier, pr_debug("hmm range: start = 0x%lx, end = 0x%lx", hmm_range->start, hmm_range->end); - timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); - -retry: r = hmm_range_fault(hmm_range); - if (unlikely(r)) { - if (r == -EBUSY && !time_after(jiffies, timeout)) - goto retry; + if (unlikely(r)) goto out_free_pfns; - } if (hmm_range->end == end) break; -- cgit v1.2.3 From 8bfd3aeeb7d4c140434bb9e604fca39ebb3e2937 Mon Sep 17 00:00:00 2001 From: Yongqiang Sun Date: Mon, 1 Jun 2026 15:48:44 -0400 Subject: drm/amdkfd: fix sysfs topology prop length on buffer truncation sysfs_show_gen_prop() accumulated snprintf()'s return value into the offset. snprintf() reports bytes that would have been written, not bytes actually written, so a truncated sysfs show could over-report its length. Use sysfs_emit_at(), which returns only the bytes written. 
Signed-off-by: Yongqiang Sun Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index dc1c6bd1252f..00517c3d0e6a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -198,8 +198,7 @@ struct kfd_topology_device *kfd_create_topology_device( #define sysfs_show_gen_prop(buffer, offs, fmt, ...) \ - (offs += snprintf(buffer+offs, PAGE_SIZE-offs, \ - fmt, __VA_ARGS__)) + (offs += sysfs_emit_at(buffer, offs, fmt, __VA_ARGS__)) #define sysfs_show_32bit_prop(buffer, offs, name, value) \ sysfs_show_gen_prop(buffer, offs, "%s %u\n", name, value) #define sysfs_show_64bit_prop(buffer, offs, name, value) \ -- cgit v1.2.3 From 961323c26ad4c895e3b0ea1711fc41dfd6368c12 Mon Sep 17 00:00:00 2001 From: Yongqiang Sun Date: Mon, 1 Jun 2026 15:28:30 -0400 Subject: drm/amdkfd: Fix infinite loop parsing CRAT with zero subtype length Malformed ACPI CRAT tables can advertise a zero or undersized subtype length. The parser then fails to advance the cursor and loops forever while the remaining image still looks large enough for a generic header. Validate sub_type_hdr->length on each iteration before parsing or advancing. Return -EINVAL and warn when length is zero or smaller than the generic subtype header. 
Signed-off-by: Yongqiang Sun Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index af2ae144f508..f28259d13818 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -1404,6 +1404,14 @@ int kfd_parse_crat_table(void *crat_image, struct list_head *device_list, sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < ((char *)crat_image) + image_len) { + if (!sub_type_hdr->length || + sub_type_hdr->length < sizeof(struct crat_subtype_generic)) { + pr_warn("Invalid CRAT subtype length %u\n", + sub_type_hdr->length); + ret = -EINVAL; + break; + } + if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { ret = kfd_parse_subtype(sub_type_hdr, device_list); if (ret) -- cgit v1.2.3 From 408b17765b7ae73b299eccaa3bc2e8c7f1555741 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Tue, 19 May 2026 18:30:03 +0530 Subject: drm/amd/pm: Use strscpy in profile mode parsing Use strscpy to copy the buffer, which makes it explicit that a valid NUL-terminated string gets copied. Also, make it explicit that the source buffer can be copied safely to the temporary buffer by checking against its size. 
Signed-off-by: Lijo Lazar Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/amdgpu_pm.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index a21d1506e6ab..f43d09769320 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -1393,7 +1393,6 @@ static ssize_t amdgpu_set_pp_power_profile_mode(struct device *dev, long parameter[64]; char *sub_str, buf_cpy[128]; char *tmp_str; - uint32_t i = 0; char tmp[2]; long int profile_mode = 0; const char delimiter[3] = {' ', '\n', '\0'}; @@ -1402,18 +1401,18 @@ static ssize_t amdgpu_set_pp_power_profile_mode(struct device *dev, if (count == 0 || sysfs_streq(buf, "")) return -EINVAL; - tmp[0] = *(buf); + tmp[0] = *(buf++); tmp[1] = '\0'; ret = kstrtol(tmp, 0, &profile_mode); if (ret) return -EINVAL; if (profile_mode == PP_SMC_POWER_PROFILE_CUSTOM) { - if (count < 2 || count > 127) + if (count < 2 || count > sizeof(buf_cpy)) return -EINVAL; - while (isspace(*++buf)) - i++; - memcpy(buf_cpy, buf, count-i); + while (isspace(*buf)) + buf++; + strscpy(buf_cpy, buf, sizeof(buf_cpy)); tmp_str = buf_cpy; while ((sub_str = strsep(&tmp_str, delimiter)) != NULL) { if (strlen(sub_str) == 0) -- cgit v1.2.3 From 115bf5ca318e18a3dc1888ec6271c7052774952a Mon Sep 17 00:00:00 2001 From: Michel Dänzer Date: Mon, 18 May 2026 17:48:09 +0200 Subject: drm/amd/display: Consult MCCS FreeSync cap only if requested & supported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the do_mccs parameter is false, we don't call dm_helpers_read_mccs_caps, so sink->mccs_caps.freesync_supported is unlikely to be true. 
Fixes: 6f71d5dd3206 ("drm/amd/display: Read sink freesync support via mccs") Bug: https://gitlab.freedesktop.org/drm/amd/-/work_items/5286 Signed-off-by: Michel Dänzer Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 04e440521f67..1ed697a3a453 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -13837,17 +13837,15 @@ void amdgpu_dm_update_freesync_caps(struct drm_connector *connector, } /* Handle MCCS */ - if (do_mccs) + if (do_mccs) { dm_helpers_read_mccs_caps(adev->dm.dc->ctx, amdgpu_dm_connector->dc_link, sink); - if ((sink->sink_signal == SIGNAL_TYPE_HDMI_TYPE_A || - as_type == FREESYNC_TYPE_PCON_IN_WHITELIST) && - (!sink->edid_caps.freesync_vcp_code || - (sink->edid_caps.freesync_vcp_code && !sink->mccs_caps.freesync_supported))) - freesync_capable = false; + if (sink->edid_caps.freesync_vcp_code && !sink->mccs_caps.freesync_supported) + freesync_capable = false; - if (do_mccs && sink->mccs_caps.freesync_supported && freesync_capable) - dm_helpers_mccs_vcp_set(adev->dm.dc->ctx, amdgpu_dm_connector->dc_link, sink); + if (sink->mccs_caps.freesync_supported && freesync_capable) + dm_helpers_mccs_vcp_set(adev->dm.dc->ctx, amdgpu_dm_connector->dc_link, sink); + } update: if (dm_con_state) -- cgit v1.2.3 From 9117d8be850baf0f89b65ff399442fb59b1a1beb Mon Sep 17 00:00:00 2001 From: Yunxiang Li Date: Wed, 27 May 2026 18:05:37 -0400 Subject: drm/amdgpu/gfx: move fault and EOP IRQ get/put to hw_init/hw_fini priv_reg / priv_inst / bad_op and (on v11+) userq EOP IRQs are acquired in late_init but released in hw_fini. This split forced gfx_v9_0_hw_fini() to defensively guard each put with amdgpu_irq_enabled() because hw_fini runs on paths that may not reach late_init. 
amdgpu_ip_block_hw_fini() only runs after hw_init returns success, and suspend / resume cycle the refs through the same path, so hw_init / hw_fini pair without any extra tracking. Move the gets there and drop the guards. While here, fix the pre-existing partial-failure leak in set_userq_eop_interrupts() (gfx11 / 12_0 / 12_1). amdgpu_irq_get() increments the refcount before calling .set, so a failure partway through the loop leaves earlier successful gets stranded. Track the loop position and roll back on the enable path. Signed-off-by: Yunxiang Li Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 43 ++++----- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 162 +++++++++++++++++++------------- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 162 +++++++++++++++++++------------- drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c | 114 ++++++++++++---------- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 34 ++++--- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 35 ++++--- 6 files changed, 314 insertions(+), 236 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 58c69dcb527f..0780c5e5de4f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -7530,6 +7530,24 @@ static int gfx_v10_0_hw_init(struct amdgpu_ip_block *ip_block) if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0) && !amdgpu_sriov_vf(adev)) gfx_v10_3_set_power_brake_sequence(adev); + r = amdgpu_irq_get(adev, &adev->gfx.priv_reg_irq, 0); + if (r) + return r; + + r = amdgpu_irq_get(adev, &adev->gfx.priv_inst_irq, 0); + if (r) + goto err_priv_inst; + + r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0); + if (r) + goto err_bad_op; + + return 0; + +err_bad_op: + amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); +err_priv_inst: + amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); return r; } @@ -7539,9 +7557,9 @@ static int gfx_v10_0_hw_fini(struct amdgpu_ip_block *ip_block) 
cancel_delayed_work_sync(&adev->gfx.idle_work); - amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); - amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0); + amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); + amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); /* WA added for Vangogh asic fixing the SMU suspend failure * It needs to set power gating again during gfxoff control @@ -7837,26 +7855,6 @@ static int gfx_v10_0_early_init(struct amdgpu_ip_block *ip_block) return gfx_v10_0_init_microcode(adev); } -static int gfx_v10_0_late_init(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - int r; - - r = amdgpu_irq_get(adev, &adev->gfx.priv_reg_irq, 0); - if (r) - return r; - - r = amdgpu_irq_get(adev, &adev->gfx.priv_inst_irq, 0); - if (r) - return r; - - r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0); - if (r) - return r; - - return 0; -} - static bool gfx_v10_0_is_rlc_enabled(struct amdgpu_device *adev) { uint32_t rlc_cntl; @@ -9805,7 +9803,6 @@ static void gfx_v10_0_ring_end_use(struct amdgpu_ring *ring) static const struct amd_ip_funcs gfx_v10_0_ip_funcs = { .name = "gfx_v10_0", .early_init = gfx_v10_0_early_init, - .late_init = gfx_v10_0_late_init, .sw_init = gfx_v10_0_sw_init, .sw_fini = gfx_v10_0_sw_fini, .hw_init = gfx_v10_0_hw_init, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 1941bfbcbfbf..f856b0cf5bec 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -4814,6 +4814,78 @@ static void gfx_v11_0_disable_gpa_mode(struct amdgpu_device *adev) WREG32_SOC15(GC, 0, regCPG_PSP_DEBUG, data); } +static int gfx_v11_0_set_userq_eop_interrupts(struct amdgpu_device *adev, + bool enable) +{ + unsigned int irq_type; + int m, p, r; + + if (adev->userq_funcs[AMDGPU_HW_IP_GFX]) { + for (m = 0; m < adev->gfx.me.num_me; m++) { + for (p = 0; p < adev->gfx.me.num_pipe_per_me; p++) { + irq_type = 
AMDGPU_CP_IRQ_GFX_ME0_PIPE0_EOP + p; + if (enable) + r = amdgpu_irq_get(adev, &adev->gfx.eop_irq, irq_type); + else + r = amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type); + if (r) { + if (!enable) + return r; + goto err_gfx; + } + } + } + } + + if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]) { + for (m = 0; m < adev->gfx.mec.num_mec; ++m) { + for (p = 0; p < adev->gfx.mec.num_pipe_per_mec; p++) { + irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP + + (m * adev->gfx.mec.num_pipe_per_mec) + + p; + if (enable) + r = amdgpu_irq_get(adev, &adev->gfx.eop_irq, irq_type); + else + r = amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type); + if (r) { + if (!enable) + return r; + goto err_compute; + } + } + } + } + + return 0; + +err_compute: + for (p--; p >= 0; p--) { + irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP + + (m * adev->gfx.mec.num_pipe_per_mec) + p; + amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type); + } + for (m--; m >= 0; m--) { + for (p = adev->gfx.mec.num_pipe_per_mec - 1; p >= 0; p--) { + irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP + + (m * adev->gfx.mec.num_pipe_per_mec) + p; + amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type); + } + } + m = adev->gfx.me.num_me; +err_gfx: + for (p--; p >= 0; p--) { + irq_type = AMDGPU_CP_IRQ_GFX_ME0_PIPE0_EOP + p; + amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type); + } + for (m--; m >= 0; m--) { + for (p = adev->gfx.me.num_pipe_per_me - 1; p >= 0; p--) { + irq_type = AMDGPU_CP_IRQ_GFX_ME0_PIPE0_EOP + p; + amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type); + } + } + return r; +} + static int gfx_v11_0_hw_init(struct amdgpu_ip_block *ip_block) { int r; @@ -4911,50 +4983,31 @@ static int gfx_v11_0_hw_init(struct amdgpu_ip_block *ip_block) if (!adev->gfx.imu_fw_version) adev->gfx.imu_fw_version = RREG32_SOC15(GC, 0, regGFX_IMU_SCRATCH_0); - return r; -} + r = amdgpu_irq_get(adev, &adev->gfx.priv_reg_irq, 0); + if (r) + return r; -static int gfx_v11_0_set_userq_eop_interrupts(struct amdgpu_device *adev, - bool enable) -{ 
- unsigned int irq_type; - int m, p, r; + r = amdgpu_irq_get(adev, &adev->gfx.priv_inst_irq, 0); + if (r) + goto err_priv_inst; - if (adev->userq_funcs[AMDGPU_HW_IP_GFX]) { - for (m = 0; m < adev->gfx.me.num_me; m++) { - for (p = 0; p < adev->gfx.me.num_pipe_per_me; p++) { - irq_type = AMDGPU_CP_IRQ_GFX_ME0_PIPE0_EOP + p; - if (enable) - r = amdgpu_irq_get(adev, &adev->gfx.eop_irq, - irq_type); - else - r = amdgpu_irq_put(adev, &adev->gfx.eop_irq, - irq_type); - if (r) - return r; - } - } - } + r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0); + if (r) + goto err_bad_op; - if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]) { - for (m = 0; m < adev->gfx.mec.num_mec; ++m) { - for (p = 0; p < adev->gfx.mec.num_pipe_per_mec; p++) { - irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP - + (m * adev->gfx.mec.num_pipe_per_mec) - + p; - if (enable) - r = amdgpu_irq_get(adev, &adev->gfx.eop_irq, - irq_type); - else - r = amdgpu_irq_put(adev, &adev->gfx.eop_irq, - irq_type); - if (r) - return r; - } - } - } + r = gfx_v11_0_set_userq_eop_interrupts(adev, true); + if (r) + goto err_userq_eop; return 0; + +err_userq_eop: + amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0); +err_bad_op: + amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); +err_priv_inst: + amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); + return r; } static int gfx_v11_0_hw_fini(struct amdgpu_ip_block *ip_block) @@ -4963,10 +5016,10 @@ static int gfx_v11_0_hw_fini(struct amdgpu_ip_block *ip_block) cancel_delayed_work_sync(&adev->gfx.idle_work); - amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); - amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); - amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0); gfx_v11_0_set_userq_eop_interrupts(adev, false); + amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0); + amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); + amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); if (!adev->no_hw_access) { if (amdgpu_async_gfx_ring && @@ -5356,30 +5409,6 @@ static int gfx_v11_0_early_init(struct 
amdgpu_ip_block *ip_block) return gfx_v11_0_init_microcode(adev); } -static int gfx_v11_0_late_init(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - int r; - - r = amdgpu_irq_get(adev, &adev->gfx.priv_reg_irq, 0); - if (r) - return r; - - r = amdgpu_irq_get(adev, &adev->gfx.priv_inst_irq, 0); - if (r) - return r; - - r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0); - if (r) - return r; - - r = gfx_v11_0_set_userq_eop_interrupts(adev, true); - if (r) - return r; - - return 0; -} - static bool gfx_v11_0_is_rlc_enabled(struct amdgpu_device *adev) { uint32_t rlc_cntl; @@ -7211,7 +7240,6 @@ static void gfx_v11_0_ring_end_use(struct amdgpu_ring *ring) static const struct amd_ip_funcs gfx_v11_0_ip_funcs = { .name = "gfx_v11_0", .early_init = gfx_v11_0_early_init, - .late_init = gfx_v11_0_late_init, .sw_init = gfx_v11_0_sw_init, .sw_fini = gfx_v11_0_sw_fini, .hw_init = gfx_v11_0_hw_init, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index f47928dcd848..f66293fc675e 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -3655,6 +3655,78 @@ static void gfx_v12_0_init_golden_registers(struct amdgpu_device *adev) } } +static int gfx_v12_0_set_userq_eop_interrupts(struct amdgpu_device *adev, + bool enable) +{ + unsigned int irq_type; + int m, p, r; + + if (adev->userq_funcs[AMDGPU_HW_IP_GFX]) { + for (m = 0; m < adev->gfx.me.num_me; m++) { + for (p = 0; p < adev->gfx.me.num_pipe_per_me; p++) { + irq_type = AMDGPU_CP_IRQ_GFX_ME0_PIPE0_EOP + p; + if (enable) + r = amdgpu_irq_get(adev, &adev->gfx.eop_irq, irq_type); + else + r = amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type); + if (r) { + if (!enable) + return r; + goto err_gfx; + } + } + } + } + + if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]) { + for (m = 0; m < adev->gfx.mec.num_mec; ++m) { + for (p = 0; p < adev->gfx.mec.num_pipe_per_mec; p++) { + irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP 
+ + (m * adev->gfx.mec.num_pipe_per_mec) + + p; + if (enable) + r = amdgpu_irq_get(adev, &adev->gfx.eop_irq, irq_type); + else + r = amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type); + if (r) { + if (!enable) + return r; + goto err_compute; + } + } + } + } + + return 0; + +err_compute: + for (p--; p >= 0; p--) { + irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP + + (m * adev->gfx.mec.num_pipe_per_mec) + p; + amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type); + } + for (m--; m >= 0; m--) { + for (p = adev->gfx.mec.num_pipe_per_mec - 1; p >= 0; p--) { + irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP + + (m * adev->gfx.mec.num_pipe_per_mec) + p; + amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type); + } + } + m = adev->gfx.me.num_me; +err_gfx: + for (p--; p >= 0; p--) { + irq_type = AMDGPU_CP_IRQ_GFX_ME0_PIPE0_EOP + p; + amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type); + } + for (m--; m >= 0; m--) { + for (p = adev->gfx.me.num_pipe_per_me - 1; p >= 0; p--) { + irq_type = AMDGPU_CP_IRQ_GFX_ME0_PIPE0_EOP + p; + amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type); + } + } + return r; +} + static int gfx_v12_0_hw_init(struct amdgpu_ip_block *ip_block) { int r; @@ -3742,50 +3814,31 @@ static int gfx_v12_0_hw_init(struct amdgpu_ip_block *ip_block) if (r) return r; - return r; -} + r = amdgpu_irq_get(adev, &adev->gfx.priv_reg_irq, 0); + if (r) + return r; -static int gfx_v12_0_set_userq_eop_interrupts(struct amdgpu_device *adev, - bool enable) -{ - unsigned int irq_type; - int m, p, r; + r = amdgpu_irq_get(adev, &adev->gfx.priv_inst_irq, 0); + if (r) + goto err_priv_inst; - if (adev->userq_funcs[AMDGPU_HW_IP_GFX]) { - for (m = 0; m < adev->gfx.me.num_me; m++) { - for (p = 0; p < adev->gfx.me.num_pipe_per_me; p++) { - irq_type = AMDGPU_CP_IRQ_GFX_ME0_PIPE0_EOP + p; - if (enable) - r = amdgpu_irq_get(adev, &adev->gfx.eop_irq, - irq_type); - else - r = amdgpu_irq_put(adev, &adev->gfx.eop_irq, - irq_type); - if (r) - return r; - } - } - } + r = amdgpu_irq_get(adev, 
&adev->gfx.bad_op_irq, 0); + if (r) + goto err_bad_op; - if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]) { - for (m = 0; m < adev->gfx.mec.num_mec; ++m) { - for (p = 0; p < adev->gfx.mec.num_pipe_per_mec; p++) { - irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP - + (m * adev->gfx.mec.num_pipe_per_mec) - + p; - if (enable) - r = amdgpu_irq_get(adev, &adev->gfx.eop_irq, - irq_type); - else - r = amdgpu_irq_put(adev, &adev->gfx.eop_irq, - irq_type); - if (r) - return r; - } - } - } + r = gfx_v12_0_set_userq_eop_interrupts(adev, true); + if (r) + goto err_userq_eop; return 0; + +err_userq_eop: + amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0); +err_bad_op: + amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); +err_priv_inst: + amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); + return r; } static int gfx_v12_0_hw_fini(struct amdgpu_ip_block *ip_block) @@ -3795,10 +3848,10 @@ static int gfx_v12_0_hw_fini(struct amdgpu_ip_block *ip_block) cancel_delayed_work_sync(&adev->gfx.idle_work); - amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); - amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); - amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0); gfx_v12_0_set_userq_eop_interrupts(adev, false); + amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0); + amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); + amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); if (!adev->no_hw_access) { if (amdgpu_async_gfx_ring) { @@ -3927,30 +3980,6 @@ static int gfx_v12_0_early_init(struct amdgpu_ip_block *ip_block) return gfx_v12_0_init_microcode(adev); } -static int gfx_v12_0_late_init(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - int r; - - r = amdgpu_irq_get(adev, &adev->gfx.priv_reg_irq, 0); - if (r) - return r; - - r = amdgpu_irq_get(adev, &adev->gfx.priv_inst_irq, 0); - if (r) - return r; - - r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0); - if (r) - return r; - - r = gfx_v12_0_set_userq_eop_interrupts(adev, true); - if (r) - return r; - - return 0; -} - static 
bool gfx_v12_0_is_rlc_enabled(struct amdgpu_device *adev) { uint32_t rlc_cntl; @@ -5440,7 +5469,6 @@ static void gfx_v12_0_ring_end_use(struct amdgpu_ring *ring) static const struct amd_ip_funcs gfx_v12_0_ip_funcs = { .name = "gfx_v12_0", .early_init = gfx_v12_0_early_init, - .late_init = gfx_v12_0_late_init, .sw_init = gfx_v12_0_sw_init, .sw_fini = gfx_v12_0_sw_fini, .hw_init = gfx_v12_0_hw_init, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c index 033f15e21ad3..61c3577f829f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_1.c @@ -2735,6 +2735,50 @@ static void gfx_v12_1_init_golden_registers(struct amdgpu_device *adev) } } +static int gfx_v12_1_set_userq_eop_interrupts(struct amdgpu_device *adev, + bool enable) +{ + unsigned int irq_type; + int m, p, r; + + if (!adev->gfx.disable_kq) + return 0; + + for (m = 0; m < adev->gfx.mec.num_mec; ++m) { + for (p = 0; p < adev->gfx.mec.num_pipe_per_mec; p++) { + irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP + + (m * adev->gfx.mec.num_pipe_per_mec) + + p; + if (enable) + r = amdgpu_irq_get(adev, &adev->gfx.eop_irq, irq_type); + else + r = amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type); + if (r) { + if (!enable) + return r; + goto err_unwind; + } + } + } + + return 0; + +err_unwind: + for (p--; p >= 0; p--) { + irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP + + (m * adev->gfx.mec.num_pipe_per_mec) + p; + amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type); + } + for (m--; m >= 0; m--) { + for (p = adev->gfx.mec.num_pipe_per_mec - 1; p >= 0; p--) { + irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP + + (m * adev->gfx.mec.num_pipe_per_mec) + p; + amdgpu_irq_put(adev, &adev->gfx.eop_irq, irq_type); + } + } + return r; +} + static int gfx_v12_1_hw_init(struct amdgpu_ip_block *ip_block) { int r, i, num_xcc; @@ -2803,6 +2847,24 @@ static int gfx_v12_1_hw_init(struct amdgpu_ip_block *ip_block) if (r) return r; + r = amdgpu_irq_get(adev, 
&adev->gfx.priv_reg_irq, 0); + if (r) + return r; + + r = amdgpu_irq_get(adev, &adev->gfx.priv_inst_irq, 0); + if (r) + goto err_priv_inst; + + r = gfx_v12_1_set_userq_eop_interrupts(adev, true); + if (r) + goto err_userq_eop; + + return 0; + +err_userq_eop: + amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); +err_priv_inst: + amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); return r; } @@ -2828,41 +2890,14 @@ static void gfx_v12_1_xcc_fini(struct amdgpu_device *adev, gfx_v12_1_xcc_enable_gui_idle_interrupt(adev, false, xcc_id); } -static int gfx_v12_1_set_userq_eop_interrupts(struct amdgpu_device *adev, - bool enable) -{ - unsigned int irq_type; - int m, p, r; - - if (adev->gfx.disable_kq) { - for (m = 0; m < adev->gfx.mec.num_mec; ++m) { - for (p = 0; p < adev->gfx.mec.num_pipe_per_mec; p++) { - irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP - + (m * adev->gfx.mec.num_pipe_per_mec) - + p; - if (enable) - r = amdgpu_irq_get(adev, &adev->gfx.eop_irq, - irq_type); - else - r = amdgpu_irq_put(adev, &adev->gfx.eop_irq, - irq_type); - if (r) - return r; - } - } - } - - return 0; -} - static int gfx_v12_1_hw_fini(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; int i, num_xcc; - amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); - amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); gfx_v12_1_set_userq_eop_interrupts(adev, false); + amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); + amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); num_xcc = NUM_XCC(adev->gfx.xcc_mask); for (i = 0; i < num_xcc; i++) { @@ -2963,26 +2998,6 @@ static int gfx_v12_1_early_init(struct amdgpu_ip_block *ip_block) return gfx_v12_1_init_microcode(adev); } -static int gfx_v12_1_late_init(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - int r; - - r = amdgpu_irq_get(adev, &adev->gfx.priv_reg_irq, 0); - if (r) - return r; - - r = amdgpu_irq_get(adev, &adev->gfx.priv_inst_irq, 0); - if (r) - return r; - - r = 
gfx_v12_1_set_userq_eop_interrupts(adev, true); - if (r) - return r; - - return 0; -} - static bool gfx_v12_1_is_rlc_enabled(struct amdgpu_device *adev) { uint32_t rlc_cntl; @@ -3876,7 +3891,6 @@ static void gfx_v12_1_emit_mem_sync(struct amdgpu_ring *ring) static const struct amd_ip_funcs gfx_v12_1_ip_funcs = { .name = "gfx_v12_1", .early_init = gfx_v12_1_early_init, - .late_init = gfx_v12_1_late_init, .sw_init = gfx_v12_1_sw_init, .sw_fini = gfx_v12_1_sw_fini, .hw_init = gfx_v12_1_hw_init, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 60376d43e81d..47721d0c3781 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4050,6 +4050,24 @@ static int gfx_v9_0_hw_init(struct amdgpu_ip_block *ip_block) !amdgpu_sriov_vf(adev)) gfx_v9_4_2_set_power_brake_sequence(adev); + r = amdgpu_irq_get(adev, &adev->gfx.priv_reg_irq, 0); + if (r) + return r; + + r = amdgpu_irq_get(adev, &adev->gfx.priv_inst_irq, 0); + if (r) + goto err_priv_inst; + + r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0); + if (r) + goto err_bad_op; + + return 0; + +err_bad_op: + amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); +err_priv_inst: + amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); return r; } @@ -4057,9 +4075,9 @@ static int gfx_v9_0_hw_fini(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; - amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); - amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0); + amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); + amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); /* DF freeze and kcq disable will fail */ if (!amdgpu_ras_intr_triggered()) @@ -4860,18 +4878,6 @@ static int gfx_v9_0_late_init(struct amdgpu_ip_block *ip_block) struct amdgpu_device *adev = ip_block->adev; int r; - r = amdgpu_irq_get(adev, &adev->gfx.priv_reg_irq, 0); - if (r) - return r; - - r = amdgpu_irq_get(adev, 
&adev->gfx.priv_inst_irq, 0); - if (r) - return r; - - r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0); - if (r) - return r; - r = gfx_v9_0_ecc_late_init(ip_block); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 9f76e1af8a55..510266ba0c38 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -2371,6 +2371,24 @@ static int gfx_v9_4_3_hw_init(struct amdgpu_ip_block *ip_block) if (r) return r; + r = amdgpu_irq_get(adev, &adev->gfx.priv_reg_irq, 0); + if (r) + return r; + + r = amdgpu_irq_get(adev, &adev->gfx.priv_inst_irq, 0); + if (r) + goto err_priv_inst; + + r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0); + if (r) + goto err_bad_op; + + return 0; + +err_bad_op: + amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); +err_priv_inst: + amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); return r; } @@ -2446,9 +2464,9 @@ static int gfx_v9_4_3_hw_fini(struct amdgpu_ip_block *ip_block) if (adev->psp.ptl.hw_supported && !amdgpu_in_reset(adev)) gfx_v9_4_3_perf_monitor_ptl_init(adev, false); - amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); - amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0); + amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); + amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); num_xcc = NUM_XCC(adev->gfx.xcc_mask); for (i = 0; i < num_xcc; i++) { @@ -2611,19 +2629,6 @@ static int gfx_v9_4_3_early_init(struct amdgpu_ip_block *ip_block) static int gfx_v9_4_3_late_init(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; - int r; - - r = amdgpu_irq_get(adev, &adev->gfx.priv_reg_irq, 0); - if (r) - return r; - - r = amdgpu_irq_get(adev, &adev->gfx.priv_inst_irq, 0); - if (r) - return r; - - r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0); - if (r) - return r; if (adev->gfx.ras && adev->gfx.ras->enable_watchdog_timer) -- cgit v1.2.3 From 
56ae73c92e200e630c2bdf1e98c88b86c8483b37 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 6 May 2026 16:50:42 -0400 Subject: drm/amdkfd: always resume_all after suspend_all Need to restore any good queues even if the suspend_all failed for some. Always run remove_queue as that will schedule a GPU reset if removing the queue fails. v2: move resume_all after remove Fixes: eb067d65c33e ("drm/amdkfd: Update BadOpcode Interrupt handling with MES") Reviewed-by: Amber Lin Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 5150511cefc5..2e010c1f8828 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -3264,32 +3264,24 @@ int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbel list_for_each_entry(q, &qpd->queues_list, list) { if (q->doorbell_id == doorbell_id && q->properties.is_active) { - ret = suspend_all_queues_mes(dqm); - if (ret) { - dev_err(dev, "Suspending all queues failed"); - goto out; - } + /* suspend all queues will save any good queues and mark the rest as bad */ + suspend_all_queues_mes(dqm); q->properties.is_evicted = true; q->properties.is_active = false; decrement_queue_count(dqm, qpd, q); + /* this will remove the bad queue and sched a GPU reset if needed */ ret = remove_queue_mes(dqm, q, qpd); - if (ret) { - dev_err(dev, "Removing bad queue failed"); - goto out; - } - - ret = resume_all_queues_mes(dqm); if (ret) - dev_err(dev, "Resuming all queues failed"); - + dev_err(dev, "Removing bad queue failed"); + /* resume the good queues */ + resume_all_queues_mes(dqm); break; } } } -out: dqm_unlock(dqm); kfd_unref_process(p); return ret; -- cgit v1.2.3