From ff69bba05f085cd6d4277c27ac7600160167b384 Mon Sep 17 00:00:00 2001 From: Boyuan Zhang Date: Wed, 2 Oct 2024 23:52:01 -0400 Subject: drm/amd/pm: add inst to dpm_set_powergating_by_smu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an instance parameter to amdgpu_dpm_set_powergating_by_smu() function, and use the instance to call set_powergating_by_smu(). v2: remove duplicated functions. remove for-loop in amdgpu_dpm_set_powergating_by_smu(), and temporarily move it to amdgpu_dpm_enable_vcn(), in order to keep the exact same logic as before, until further separation in next patch. v3: drop SI logic in amdgpu_dpm_enable_vcn(). Signed-off-by: Boyuan Zhang Acked-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 69a6b6dba0a5..e54f42e3797e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -806,7 +806,7 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable) /* If going to s2idle, no need to wait */ if (adev->in_s0ix) { if (!amdgpu_dpm_set_powergating_by_smu(adev, - AMD_IP_BLOCK_TYPE_GFX, true)) + AMD_IP_BLOCK_TYPE_GFX, true, 0)) adev->gfx.gfx_off_state = true; } else { schedule_delayed_work(&adev->gfx.gfx_off_delay_work, @@ -818,7 +818,7 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable) cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work); if (adev->gfx.gfx_off_state && - !amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) { + !amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false, 0)) { adev->gfx.gfx_off_state = false; if (adev->gfx.funcs->init_spm_golden) { -- cgit v1.2.3 From a69f4cc278fe5285f1f42562904f4472955c20ea Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Fri, 29 Nov 2024 19:20:03 +0530 Subject: drm/amd/amdgpu: Add Descriptions to Process Isolation and Cleaner Shader Sysfs Functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This update adds explanations to key functions related to process isolation and cleaner shader execution sysfs interfaces. - `amdgpu_gfx_set_run_cleaner_shader`: Describes how to manually run a cleaner shader, which clears the Local Data Store (LDS) and General Purpose Registers (GPRs) to ensure data isolation between GPU workloads. - `amdgpu_gfx_get_enforce_isolation`: Describes how to query the current settings of the 'enforce_isolation' feature for each GPU partition. - `amdgpu_gfx_set_enforce_isolation`: Describes how to enable or disable process isolation for GPU partitions through the sysfs interface. Cc: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Suggested-by: Alex Deucher Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 45 +++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index e54f42e3797e..7e2ad7818903 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1484,6 +1484,24 @@ static int amdgpu_gfx_run_cleaner_shader(struct amdgpu_device *adev, int xcp_id) return 0; } +/** + * amdgpu_gfx_set_run_cleaner_shader - Execute the AMDGPU GFX Cleaner Shader + * @dev: The device structure + * @attr: The device attribute structure + * @buf: The buffer containing the input data + * @count: The size of the input data + * + * Provides the sysfs interface to manually run a cleaner shader, which is + * used to clear the GPU state between different tasks. Writing a value to the + * 'run_cleaner_shader' sysfs file triggers the cleaner shader execution. + * The value written corresponds to the partition index on multi-partition + * devices. On single-partition devices, the value should be '0'. + * + * The cleaner shader clears the Local Data Store (LDS) and General Purpose + * Registers (GPRs) to ensure data isolation between GPU workloads. + * + * Return: The number of bytes written to the sysfs file. + */ static ssize_t amdgpu_gfx_set_run_cleaner_shader(struct device *dev, struct device_attribute *attr, const char *buf, @@ -1532,6 +1550,19 @@ static ssize_t amdgpu_gfx_set_run_cleaner_shader(struct device *dev, return count; } +/** + * amdgpu_gfx_get_enforce_isolation - Query AMDGPU GFX Enforce Isolation Settings + * @dev: The device structure + * @attr: The device attribute structure + * @buf: The buffer to store the output data + * + * Provides the sysfs read interface to get the current settings of the 'enforce_isolation' + * feature for each GPU partition. Reading from the 'enforce_isolation' + * sysfs file returns the isolation settings for all partitions, where '0' + * indicates disabled and '1' indicates enabled. + * + * Return: The number of bytes read from the sysfs file. + */ static ssize_t amdgpu_gfx_get_enforce_isolation(struct device *dev, struct device_attribute *attr, char *buf) @@ -1555,6 +1586,20 @@ static ssize_t amdgpu_gfx_get_enforce_isolation(struct device *dev, return size; } +/** + * amdgpu_gfx_set_enforce_isolation - Control AMDGPU GFX Enforce Isolation + * @dev: The device structure + * @attr: The device attribute structure + * @buf: The buffer containing the input data + * @count: The size of the input data + * + * This function allows control over the 'enforce_isolation' feature, which + * serializes access to the graphics engine. Writing '1' or '0' to the + * 'enforce_isolation' sysfs file enables or disables process isolation for + * each partition. The input should specify the setting for all partitions. + * + * Return: The number of bytes written to the sysfs file. + */ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) -- cgit v1.2.3 From 55f4139b6598bc1f8e2fca2181c2749bca84ffee Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Thu, 28 Nov 2024 01:11:36 +0530 Subject: drm/amd/amdgpu: Add Annotations to Process Isolation functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This update adds explanations to key functions that manage how the Kernel Fusion Driver (KFD) and Kernel Graphics Driver (KGD) share the GPU. amdgpu_gfx_enforce_isolation_wait_for_kfd: Controls the waiting period for KFD to ensure it takes turns with KGD in using the GPU. It uses a mutex to safely manage shared data, like timing and state, and tracks when KFD starts and stops waiting. amdgpu_gfx_enforce_isolation_ring_begin_use: Ensures KFD has enough time to run before new tasks are submitted to the GPU ring. It uses a mutex to synchronize access and may adjust the KFD scheduler. amdgpu_gfx_enforce_isolation_ring_end_use: Handles cleanup and state updates when finishing the use of a GPU ring. It may also adjust the KFD scheduler, using a mutex to manage shared data access. Cc: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Suggested-by: Alex Deucher Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 7e2ad7818903..a4dde54512b1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1985,6 +1985,17 @@ void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work) mutex_unlock(&adev->enforce_isolation_mutex); } +/** + * amdgpu_gfx_enforce_isolation_wait_for_kfd - Manage KFD wait period for process isolation + * @adev: amdgpu_device pointer + * @idx: Index of the GPU partition + * + * When kernel submissions come in, the jobs are given a time slice and once + * that time slice is up, if there are KFD user queues active, kernel + * submissions are blocked until KFD has had its time slice. Once the KFD time + * slice is up, KFD user queues are preempted and kernel submissions are + * unblocked and allowed to run again. + */ static void amdgpu_gfx_enforce_isolation_wait_for_kfd(struct amdgpu_device *adev, u32 idx) @@ -2030,6 +2041,15 @@ amdgpu_gfx_enforce_isolation_wait_for_kfd(struct amdgpu_device *adev, msleep(GFX_SLICE_PERIOD_MS); } +/** + * amdgpu_gfx_enforce_isolation_ring_begin_use - Begin use of a ring with enforced isolation + * @ring: Pointer to the amdgpu_ring structure + * + * Ring begin_use helper implementation for gfx which serializes access to the + * gfx IP between kernel submission IOCTLs and KFD user queues when isolation + * enforcement is enabled. The kernel submission IOCTLs and KFD user queues + * each get a time slice when both are active. + */ void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring) { struct amdgpu_device *adev = ring->adev; @@ -2057,6 +2077,15 @@ void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring) mutex_unlock(&adev->enforce_isolation_mutex); } +/** + * amdgpu_gfx_enforce_isolation_ring_end_use - End use of a ring with enforced isolation + * @ring: Pointer to the amdgpu_ring structure + * + * Ring end_use helper implementation for gfx which serializes access to the + * gfx IP between kernel submission IOCTLs and KFD user queues when isolation + * enforcement is enabled. The kernel submission IOCTLs and KFD user queues + * each get a time slice when both are active. + */ void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring) { struct amdgpu_device *adev = ring->adev; -- cgit v1.2.3 From 34c4eb7d4e0cd443399a0f114d467d2b3ff05419 Mon Sep 17 00:00:00 2001 From: Karol Przybylski Date: Sun, 15 Dec 2024 13:28:57 +0100 Subject: drm/amdgpu: Fix potential integer overflow in scheduler mask calculations The use of 1 << i in scheduler mask calculations can result in an unintentional integer overflow due to the expression being evaluated as a 32-bit signed integer. This patch replaces 1 << i with 1ULL << i to ensure the operation is performed as a 64-bit unsigned integer, preventing overflow Discovered in coverity scan, CID 1636393, 1636175, 1636007, 1635853 Fixes: c5c63d9cb5d3 ("drm/amdgpu: add amdgpu_gfx_sched_mask and amdgpu_compute_sched_mask debugfs") Signed-off-by: Karol Przybylski Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index a4dde54512b1..4a4e40dd25d6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -2124,7 +2124,7 @@ static int amdgpu_debugfs_gfx_sched_mask_set(void *data, u64 val) if (!adev) return -ENODEV; - mask = (1 << adev->gfx.num_gfx_rings) - 1; + mask = (1ULL << adev->gfx.num_gfx_rings) - 1; if ((val & mask) == 0) return -EINVAL; @@ -2152,7 +2152,7 @@ static int amdgpu_debugfs_gfx_sched_mask_get(void *data, u64 *val) for (i = 0; i < adev->gfx.num_gfx_rings; ++i) { ring = &adev->gfx.gfx_ring[i]; if (ring->sched.ready) - mask |= 1 << i; + mask |= 1ULL << i; } *val = mask; @@ -2194,7 +2194,7 @@ static int amdgpu_debugfs_compute_sched_mask_set(void *data, u64 val) if (!adev) return -ENODEV; - mask = (1 << adev->gfx.num_compute_rings) - 1; + mask = (1ULL << adev->gfx.num_compute_rings) - 1; if ((val & mask) == 0) return -EINVAL; @@ -2223,7 +2223,7 @@ static int amdgpu_debugfs_compute_sched_mask_get(void *data, u64 *val) for (i = 0; i < adev->gfx.num_compute_rings; ++i) { ring = &adev->gfx.compute_ring[i]; if (ring->sched.ready) - mask |= 1 << i; + mask |= 1ULL << i; } *val = mask; -- cgit v1.2.3 From 11815bb0e30966321ff4351b55ad7b6f2e0a63bf Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 12 Dec 2024 16:51:04 +0100 Subject: drm/amdgpu: partially revert "reduce reset time" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This partially reverts commit 194eb174cbe4fe2b3376ac30acca2dc8c8beca00. This commit introduced a new state variable into adev without even remotely worrying about CPU barriers. Since we already have the amdgpu_in_reset() function exactly for this use case partially revert that. Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 3 --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 2 +- 5 files changed, 4 insertions(+), 8 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 9f5351c013ff..69895fccb474 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1175,7 +1175,6 @@ struct amdgpu_device { struct work_struct reset_work; - bool job_hang; bool dc_enabled; /* Mask of active clusters */ uint32_t aid_mask; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index de30143ea51b..2e5732dfd425 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -836,7 +836,7 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off, if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) return -EINVAL; - if (!kiq_ring->sched.ready || adev->job_hang) + if (!kiq_ring->sched.ready || amdgpu_in_reset(adev)) return 0; ring_funcs = kzalloc(sizeof(*ring_funcs), GFP_KERNEL); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 4a4e40dd25d6..6d5d81f0dc4e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -515,7 +515,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id) if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) return -EINVAL; - if (!kiq_ring->sched.ready || adev->job_hang || amdgpu_in_reset(adev)) + if (!kiq_ring->sched.ready || amdgpu_in_reset(adev)) return 0; spin_lock(&kiq->ring_lock); @@ -567,7 +567,7 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev, int xcc_id) if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) return -EINVAL; - if (!adev->gfx.kiq[0].ring.sched.ready || adev->job_hang) + if (!adev->gfx.kiq[0].ring.sched.ready || amdgpu_in_reset(adev)) return 0; if (amdgpu_gfx_is_master_xcc(adev, xcc_id)) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 04691753cabf..100f04475943 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -102,8 +102,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) return DRM_GPU_SCHED_STAT_ENODEV; } - adev->job_hang = true; - /* * Do the coredump immediately after a job timeout to get a very * close dump/snapshot/representation of GPU's current error status @@ -181,7 +179,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) } exit: - adev->job_hang = false; drm_dev_exit(idx); return DRM_GPU_SCHED_STAT_NOMINAL; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 9484f3b5a9b7..003522c2d902 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -5957,7 +5957,7 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable) else WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp); - if (adev->job_hang && !enable) + if (amdgpu_in_reset(adev) && !enable) return 0; for (i = 0; i < adev->usec_timeout; i++) { -- cgit v1.2.3 From 1e8c193f8ca7ab7dff4f4747b45a55dca23c00f4 Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Thu, 9 Jan 2025 21:33:51 +0530 Subject: drm/amdgpu: Fix Circular Locking Dependency in AMDGPU GFX Isolation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit addresses a circular locking dependency issue within the GFX isolation mechanism. The problem was identified by a warning indicating a potential deadlock due to inconsistent lock acquisition order. - The `amdgpu_gfx_enforce_isolation_ring_begin_use` and `amdgpu_gfx_enforce_isolation_ring_end_use` functions previously acquired `enforce_isolation_mutex` and called `amdgpu_gfx_kfd_sch_ctrl`, leading to potential deadlocks. ie., If `amdgpu_gfx_kfd_sch_ctrl` is called while `enforce_isolation_mutex` is held, and `amdgpu_gfx_enforce_isolation_handler` is called while `kfd_sch_mutex` is held, it can create a circular dependency. By ensuring consistent lock usage, this fix resolves the issue: [ 606.297333] ====================================================== [ 606.297343] WARNING: possible circular locking dependency detected [ 606.297353] 6.10.0-amd-mlkd-610-311224-lof #19 Tainted: G OE [ 606.297365] ------------------------------------------------------ [ 606.297375] kworker/u96:3/3825 is trying to acquire lock: [ 606.297385] ffff9aa64e431cb8 ((work_completion)(&(&adev->gfx.enforce_isolation[i].work)->work)){+.+.}-{0:0}, at: __flush_work+0x232/0x610 [ 606.297413] but task is already holding lock: [ 606.297423] ffff9aa64e432338 (&adev->gfx.kfd_sch_mutex){+.+.}-{3:3}, at: amdgpu_gfx_kfd_sch_ctrl+0x51/0x4d0 [amdgpu] [ 606.297725] which lock already depends on the new lock. [ 606.297738] the existing dependency chain (in reverse order) is: [ 606.297749] -> #2 (&adev->gfx.kfd_sch_mutex){+.+.}-{3:3}: [ 606.297765] __mutex_lock+0x85/0x930 [ 606.297776] mutex_lock_nested+0x1b/0x30 [ 606.297786] amdgpu_gfx_kfd_sch_ctrl+0x51/0x4d0 [amdgpu] [ 606.298007] amdgpu_gfx_enforce_isolation_ring_begin_use+0x2a4/0x5d0 [amdgpu] [ 606.298225] amdgpu_ring_alloc+0x48/0x70 [amdgpu] [ 606.298412] amdgpu_ib_schedule+0x176/0x8a0 [amdgpu] [ 606.298603] amdgpu_job_run+0xac/0x1e0 [amdgpu] [ 606.298866] drm_sched_run_job_work+0x24f/0x430 [gpu_sched] [ 606.298880] process_one_work+0x21e/0x680 [ 606.298890] worker_thread+0x190/0x350 [ 606.298899] kthread+0xe7/0x120 [ 606.298908] ret_from_fork+0x3c/0x60 [ 606.298919] ret_from_fork_asm+0x1a/0x30 [ 606.298929] -> #1 (&adev->enforce_isolation_mutex){+.+.}-{3:3}: [ 606.298947] __mutex_lock+0x85/0x930 [ 606.298956] mutex_lock_nested+0x1b/0x30 [ 606.298966] amdgpu_gfx_enforce_isolation_handler+0x87/0x370 [amdgpu] [ 606.299190] process_one_work+0x21e/0x680 [ 606.299199] worker_thread+0x190/0x350 [ 606.299208] kthread+0xe7/0x120 [ 606.299217] ret_from_fork+0x3c/0x60 [ 606.299227] ret_from_fork_asm+0x1a/0x30 [ 606.299236] -> #0 ((work_completion)(&(&adev->gfx.enforce_isolation[i].work)->work)){+.+.}-{0:0}: [ 606.299257] __lock_acquire+0x16f9/0x2810 [ 606.299267] lock_acquire+0xd1/0x300 [ 606.299276] __flush_work+0x250/0x610 [ 606.299286] cancel_delayed_work_sync+0x71/0x80 [ 606.299296] amdgpu_gfx_kfd_sch_ctrl+0x287/0x4d0 [amdgpu] [ 606.299509] amdgpu_gfx_enforce_isolation_ring_begin_use+0x2a4/0x5d0 [amdgpu] [ 606.299723] amdgpu_ring_alloc+0x48/0x70 [amdgpu] [ 606.299909] amdgpu_ib_schedule+0x176/0x8a0 [amdgpu] [ 606.300101] amdgpu_job_run+0xac/0x1e0 [amdgpu] [ 606.300355] drm_sched_run_job_work+0x24f/0x430 [gpu_sched] [ 606.300369] process_one_work+0x21e/0x680 [ 606.300378] worker_thread+0x190/0x350 [ 606.300387] kthread+0xe7/0x120 [ 606.300396] ret_from_fork+0x3c/0x60 [ 606.300406] ret_from_fork_asm+0x1a/0x30 [ 606.300416] other info that might help us debug this: [ 606.300428] Chain exists of: (work_completion)(&(&adev->gfx.enforce_isolation[i].work)->work) --> &adev->enforce_isolation_mutex --> &adev->gfx.kfd_sch_mutex [ 606.300458] Possible unsafe locking scenario: [ 606.300468] CPU0 CPU1 [ 606.300476] ---- ---- [ 606.300484] lock(&adev->gfx.kfd_sch_mutex); [ 606.300494] lock(&adev->enforce_isolation_mutex); [ 606.300508] lock(&adev->gfx.kfd_sch_mutex); [ 606.300521] lock((work_completion)(&(&adev->gfx.enforce_isolation[i].work)->work)); [ 606.300536] *** DEADLOCK *** [ 606.300546] 5 locks held by kworker/u96:3/3825: [ 606.300555] #0: ffff9aa5aa1f5d58 ((wq_completion)comp_1.1.0){+.+.}-{0:0}, at: process_one_work+0x3f5/0x680 [ 606.300577] #1: ffffaa53c3c97e40 ((work_completion)(&sched->work_run_job)){+.+.}-{0:0}, at: process_one_work+0x1d6/0x680 [ 606.300600] #2: ffff9aa64e463c98 (&adev->enforce_isolation_mutex){+.+.}-{3:3}, at: amdgpu_gfx_enforce_isolation_ring_begin_use+0x1c3/0x5d0 [amdgpu] [ 606.300837] #3: ffff9aa64e432338 (&adev->gfx.kfd_sch_mutex){+.+.}-{3:3}, at: amdgpu_gfx_kfd_sch_ctrl+0x51/0x4d0 [amdgpu] [ 606.301062] #4: ffffffff8c1a5660 (rcu_read_lock){....}-{1:2}, at: __flush_work+0x70/0x610 [ 606.301083] stack backtrace: [ 606.301092] CPU: 14 PID: 3825 Comm: kworker/u96:3 Tainted: G OE 6.10.0-amd-mlkd-610-311224-lof #19 [ 606.301109] Hardware name: Gigabyte Technology Co., Ltd. X570S GAMING X/X570S GAMING X, BIOS F7 03/22/2024 [ 606.301124] Workqueue: comp_1.1.0 drm_sched_run_job_work [gpu_sched] [ 606.301140] Call Trace: [ 606.301146] [ 606.301154] dump_stack_lvl+0x9b/0xf0 [ 606.301166] dump_stack+0x10/0x20 [ 606.301175] print_circular_bug+0x26c/0x340 [ 606.301187] check_noncircular+0x157/0x170 [ 606.301197] ? register_lock_class+0x48/0x490 [ 606.301213] __lock_acquire+0x16f9/0x2810 [ 606.301230] lock_acquire+0xd1/0x300 [ 606.301239] ? __flush_work+0x232/0x610 [ 606.301250] ? srso_alias_return_thunk+0x5/0xfbef5 [ 606.301261] ? mark_held_locks+0x54/0x90 [ 606.301274] ? __flush_work+0x232/0x610 [ 606.301284] __flush_work+0x250/0x610 [ 606.301293] ? __flush_work+0x232/0x610 [ 606.301305] ? __pfx_wq_barrier_func+0x10/0x10 [ 606.301318] ? mark_held_locks+0x54/0x90 [ 606.301331] ? srso_alias_return_thunk+0x5/0xfbef5 [ 606.301345] cancel_delayed_work_sync+0x71/0x80 [ 606.301356] amdgpu_gfx_kfd_sch_ctrl+0x287/0x4d0 [amdgpu] [ 606.301661] amdgpu_gfx_enforce_isolation_ring_begin_use+0x2a4/0x5d0 [amdgpu] [ 606.302050] ? srso_alias_return_thunk+0x5/0xfbef5 [ 606.302069] amdgpu_ring_alloc+0x48/0x70 [amdgpu] [ 606.302452] amdgpu_ib_schedule+0x176/0x8a0 [amdgpu] [ 606.302862] ? drm_sched_entity_error+0x82/0x190 [gpu_sched] [ 606.302890] amdgpu_job_run+0xac/0x1e0 [amdgpu] [ 606.303366] drm_sched_run_job_work+0x24f/0x430 [gpu_sched] [ 606.303388] process_one_work+0x21e/0x680 [ 606.303409] worker_thread+0x190/0x350 [ 606.303424] ? __pfx_worker_thread+0x10/0x10 [ 606.303437] kthread+0xe7/0x120 [ 606.303449] ? __pfx_kthread+0x10/0x10 [ 606.303463] ret_from_fork+0x3c/0x60 [ 606.303476] ? __pfx_kthread+0x10/0x10 [ 606.303489] ret_from_fork_asm+0x1a/0x30 [ 606.303512] v2: Refactor lock handling to resolve circular dependency (Alex) - Introduced a `sched_work` flag to defer the call to `amdgpu_gfx_kfd_sch_ctrl` until after releasing `enforce_isolation_mutex`. - This change ensures that `amdgpu_gfx_kfd_sch_ctrl` is called outside the critical section, preventing the circular dependency and deadlock. - The `sched_work` flag is set within the mutex-protected section if conditions are met, and the actual function call is made afterward. - This approach ensures consistent lock acquisition order. Fixes: afefd6f24502 ("drm/amdgpu: Implement Enforce Isolation Handler for KGD/KFD serialization") Cc: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Suggested-by: Alex Deucher Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 0b6b2dd38336d5fd49214f0e4e6495e658e3ab44) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 69a6b6dba0a5..1d155463d044 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1989,6 +1989,7 @@ void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring) { struct amdgpu_device *adev = ring->adev; u32 idx; + bool sched_work = false; if (!adev->gfx.enable_cleaner_shader) return; @@ -2007,15 +2008,19 @@ void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring) mutex_lock(&adev->enforce_isolation_mutex); if (adev->enforce_isolation[idx]) { if (adev->kfd.init_complete) - amdgpu_gfx_kfd_sch_ctrl(adev, idx, false); + sched_work = true; } mutex_unlock(&adev->enforce_isolation_mutex); + + if (sched_work) + amdgpu_gfx_kfd_sch_ctrl(adev, idx, false); } void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring) { struct amdgpu_device *adev = ring->adev; u32 idx; + bool sched_work = false; if (!adev->gfx.enable_cleaner_shader) return; @@ -2031,9 +2036,12 @@ void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring) mutex_lock(&adev->enforce_isolation_mutex); if (adev->enforce_isolation[idx]) { if (adev->kfd.init_complete) - amdgpu_gfx_kfd_sch_ctrl(adev, idx, true); + sched_work = true; } mutex_unlock(&adev->enforce_isolation_mutex); + + if (sched_work) + amdgpu_gfx_kfd_sch_ctrl(adev, idx, true); } /* -- cgit v1.2.3