From ff69bba05f085cd6d4277c27ac7600160167b384 Mon Sep 17 00:00:00 2001
From: Boyuan Zhang <boyuan.zhang@amd.com>
Date: Wed, 2 Oct 2024 23:52:01 -0400
Subject: drm/amd/pm: add inst to dpm_set_powergating_by_smu
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an instance parameter to amdgpu_dpm_set_powergating_by_smu() function,
and use the instance to call set_powergating_by_smu().

v2: remove duplicated functions.

remove for-loop in amdgpu_dpm_set_powergating_by_smu(), and temporarily
move it to amdgpu_dpm_enable_vcn(), in order to keep the exact same logic
as before, until further separation in next patch.

v3: drop SI logic in amdgpu_dpm_enable_vcn().

Signed-off-by: Boyuan Zhang <boyuan.zhang@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 69a6b6dba0a5..e54f42e3797e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -806,7 +806,7 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable)
 			/* If going to s2idle, no need to wait */
 			if (adev->in_s0ix) {
 				if (!amdgpu_dpm_set_powergating_by_smu(adev,
-						AMD_IP_BLOCK_TYPE_GFX, true))
+						AMD_IP_BLOCK_TYPE_GFX, true, 0))
 					adev->gfx.gfx_off_state = true;
 			} else {
 				schedule_delayed_work(&adev->gfx.gfx_off_delay_work,
@@ -818,7 +818,7 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable)
 			cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work);
 
 			if (adev->gfx.gfx_off_state &&
-			    !amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) {
+			    !amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false, 0)) {
 				adev->gfx.gfx_off_state = false;
 
 				if (adev->gfx.funcs->init_spm_golden) {
-- 
cgit v1.2.3


From a69f4cc278fe5285f1f42562904f4472955c20ea Mon Sep 17 00:00:00 2001
From: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
Date: Fri, 29 Nov 2024 19:20:03 +0530
Subject: drm/amd/amdgpu: Add Descriptions to Process Isolation and Cleaner
 Shader Sysfs Functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This update adds explanations to key functions related to process
isolation and cleaner shader execution sysfs interfaces.

- `amdgpu_gfx_set_run_cleaner_shader`: Describes how to manually run a
  cleaner shader, which clears the Local Data Store (LDS) and General
  Purpose Registers (GPRs) to ensure data isolation between GPU workloads.

- `amdgpu_gfx_get_enforce_isolation`: Describes how to query the current
  settings of the 'enforce_isolation' feature for each GPU partition.

- `amdgpu_gfx_set_enforce_isolation`: Describes how to enable or disable
  process isolation for GPU partitions through the sysfs interface.

Cc: Christian König <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
Suggested-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 45 +++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index e54f42e3797e..7e2ad7818903 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1484,6 +1484,24 @@ static int amdgpu_gfx_run_cleaner_shader(struct amdgpu_device *adev, int xcp_id)
 	return 0;
 }
 
+/**
+ * amdgpu_gfx_set_run_cleaner_shader - Execute the AMDGPU GFX Cleaner Shader
+ * @dev: The device structure
+ * @attr: The device attribute structure
+ * @buf: The buffer containing the input data
+ * @count: The size of the input data
+ *
+ * Provides the sysfs interface to manually run a cleaner shader, which is
+ * used to clear the GPU state between different tasks. Writing a value to the
+ * 'run_cleaner_shader' sysfs file triggers the cleaner shader execution.
+ * The value written corresponds to the partition index on multi-partition
+ * devices. On single-partition devices, the value should be '0'.
+ *
+ * The cleaner shader clears the Local Data Store (LDS) and General Purpose
+ * Registers (GPRs) to ensure data isolation between GPU workloads.
+ *
+ * Return: The number of bytes written to the sysfs file.
+ */
 static ssize_t amdgpu_gfx_set_run_cleaner_shader(struct device *dev,
 						 struct device_attribute *attr,
 						 const char *buf,
@@ -1532,6 +1550,19 @@ static ssize_t amdgpu_gfx_set_run_cleaner_shader(struct device *dev,
 	return count;
 }
 
+/**
+ * amdgpu_gfx_get_enforce_isolation - Query AMDGPU GFX Enforce Isolation Settings
+ * @dev: The device structure
+ * @attr: The device attribute structure
+ * @buf: The buffer to store the output data
+ *
+ * Provides the sysfs read interface to get the current settings of the 'enforce_isolation'
+ * feature for each GPU partition. Reading from the 'enforce_isolation'
+ * sysfs file returns the isolation settings for all partitions, where '0'
+ * indicates disabled and '1' indicates enabled.
+ *
+ * Return: The number of bytes read from the sysfs file.
+ */
 static ssize_t amdgpu_gfx_get_enforce_isolation(struct device *dev,
 						struct device_attribute *attr,
 						char *buf)
@@ -1555,6 +1586,20 @@ static ssize_t amdgpu_gfx_get_enforce_isolation(struct device *dev,
 	return size;
 }
 
+/**
+ * amdgpu_gfx_set_enforce_isolation - Control AMDGPU GFX Enforce Isolation
+ * @dev: The device structure
+ * @attr: The device attribute structure
+ * @buf: The buffer containing the input data
+ * @count: The size of the input data
+ *
+ * This function allows control over the 'enforce_isolation' feature, which
+ * serializes access to the graphics engine. Writing '1' or '0' to the
+ * 'enforce_isolation' sysfs file enables or disables process isolation for
+ * each partition. The input should specify the setting for all partitions.
+ *
+ * Return: The number of bytes written to the sysfs file.
+ */
 static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev,
 						struct device_attribute *attr,
 						const char *buf, size_t count)
-- 
cgit v1.2.3


From 55f4139b6598bc1f8e2fca2181c2749bca84ffee Mon Sep 17 00:00:00 2001
From: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
Date: Thu, 28 Nov 2024 01:11:36 +0530
Subject: drm/amd/amdgpu: Add Annotations to Process Isolation functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This update adds explanations to key functions that manage how the
Kernel Fusion Driver (KFD) and Kernel Graphics Driver (KGD) share the
GPU.

amdgpu_gfx_enforce_isolation_wait_for_kfd: Controls the waiting period
for KFD to ensure it takes turns with KGD in using the GPU. It uses a
mutex to safely manage shared data, like timing and state, and tracks
when KFD starts and stops waiting.

amdgpu_gfx_enforce_isolation_ring_begin_use: Ensures KFD has enough time
to run before new tasks are submitted to the GPU ring. It uses a mutex
to synchronize access and may adjust the KFD scheduler.

amdgpu_gfx_enforce_isolation_ring_end_use: Handles cleanup and state
updates when finishing the use of a GPU ring. It may also adjust the KFD
scheduler, using a mutex to manage shared data access.

Cc: Christian König <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
Suggested-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 7e2ad7818903..a4dde54512b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1985,6 +1985,17 @@ void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work)
 	mutex_unlock(&adev->enforce_isolation_mutex);
 }
 
+/**
+ * amdgpu_gfx_enforce_isolation_wait_for_kfd - Manage KFD wait period for process isolation
+ * @adev: amdgpu_device pointer
+ * @idx: Index of the GPU partition
+ *
+ * When kernel submissions come in, the jobs are given a time slice and once
+ * that time slice is up, if there are KFD user queues active, kernel
+ * submissions are blocked until KFD has had its time slice. Once the KFD time
+ * slice is up, KFD user queues are preempted and kernel submissions are
+ * unblocked and allowed to run again.
+ */
 static void
 amdgpu_gfx_enforce_isolation_wait_for_kfd(struct amdgpu_device *adev,
 					  u32 idx)
@@ -2030,6 +2041,15 @@ amdgpu_gfx_enforce_isolation_wait_for_kfd(struct amdgpu_device *adev,
 		msleep(GFX_SLICE_PERIOD_MS);
 }
 
+/**
+ * amdgpu_gfx_enforce_isolation_ring_begin_use - Begin use of a ring with enforced isolation
+ * @ring: Pointer to the amdgpu_ring structure
+ *
+ * Ring begin_use helper implementation for gfx which serializes access to the
+ * gfx IP between kernel submission IOCTLs and KFD user queues when isolation
+ * enforcement is enabled. The kernel submission IOCTLs and KFD user queues
+ * each get a time slice when both are active.
+ */
 void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring)
 {
 	struct amdgpu_device *adev = ring->adev;
@@ -2057,6 +2077,15 @@ void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring)
 	mutex_unlock(&adev->enforce_isolation_mutex);
 }
 
+/**
+ * amdgpu_gfx_enforce_isolation_ring_end_use - End use of a ring with enforced isolation
+ * @ring: Pointer to the amdgpu_ring structure
+ *
+ * Ring end_use helper implementation for gfx which serializes access to the
+ * gfx IP between kernel submission IOCTLs and KFD user queues when isolation
+ * enforcement is enabled. The kernel submission IOCTLs and KFD user queues
+ * each get a time slice when both are active.
+ */
 void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring)
 {
 	struct amdgpu_device *adev = ring->adev;
-- 
cgit v1.2.3


From 34c4eb7d4e0cd443399a0f114d467d2b3ff05419 Mon Sep 17 00:00:00 2001
From: Karol Przybylski <karprzy7@gmail.com>
Date: Sun, 15 Dec 2024 13:28:57 +0100
Subject: drm/amdgpu: Fix potential integer overflow in scheduler mask
 calculations

The use of 1 << i in scheduler mask calculations can result in an
unintentional integer overflow due to the expression being
evaluated as a 32-bit signed integer.

This patch replaces 1 << i with 1ULL << i to ensure the operation
is performed as a 64-bit unsigned integer, preventing overflow

Discovered in coverity scan, CID 1636393, 1636175, 1636007, 1635853

Fixes: c5c63d9cb5d3 ("drm/amdgpu: add amdgpu_gfx_sched_mask and amdgpu_compute_sched_mask debugfs")
Signed-off-by: Karol Przybylski <karprzy7@gmail.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index a4dde54512b1..4a4e40dd25d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -2124,7 +2124,7 @@ static int amdgpu_debugfs_gfx_sched_mask_set(void *data, u64 val)
 	if (!adev)
 		return -ENODEV;
 
-	mask = (1 << adev->gfx.num_gfx_rings) - 1;
+	mask = (1ULL << adev->gfx.num_gfx_rings) - 1;
 	if ((val & mask) == 0)
 		return -EINVAL;
 
@@ -2152,7 +2152,7 @@ static int amdgpu_debugfs_gfx_sched_mask_get(void *data, u64 *val)
 	for (i = 0; i < adev->gfx.num_gfx_rings; ++i) {
 		ring = &adev->gfx.gfx_ring[i];
 		if (ring->sched.ready)
-			mask |= 1 << i;
+			mask |= 1ULL << i;
 	}
 
 	*val = mask;
@@ -2194,7 +2194,7 @@ static int amdgpu_debugfs_compute_sched_mask_set(void *data, u64 val)
 	if (!adev)
 		return -ENODEV;
 
-	mask = (1 << adev->gfx.num_compute_rings) - 1;
+	mask = (1ULL << adev->gfx.num_compute_rings) - 1;
 	if ((val & mask) == 0)
 		return -EINVAL;
 
@@ -2223,7 +2223,7 @@ static int amdgpu_debugfs_compute_sched_mask_get(void *data, u64 *val)
 	for (i = 0; i < adev->gfx.num_compute_rings; ++i) {
 		ring = &adev->gfx.compute_ring[i];
 		if (ring->sched.ready)
-			mask |= 1 << i;
+			mask |= 1ULL << i;
 	}
 
 	*val = mask;
-- 
cgit v1.2.3


From 11815bb0e30966321ff4351b55ad7b6f2e0a63bf Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Thu, 12 Dec 2024 16:51:04 +0100
Subject: drm/amdgpu: partially revert "reduce reset time"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This partially reverts commit 194eb174cbe4fe2b3376ac30acca2dc8c8beca00.

This commit introduced a new state variable into adev without even
remotely worrying about CPU barriers.

Since we already have the amdgpu_in_reset() function exactly for this
use case partially revert that.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 3 ---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c     | 2 +-
 5 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9f5351c013ff..69895fccb474 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1175,7 +1175,6 @@ struct amdgpu_device {
 
 	struct work_struct		reset_work;
 
-	bool                            job_hang;
 	bool                            dc_enabled;
 	/* Mask of active clusters */
 	uint32_t			aid_mask;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index de30143ea51b..2e5732dfd425 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -836,7 +836,7 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
 	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
 		return -EINVAL;
 
-	if (!kiq_ring->sched.ready || adev->job_hang)
+	if (!kiq_ring->sched.ready || amdgpu_in_reset(adev))
 		return 0;
 
 	ring_funcs = kzalloc(sizeof(*ring_funcs), GFP_KERNEL);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 4a4e40dd25d6..6d5d81f0dc4e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -515,7 +515,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id)
 	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
 		return -EINVAL;
 
-	if (!kiq_ring->sched.ready || adev->job_hang || amdgpu_in_reset(adev))
+	if (!kiq_ring->sched.ready || amdgpu_in_reset(adev))
 		return 0;
 
 	spin_lock(&kiq->ring_lock);
@@ -567,7 +567,7 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev, int xcc_id)
 	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
 		return -EINVAL;
 
-	if (!adev->gfx.kiq[0].ring.sched.ready || adev->job_hang)
+	if (!adev->gfx.kiq[0].ring.sched.ready || amdgpu_in_reset(adev))
 		return 0;
 
 	if (amdgpu_gfx_is_master_xcc(adev, xcc_id)) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 04691753cabf..100f04475943 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -102,8 +102,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 		return DRM_GPU_SCHED_STAT_ENODEV;
 	}
 
-	adev->job_hang = true;
-
 	/*
 	 * Do the coredump immediately after a job timeout to get a very
 	 * close dump/snapshot/representation of GPU's current error status
@@ -181,7 +179,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 	}
 
 exit:
-	adev->job_hang = false;
 	drm_dev_exit(idx);
 	return DRM_GPU_SCHED_STAT_NOMINAL;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 9484f3b5a9b7..003522c2d902 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5957,7 +5957,7 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)
 	else
 		WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
 
-	if (adev->job_hang && !enable)
+	if (amdgpu_in_reset(adev) && !enable)
 		return 0;
 
 	for (i = 0; i < adev->usec_timeout; i++) {
-- 
cgit v1.2.3


From 1e8c193f8ca7ab7dff4f4747b45a55dca23c00f4 Mon Sep 17 00:00:00 2001
From: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
Date: Thu, 9 Jan 2025 21:33:51 +0530
Subject: drm/amdgpu: Fix Circular Locking Dependency in AMDGPU GFX Isolation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit addresses a circular locking dependency issue within the GFX
isolation mechanism. The problem was identified by a warning indicating
a potential deadlock due to inconsistent lock acquisition order.

- The `amdgpu_gfx_enforce_isolation_ring_begin_use` and
  `amdgpu_gfx_enforce_isolation_ring_end_use` functions previously
  acquired `enforce_isolation_mutex` and called `amdgpu_gfx_kfd_sch_ctrl`,
  leading to potential deadlocks. ie., If `amdgpu_gfx_kfd_sch_ctrl` is
  called while `enforce_isolation_mutex` is held, and
  `amdgpu_gfx_enforce_isolation_handler` is called while `kfd_sch_mutex` is
  held, it can create a circular dependency.

By ensuring consistent lock usage, this fix resolves the issue:

[  606.297333] ======================================================
[  606.297343] WARNING: possible circular locking dependency detected
[  606.297353] 6.10.0-amd-mlkd-610-311224-lof #19 Tainted: G           OE
[  606.297365] ------------------------------------------------------
[  606.297375] kworker/u96:3/3825 is trying to acquire lock:
[  606.297385] ffff9aa64e431cb8 ((work_completion)(&(&adev->gfx.enforce_isolation[i].work)->work)){+.+.}-{0:0}, at: __flush_work+0x232/0x610
[  606.297413]
               but task is already holding lock:
[  606.297423] ffff9aa64e432338 (&adev->gfx.kfd_sch_mutex){+.+.}-{3:3}, at: amdgpu_gfx_kfd_sch_ctrl+0x51/0x4d0 [amdgpu]
[  606.297725]
               which lock already depends on the new lock.

[  606.297738]
               the existing dependency chain (in reverse order) is:
[  606.297749]
               -> #2 (&adev->gfx.kfd_sch_mutex){+.+.}-{3:3}:
[  606.297765]        __mutex_lock+0x85/0x930
[  606.297776]        mutex_lock_nested+0x1b/0x30
[  606.297786]        amdgpu_gfx_kfd_sch_ctrl+0x51/0x4d0 [amdgpu]
[  606.298007]        amdgpu_gfx_enforce_isolation_ring_begin_use+0x2a4/0x5d0 [amdgpu]
[  606.298225]        amdgpu_ring_alloc+0x48/0x70 [amdgpu]
[  606.298412]        amdgpu_ib_schedule+0x176/0x8a0 [amdgpu]
[  606.298603]        amdgpu_job_run+0xac/0x1e0 [amdgpu]
[  606.298866]        drm_sched_run_job_work+0x24f/0x430 [gpu_sched]
[  606.298880]        process_one_work+0x21e/0x680
[  606.298890]        worker_thread+0x190/0x350
[  606.298899]        kthread+0xe7/0x120
[  606.298908]        ret_from_fork+0x3c/0x60
[  606.298919]        ret_from_fork_asm+0x1a/0x30
[  606.298929]
               -> #1 (&adev->enforce_isolation_mutex){+.+.}-{3:3}:
[  606.298947]        __mutex_lock+0x85/0x930
[  606.298956]        mutex_lock_nested+0x1b/0x30
[  606.298966]        amdgpu_gfx_enforce_isolation_handler+0x87/0x370 [amdgpu]
[  606.299190]        process_one_work+0x21e/0x680
[  606.299199]        worker_thread+0x190/0x350
[  606.299208]        kthread+0xe7/0x120
[  606.299217]        ret_from_fork+0x3c/0x60
[  606.299227]        ret_from_fork_asm+0x1a/0x30
[  606.299236]
               -> #0 ((work_completion)(&(&adev->gfx.enforce_isolation[i].work)->work)){+.+.}-{0:0}:
[  606.299257]        __lock_acquire+0x16f9/0x2810
[  606.299267]        lock_acquire+0xd1/0x300
[  606.299276]        __flush_work+0x250/0x610
[  606.299286]        cancel_delayed_work_sync+0x71/0x80
[  606.299296]        amdgpu_gfx_kfd_sch_ctrl+0x287/0x4d0 [amdgpu]
[  606.299509]        amdgpu_gfx_enforce_isolation_ring_begin_use+0x2a4/0x5d0 [amdgpu]
[  606.299723]        amdgpu_ring_alloc+0x48/0x70 [amdgpu]
[  606.299909]        amdgpu_ib_schedule+0x176/0x8a0 [amdgpu]
[  606.300101]        amdgpu_job_run+0xac/0x1e0 [amdgpu]
[  606.300355]        drm_sched_run_job_work+0x24f/0x430 [gpu_sched]
[  606.300369]        process_one_work+0x21e/0x680
[  606.300378]        worker_thread+0x190/0x350
[  606.300387]        kthread+0xe7/0x120
[  606.300396]        ret_from_fork+0x3c/0x60
[  606.300406]        ret_from_fork_asm+0x1a/0x30
[  606.300416]
               other info that might help us debug this:

[  606.300428] Chain exists of:
                 (work_completion)(&(&adev->gfx.enforce_isolation[i].work)->work) --> &adev->enforce_isolation_mutex --> &adev->gfx.kfd_sch_mutex

[  606.300458]  Possible unsafe locking scenario:

[  606.300468]        CPU0                    CPU1
[  606.300476]        ----                    ----
[  606.300484]   lock(&adev->gfx.kfd_sch_mutex);
[  606.300494]                                lock(&adev->enforce_isolation_mutex);
[  606.300508]                                lock(&adev->gfx.kfd_sch_mutex);
[  606.300521]   lock((work_completion)(&(&adev->gfx.enforce_isolation[i].work)->work));
[  606.300536]
                *** DEADLOCK ***

[  606.300546] 5 locks held by kworker/u96:3/3825:
[  606.300555]  #0: ffff9aa5aa1f5d58 ((wq_completion)comp_1.1.0){+.+.}-{0:0}, at: process_one_work+0x3f5/0x680
[  606.300577]  #1: ffffaa53c3c97e40 ((work_completion)(&sched->work_run_job)){+.+.}-{0:0}, at: process_one_work+0x1d6/0x680
[  606.300600]  #2: ffff9aa64e463c98 (&adev->enforce_isolation_mutex){+.+.}-{3:3}, at: amdgpu_gfx_enforce_isolation_ring_begin_use+0x1c3/0x5d0 [amdgpu]
[  606.300837]  #3: ffff9aa64e432338 (&adev->gfx.kfd_sch_mutex){+.+.}-{3:3}, at: amdgpu_gfx_kfd_sch_ctrl+0x51/0x4d0 [amdgpu]
[  606.301062]  #4: ffffffff8c1a5660 (rcu_read_lock){....}-{1:2}, at: __flush_work+0x70/0x610
[  606.301083]
               stack backtrace:
[  606.301092] CPU: 14 PID: 3825 Comm: kworker/u96:3 Tainted: G           OE      6.10.0-amd-mlkd-610-311224-lof #19
[  606.301109] Hardware name: Gigabyte Technology Co., Ltd. X570S GAMING X/X570S GAMING X, BIOS F7 03/22/2024
[  606.301124] Workqueue: comp_1.1.0 drm_sched_run_job_work [gpu_sched]
[  606.301140] Call Trace:
[  606.301146]  <TASK>
[  606.301154]  dump_stack_lvl+0x9b/0xf0
[  606.301166]  dump_stack+0x10/0x20
[  606.301175]  print_circular_bug+0x26c/0x340
[  606.301187]  check_noncircular+0x157/0x170
[  606.301197]  ? register_lock_class+0x48/0x490
[  606.301213]  __lock_acquire+0x16f9/0x2810
[  606.301230]  lock_acquire+0xd1/0x300
[  606.301239]  ? __flush_work+0x232/0x610
[  606.301250]  ? srso_alias_return_thunk+0x5/0xfbef5
[  606.301261]  ? mark_held_locks+0x54/0x90
[  606.301274]  ? __flush_work+0x232/0x610
[  606.301284]  __flush_work+0x250/0x610
[  606.301293]  ? __flush_work+0x232/0x610
[  606.301305]  ? __pfx_wq_barrier_func+0x10/0x10
[  606.301318]  ? mark_held_locks+0x54/0x90
[  606.301331]  ? srso_alias_return_thunk+0x5/0xfbef5
[  606.301345]  cancel_delayed_work_sync+0x71/0x80
[  606.301356]  amdgpu_gfx_kfd_sch_ctrl+0x287/0x4d0 [amdgpu]
[  606.301661]  amdgpu_gfx_enforce_isolation_ring_begin_use+0x2a4/0x5d0 [amdgpu]
[  606.302050]  ? srso_alias_return_thunk+0x5/0xfbef5
[  606.302069]  amdgpu_ring_alloc+0x48/0x70 [amdgpu]
[  606.302452]  amdgpu_ib_schedule+0x176/0x8a0 [amdgpu]
[  606.302862]  ? drm_sched_entity_error+0x82/0x190 [gpu_sched]
[  606.302890]  amdgpu_job_run+0xac/0x1e0 [amdgpu]
[  606.303366]  drm_sched_run_job_work+0x24f/0x430 [gpu_sched]
[  606.303388]  process_one_work+0x21e/0x680
[  606.303409]  worker_thread+0x190/0x350
[  606.303424]  ? __pfx_worker_thread+0x10/0x10
[  606.303437]  kthread+0xe7/0x120
[  606.303449]  ? __pfx_kthread+0x10/0x10
[  606.303463]  ret_from_fork+0x3c/0x60
[  606.303476]  ? __pfx_kthread+0x10/0x10
[  606.303489]  ret_from_fork_asm+0x1a/0x30
[  606.303512]  </TASK>

v2: Refactor lock handling to resolve circular dependency (Alex)

- Introduced a `sched_work` flag to defer the call to
  `amdgpu_gfx_kfd_sch_ctrl` until after releasing
  `enforce_isolation_mutex`.
- This change ensures that `amdgpu_gfx_kfd_sch_ctrl` is called outside
  the critical section, preventing the circular dependency and deadlock.
- The `sched_work` flag is set within the mutex-protected section if
  conditions are met, and the actual function call is made afterward.
- This approach ensures consistent lock acquisition order.

Fixes: afefd6f24502 ("drm/amdgpu: Implement Enforce Isolation Handler for KGD/KFD serialization")
Cc: Christian König <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
Suggested-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 0b6b2dd38336d5fd49214f0e4e6495e658e3ab44)
Cc: stable@vger.kernel.org
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 69a6b6dba0a5..1d155463d044 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1989,6 +1989,7 @@ void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring)
 {
 	struct amdgpu_device *adev = ring->adev;
 	u32 idx;
+	bool sched_work = false;
 
 	if (!adev->gfx.enable_cleaner_shader)
 		return;
@@ -2007,15 +2008,19 @@ void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring)
 	mutex_lock(&adev->enforce_isolation_mutex);
 	if (adev->enforce_isolation[idx]) {
 		if (adev->kfd.init_complete)
-			amdgpu_gfx_kfd_sch_ctrl(adev, idx, false);
+			sched_work = true;
 	}
 	mutex_unlock(&adev->enforce_isolation_mutex);
+
+	if (sched_work)
+		amdgpu_gfx_kfd_sch_ctrl(adev, idx, false);
 }
 
 void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring)
 {
 	struct amdgpu_device *adev = ring->adev;
 	u32 idx;
+	bool sched_work = false;
 
 	if (!adev->gfx.enable_cleaner_shader)
 		return;
@@ -2031,9 +2036,12 @@ void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring)
 	mutex_lock(&adev->enforce_isolation_mutex);
 	if (adev->enforce_isolation[idx]) {
 		if (adev->kfd.init_complete)
-			amdgpu_gfx_kfd_sch_ctrl(adev, idx, true);
+			sched_work = true;
 	}
 	mutex_unlock(&adev->enforce_isolation_mutex);
+
+	if (sched_work)
+		amdgpu_gfx_kfd_sch_ctrl(adev, idx, true);
 }
 
 /*
-- 
cgit v1.2.3