summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRuijing Dong <ruijing.dong@amd.com>2026-03-17 20:54:11 +0300
committerAlex Deucher <alexander.deucher@amd.com>2026-03-23 21:09:28 +0300
commit94d79f51efecb74be1d88dde66bdc8bfcca17935 (patch)
tree306f1c397d56707f22e28d9b915011f5bd3f405d
parentf9a4e81bcbd04e6f967d851f9fe69d8bb3cc08b3 (diff)
downloadlinux-94d79f51efecb74be1d88dde66bdc8bfcca17935.tar.xz
drm/amdgpu: fix strsep() corrupting lockup_timeout on multi-GPU (v3)
amdgpu_device_get_job_timeout_settings() passes a pointer directly to the global amdgpu_lockup_timeout[] buffer into strsep(). strsep() destructively replaces delimiter characters with '\0' in-place. On multi-GPU systems, this function is called once per device. When a multi-value setting like "0,0,0,-1" is used, the first GPU's call transforms the global buffer into "0\00\00\0-1". The second GPU then sees only "0" (terminated at the first '\0'), parses a single value, hits the single-value fallthrough (index == 1), and applies timeout=0 to all rings — causing immediate false job timeouts. Fix this by copying into a stack-local array before calling strsep(), so the global module parameter buffer remains intact across calls. The buffer is AMDGPU_MAX_TIMEOUT_PARAM_LENGTH (256) bytes, which is safe for the stack. v2: wrap commit message to 72 columns, add Assisted-by tag. v3: use stack array with strscpy() instead of kstrdup()/kfree() to avoid unnecessary heap allocation (Christian). This patch was developed with assistance from Claude (claude-opus-4-6). Assisted-by: Claude:claude-opus-4-6 Reviewed-by: Christian König <christian.koenig@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Ruijing Dong <ruijing.dong@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c13
1 files changed, 11 insertions, 2 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ac5769d9e75c..a7038f039b10 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3498,7 +3498,8 @@ fail:
static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
{
- char *input = amdgpu_lockup_timeout;
+ char buf[AMDGPU_MAX_TIMEOUT_PARAM_LENGTH];
+ char *input = buf;
char *timeout_setting = NULL;
int index = 0;
long timeout;
@@ -3508,9 +3509,17 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
adev->video_timeout = msecs_to_jiffies(2000);
- if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
+ if (!strnlen(amdgpu_lockup_timeout, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
return 0;
+ /*
+ * strsep() destructively modifies its input by replacing delimiters
+ * with '\0'. Use a stack copy so the global module parameter buffer
+ * remains intact for multi-GPU systems where this function is called
+ * once per device.
+ */
+ strscpy(buf, amdgpu_lockup_timeout, sizeof(buf));
+
while ((timeout_setting = strsep(&input, ",")) &&
strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
ret = kstrtol(timeout_setting, 0, &timeout);