From 1741281a157fcfacf46dbb6ce39c13bf2699d371 Mon Sep 17 00:00:00 2001
From: Alex Deucher
Date: Fri, 24 May 2024 12:37:50 -0400
Subject: drm/amdgpu/gfx10: add ring reset callbacks

Add ring reset callbacks for gfx and compute.

v2: fix gfx handling
v3: wait for KIQ to complete

Acked-by: Vitaly Prosyak
Signed-off-by: Alex Deucher
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 91 ++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

(limited to 'drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index ca983a014ba0..7eff39562732 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -9416,6 +9416,95 @@ static void gfx_v10_ring_insert_nop(struct amdgpu_ring *ring, uint32_t num_nop)
 		amdgpu_ring_write(ring, ring->funcs->nop);
 }
 
+static int gfx_v10_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
+{
+	struct amdgpu_device *adev = ring->adev;
+	struct amdgpu_kiq *kiq = &adev->gfx.kiq[0];
+	struct amdgpu_ring *kiq_ring = &kiq->ring;
+	unsigned long flags;
+	u32 tmp;
+	u64 addr;
+	int r;
+
+	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
+		return -EINVAL;
+
+	spin_lock_irqsave(&kiq->ring_lock, flags);
+
+	if (amdgpu_ring_alloc(kiq_ring, 5 + 7 + 7 + kiq->pmf->map_queues_size)) {
+		spin_unlock_irqrestore(&kiq->ring_lock, flags);
+		return -ENOMEM;
+	}
+
+	addr = amdgpu_bo_gpu_offset(ring->mqd_obj) +
+		offsetof(struct v10_gfx_mqd, cp_gfx_hqd_active);
+	tmp = REG_SET_FIELD(0, CP_VMID_RESET, RESET_REQUEST, 1 << vmid);
+	if (ring->pipe == 0)
+		tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, PIPE0_QUEUES, 1 << ring->queue);
+	else
+		tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, PIPE1_QUEUES, 1 << ring->queue);
+
+	gfx_v10_0_ring_emit_wreg(kiq_ring,
+				 SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), tmp);
+	gfx_v10_0_wait_reg_mem(kiq_ring, 0, 1, 0,
+			       lower_32_bits(addr), upper_32_bits(addr),
+			       0, 1, 0x20);
+	gfx_v10_0_ring_emit_reg_wait(kiq_ring,
+				     SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), 0, 0xffffffff);
+	kiq->pmf->kiq_map_queues(kiq_ring, ring);
+	amdgpu_ring_commit(kiq_ring);
+
+	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+
+	r = amdgpu_ring_test_ring(kiq_ring);
+	if (r)
+		return r;
+
+	/* reset the ring */
+	ring->wptr = 0;
+	*ring->wptr_cpu_addr = 0;
+	amdgpu_ring_clear_ring(ring);
+
+	return amdgpu_ring_test_ring(ring);
+}
+
+static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring,
+			       unsigned int vmid)
+{
+	struct amdgpu_device *adev = ring->adev;
+	struct amdgpu_kiq *kiq = &adev->gfx.kiq[0];
+	struct amdgpu_ring *kiq_ring = &kiq->ring;
+	unsigned long flags;
+	int r;
+
+	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
+		return -EINVAL;
+
+	spin_lock_irqsave(&kiq->ring_lock, flags);
+
+	if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size)) {
+		spin_unlock_irqrestore(&kiq->ring_lock, flags);
+		return -ENOMEM;
+	}
+
+	kiq->pmf->kiq_unmap_queues(kiq_ring, ring, RESET_QUEUES,
+				   0, 0);
+	amdgpu_ring_commit(kiq_ring);
+
+	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+
+	r = amdgpu_ring_test_ring(kiq_ring);
+	if (r)
+		return r;
+
+	/* reset the ring */
+	ring->wptr = 0;
+	*ring->wptr_cpu_addr = 0;
+	amdgpu_ring_clear_ring(ring);
+
+	return amdgpu_ring_test_ring(ring);
+}
+
 static void gfx_v10_ip_print(void *handle, struct drm_printer *p)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -9619,6 +9708,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
 	.emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
 	.soft_recovery = gfx_v10_0_ring_soft_recovery,
 	.emit_mem_sync = gfx_v10_0_emit_mem_sync,
+	.reset = gfx_v10_0_reset_kgq,
 };
 
 static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
@@ -9655,6 +9745,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
 	.emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
 	.soft_recovery = gfx_v10_0_ring_soft_recovery,
 	.emit_mem_sync = gfx_v10_0_emit_mem_sync,
+	.reset = gfx_v10_0_reset_kcq,
 };
 
 static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
-- 
cgit v1.2.3
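Note: the patch above only installs the hook; the dispatch happens through
the ring's function table from amdgpu's job-timeout handling. The sketch
below is illustrative only — example_try_queue_reset() is a hypothetical
helper, not driver code — but the ring->funcs->reset indirection is exactly
what the patch wires up.

	/*
	 * Hypothetical caller sketch (not driver code): a job-timeout
	 * path trying the per-queue reset hook installed above before
	 * escalating to a full adapter reset.
	 */
	static int example_try_queue_reset(struct amdgpu_ring *ring,
					   unsigned int vmid)
	{
		if (!ring->funcs->reset)
			return -EOPNOTSUPP;

		/* dispatches to gfx_v10_0_reset_kgq or gfx_v10_0_reset_kcq */
		return ring->funcs->reset(ring, vmid);
	}
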
From 2f3806f781421ce6dfa60471eb2116627c0eb893 Mon Sep 17 00:00:00 2001
From: Jiadong Zhu
Date: Fri, 14 Jun 2024 13:46:36 +0800
Subject: drm/amdgpu/gfx10: remap queue after reset successfully

The KIQ unmap_queues command only performs the dequeue; we have to map
the queue back with a clean MQD.

v2: fix up error handling (Alex)

Acked-by: Vitaly Prosyak
Signed-off-by: Jiadong Zhu
Reviewed-by: Alex Deucher
Signed-off-by: Alex Deucher
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 46 ++++++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 11 deletions(-)

(limited to 'drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 7eff39562732..8038b62feb84 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -7030,13 +7030,13 @@ static int gfx_v10_0_kiq_init_queue(struct amdgpu_ring *ring)
 	return 0;
 }
 
-static int gfx_v10_0_kcq_init_queue(struct amdgpu_ring *ring)
+static int gfx_v10_0_kcq_init_queue(struct amdgpu_ring *ring, bool restore)
 {
 	struct amdgpu_device *adev = ring->adev;
 	struct v10_compute_mqd *mqd = ring->mqd_ptr;
 	int mqd_idx = ring - &adev->gfx.compute_ring[0];
 
-	if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
+	if (!restore && !amdgpu_in_reset(adev) && !adev->in_suspend) {
 		memset((void *)mqd, 0, sizeof(*mqd));
 		mutex_lock(&adev->srbm_mutex);
 		nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
@@ -7098,7 +7098,7 @@ static int gfx_v10_0_kcq_resume(struct amdgpu_device *adev)
 			goto done;
 		r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr);
 		if (!r) {
-			r = gfx_v10_0_kcq_init_queue(ring);
+			r = gfx_v10_0_kcq_init_queue(ring, false);
 			amdgpu_bo_kunmap(ring->mqd_obj);
 			ring->mqd_ptr = NULL;
 		}
@@ -9483,25 +9483,49 @@ static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring,
 	spin_lock_irqsave(&kiq->ring_lock, flags);
 
 	if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size)) {
-		spin_unlock_irqrestore(&kiq->ring_lock, flags);
-		return -ENOMEM;
+		r = -ENOMEM;
+		goto out;
 	}
 
 	kiq->pmf->kiq_unmap_queues(kiq_ring, ring, RESET_QUEUES,
 				   0, 0);
 	amdgpu_ring_commit(kiq_ring);
 
-	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+	r = amdgpu_ring_test_ring(kiq_ring);
+	if (r)
+		goto out;
+
+	r = amdgpu_bo_reserve(ring->mqd_obj, false);
+	if (unlikely(r != 0)) {
+		dev_err(adev->dev, "fail to resv mqd_obj\n");
+		goto out;
+	}
+	r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr);
+	if (!r) {
+		r = gfx_v10_0_kcq_init_queue(ring, true);
+		amdgpu_bo_kunmap(ring->mqd_obj);
+		ring->mqd_ptr = NULL;
+	}
+	amdgpu_bo_unreserve(ring->mqd_obj);
+	if (r) {
+		dev_err(adev->dev, "fail to unresv mqd_obj\n");
+		goto out;
+	}
+
+	if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->map_queues_size)) {
+		r = -ENOMEM;
+		goto out;
+	}
+	kiq->pmf->kiq_map_queues(kiq_ring, ring);
+	amdgpu_ring_commit(kiq_ring);
 
 	r = amdgpu_ring_test_ring(kiq_ring);
+
+out:
+	spin_unlock_irqrestore(&kiq->ring_lock, flags);
 	if (r)
 		return r;
 
-	/* reset the ring */
-	ring->wptr = 0;
-	*ring->wptr_cpu_addr = 0;
-	amdgpu_ring_clear_ring(ring);
-
 	return amdgpu_ring_test_ring(ring);
 }
-- 
cgit v1.2.3
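Note: stripped of error handling, the compute-queue reset flow this patch
establishes has three steps. The condensed sketch below is illustrative
only (locking, ring allocation, and MQD buffer-object reservation/mapping
are elided; example_kcq_reset_flow() is not a real driver function):

	/*
	 * Condensed sketch of the flow above: unmap via KIQ, rebuild the
	 * MQD from scratch, then map the queue back via KIQ.
	 */
	static int example_kcq_reset_flow(struct amdgpu_ring *ring)
	{
		struct amdgpu_kiq *kiq = &ring->adev->gfx.kiq[0];
		int r;

		/* 1. dequeue the hung queue */
		kiq->pmf->kiq_unmap_queues(&kiq->ring, ring, RESET_QUEUES, 0, 0);
		r = amdgpu_ring_test_ring(&kiq->ring);	/* wait for KIQ */
		if (r)
			return r;

		/* 2. rebuild the MQD cleanly (restore == true) */
		r = gfx_v10_0_kcq_init_queue(ring, true);
		if (r)
			return r;

		/* 3. map the queue back with the clean MQD */
		kiq->pmf->kiq_map_queues(&kiq->ring, ring);
		return amdgpu_ring_test_ring(&kiq->ring);
	}
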
From 097af47d3cfb99ee02afbdd7e0d9596eb012c65a Mon Sep 17 00:00:00 2001
From: Jiadong Zhu
Date: Tue, 2 Jul 2024 09:17:14 +0800
Subject: drm/amdgpu/gfx10: wait for reset done before remap

There is a race condition in which CP firmware modifies the MQD during
the reset sequence after the driver has updated it for remapping. We
have to wait until CP_HQD_ACTIVE reads false before remapping the queue.

v2: fix KIQ locking (Alex)
v3: fix KIQ locking harder (Jessie)

Acked-by: Vitaly Prosyak
Signed-off-by: Jiadong Zhu
Reviewed-by: Alex Deucher
Signed-off-by: Alex Deucher
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 41 +++++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 11 deletions(-)

(limited to 'drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 8038b62feb84..ad113fb67a85 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -9475,7 +9475,7 @@ static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring,
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq[0];
 	struct amdgpu_ring *kiq_ring = &kiq->ring;
 	unsigned long flags;
-	int r;
+	int i, r;
 
 	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
 		return -EINVAL;
@@ -9483,22 +9483,42 @@ static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring,
 	spin_lock_irqsave(&kiq->ring_lock, flags);
 
 	if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size)) {
-		r = -ENOMEM;
-		goto out;
+		spin_unlock_irqrestore(&kiq->ring_lock, flags);
+		return -ENOMEM;
 	}
 
 	kiq->pmf->kiq_unmap_queues(kiq_ring, ring, RESET_QUEUES,
 				   0, 0);
 	amdgpu_ring_commit(kiq_ring);
+	spin_unlock_irqrestore(&kiq->ring_lock, flags);
 
 	r = amdgpu_ring_test_ring(kiq_ring);
 	if (r)
-		goto out;
+		return r;
+
+	/* make sure dequeue is complete */
+	gfx_v10_0_set_safe_mode(adev, 0);
+	mutex_lock(&adev->srbm_mutex);
+	nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
+	for (i = 0; i < adev->usec_timeout; i++) {
+		if (!(RREG32_SOC15(GC, 0, mmCP_HQD_ACTIVE) & 1))
+			break;
+		udelay(1);
+	}
+	if (i >= adev->usec_timeout)
+		r = -ETIMEDOUT;
+	nv_grbm_select(adev, 0, 0, 0, 0);
+	mutex_unlock(&adev->srbm_mutex);
+	gfx_v10_0_unset_safe_mode(adev, 0);
+	if (r) {
+		dev_err(adev->dev, "fail to wait on hqd deactivate\n");
+		return r;
+	}
 
 	r = amdgpu_bo_reserve(ring->mqd_obj, false);
 	if (unlikely(r != 0)) {
 		dev_err(adev->dev, "fail to resv mqd_obj\n");
-		goto out;
+		return r;
 	}
 	r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr);
 	if (!r) {
@@ -9509,20 +9529,19 @@ static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring,
 	amdgpu_bo_unreserve(ring->mqd_obj);
 	if (r) {
 		dev_err(adev->dev, "fail to unresv mqd_obj\n");
-		goto out;
+		return r;
 	}
 
+	spin_lock_irqsave(&kiq->ring_lock, flags);
 	if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->map_queues_size)) {
-		r = -ENOMEM;
-		goto out;
+		spin_unlock_irqrestore(&kiq->ring_lock, flags);
+		return -ENOMEM;
 	}
 	kiq->pmf->kiq_map_queues(kiq_ring, ring);
 	amdgpu_ring_commit(kiq_ring);
+	spin_unlock_irqrestore(&kiq->ring_lock, flags);
 
 	r = amdgpu_ring_test_ring(kiq_ring);
-
-out:
-	spin_unlock_irqrestore(&kiq->ring_lock, flags);
 	if (r)
 		return r;
-- 
cgit v1.2.3
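Note: the deactivation wait added above is a standard HQD polling pattern:
select the target queue's register aperture via SRBM, poll bit 0 of
CP_HQD_ACTIVE until firmware has finished dequeueing, then restore the
default selection. Extracted as a standalone helper purely for
illustration — no such helper exists in the driver:

	/*
	 * Illustration of the wait pattern above.  SRBM selection routes
	 * the per-queue HQD registers to the queue being reset; bit 0 of
	 * CP_HQD_ACTIVE reads 1 while the hardware queue is still active.
	 */
	static int example_wait_hqd_idle(struct amdgpu_device *adev,
					 struct amdgpu_ring *ring)
	{
		int i, r = 0;

		mutex_lock(&adev->srbm_mutex);
		nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
		for (i = 0; i < adev->usec_timeout; i++) {
			if (!(RREG32_SOC15(GC, 0, mmCP_HQD_ACTIVE) & 1))
				break;
			udelay(1);
		}
		if (i >= adev->usec_timeout)
			r = -ETIMEDOUT;
		nv_grbm_select(adev, 0, 0, 0, 0);
		mutex_unlock(&adev->srbm_mutex);
		return r;
	}
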
From d1f214432110748603dd310fbe6099df875e6f04 Mon Sep 17 00:00:00 2001
From: Alex Deucher
Date: Mon, 1 Jul 2024 18:14:14 -0400
Subject: drm/amdgpu/gfx10: rework reset sequence

To match other GFX IPs.

Acked-by: Vitaly Prosyak
Signed-off-by: Alex Deucher
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

(limited to 'drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index ad113fb67a85..05f8b1495714 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -6692,13 +6692,13 @@ static int gfx_v10_0_gfx_mqd_init(struct amdgpu_device *adev, void *m,
 	return 0;
 }
 
-static int gfx_v10_0_gfx_init_queue(struct amdgpu_ring *ring)
+static int gfx_v10_0_kgq_init_queue(struct amdgpu_ring *ring, bool reset)
 {
 	struct amdgpu_device *adev = ring->adev;
 	struct v10_gfx_mqd *mqd = ring->mqd_ptr;
 	int mqd_idx = ring - &adev->gfx.gfx_ring[0];
 
-	if (!amdgpu_in_reset(adev) && !adev->in_suspend) {
+	if (!reset && !amdgpu_in_reset(adev) && !adev->in_suspend) {
 		memset((void *)mqd, 0, sizeof(*mqd));
 		mutex_lock(&adev->srbm_mutex);
 		nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
@@ -6750,7 +6750,7 @@ static int gfx_v10_0_cp_async_gfx_ring_resume(struct amdgpu_device *adev)
 
 		r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr);
 		if (!r) {
-			r = gfx_v10_0_gfx_init_queue(ring);
+			r = gfx_v10_0_kgq_init_queue(ring, false);
 			amdgpu_bo_kunmap(ring->mqd_obj);
 			ring->mqd_ptr = NULL;
 		}
@@ -9460,10 +9460,22 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
 	if (r)
 		return r;
 
-	/* reset the ring */
-	ring->wptr = 0;
-	*ring->wptr_cpu_addr = 0;
-	amdgpu_ring_clear_ring(ring);
+	r = amdgpu_bo_reserve(ring->mqd_obj, false);
+	if (unlikely(r != 0)) {
+		DRM_ERROR("fail to resv mqd_obj\n");
+		return r;
+	}
+	r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr);
+	if (!r) {
+		r = gfx_v10_0_kgq_init_queue(ring, true);
+		amdgpu_bo_kunmap(ring->mqd_obj);
+		ring->mqd_ptr = NULL;
+	}
+	amdgpu_bo_unreserve(ring->mqd_obj);
+	if (r) {
+		DRM_ERROR("fail to unresv mqd_obj\n");
+		return r;
+	}
 
 	return amdgpu_ring_test_ring(ring);
 }
-- 
cgit v1.2.3

From 4d5ddfa4b1b2bf1e936b8d38735910480f9545ed Mon Sep 17 00:00:00 2001
From: Alex Deucher
Date: Thu, 18 Jul 2024 10:21:21 -0400
Subject: drm/amdgpu/gfx10: per queue reset only on bare metal

It's not supported under SR-IOV at the moment.

Acked-by: Vitaly Prosyak
Signed-off-by: Alex Deucher
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 05f8b1495714..fca5aee55f5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -9426,6 +9426,9 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
 	u64 addr;
 	int r;
 
+	if (amdgpu_sriov_vf(adev))
+		return -EINVAL;
+
 	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
 		return -EINVAL;
 
@@ -9489,6 +9492,9 @@ static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring,
 	unsigned long flags;
 	int i, r;
 
+	if (amdgpu_sriov_vf(adev))
+		return -EINVAL;
+
 	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
 		return -EINVAL;
-- 
cgit v1.2.3
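Note: after this patch, the entry checks at the top of both reset callbacks
look like the sketch below (shown in isolation; example_reset_entry_checks()
is not a real driver function). The commit only states that SR-IOV is
unsupported at the moment; presumably the host, rather than the guest VF,
must own reset at this level.

	/*
	 * Illustration of the combined early-out checks: bail when
	 * running as an SR-IOV VF, or when the KIQ packet manager
	 * functions needed for unmap/map are unavailable.
	 */
	static int example_reset_entry_checks(struct amdgpu_ring *ring)
	{
		struct amdgpu_device *adev = ring->adev;
		struct amdgpu_kiq *kiq = &adev->gfx.kiq[0];

		if (amdgpu_sriov_vf(adev))	/* not supported under SR-IOV yet */
			return -EINVAL;
		if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
			return -EINVAL;
		return 0;
	}
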
From bcee4c3f89879e15ef57f3217ded97f00b1334d3 Mon Sep 17 00:00:00 2001
From: Alex Deucher
Date: Wed, 24 Jul 2024 18:07:28 -0400
Subject: drm/amdgpu/gfx10: use proper rlc safe mode helpers

Rather than open coding it for the queue reset.

Acked-by: Vitaly Prosyak
Signed-off-by: Alex Deucher
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index fca5aee55f5c..fba1e5cba941 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -9515,7 +9515,7 @@ static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring,
 		return r;
 
 	/* make sure dequeue is complete */
-	gfx_v10_0_set_safe_mode(adev, 0);
+	amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
 	mutex_lock(&adev->srbm_mutex);
 	nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
 	for (i = 0; i < adev->usec_timeout; i++) {
@@ -9527,7 +9527,7 @@ static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring,
 		r = -ETIMEDOUT;
 	nv_grbm_select(adev, 0, 0, 0, 0);
 	mutex_unlock(&adev->srbm_mutex);
-	gfx_v10_0_unset_safe_mode(adev, 0);
+	amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
 	if (r) {
 		dev_err(adev->dev, "fail to wait on hqd deactivate\n");
 		return r;
-- 
cgit v1.2.3

From ead60e9c4e29c8574cae1be4fe3af1d9a978fb0f Mon Sep 17 00:00:00 2001
From: Alex Deucher
Date: Wed, 24 Jul 2024 18:20:34 -0400
Subject: drm/amdgpu/gfx10: use rlc safe mode for soft recovery

Protect the MMIO access with safe mode.

Acked-by: Vitaly Prosyak
Signed-off-by: Alex Deucher
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index fba1e5cba941..45ed97038df0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -8949,7 +8949,9 @@ static void gfx_v10_0_ring_soft_recovery(struct amdgpu_ring *ring,
 	value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
 	value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
 	value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
+	amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
 	WREG32_SOC15(GC, 0, mmSQ_CMD, value);
+	amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
 }
 
 static void
-- 
cgit v1.2.3
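Note: the last two patches converge on the same pattern: bracket direct GC
register access with RLC safe mode so RLC power-management activity cannot
race with the MMIO. As a generic sketch (example_safe_mmio_write() is
illustrative, not driver code; the amdgpu_gfx_rlc_* helpers are the ones
used in the patches above):

	/*
	 * Bracket a raw register write with RLC safe mode so the RLC
	 * cannot power-gate or clock-gate the block mid-access.
	 */
	static void example_safe_mmio_write(struct amdgpu_device *adev,
					    u32 reg, u32 value)
	{
		amdgpu_gfx_rlc_enter_safe_mode(adev, 0);	/* xcc_id 0 */
		WREG32(reg, value);
		amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
	}
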