summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
authorDave Airlie <airlied@redhat.com>2021-06-03 23:13:56 +0300
committerDave Airlie <airlied@redhat.com>2021-06-03 23:13:57 +0300
commit5745d647d5563d3e9d32013ad4e5c629acff04d7 (patch)
tree8ce79933def43fcb34b675242304015457f25ef5 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parentccd1950c2f7e38ae45aeefb99a08b39407cd6c63 (diff)
parent7d9c70d23550eb86a1bec1954ccaa8d6ec3a3328 (diff)
downloadlinux-5745d647d5563d3e9d32013ad4e5c629acff04d7.tar.xz
Merge tag 'amd-drm-next-5.14-2021-06-02' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-5.14-2021-06-02: amdgpu: - GC/MM register access macro clean up for SR-IOV - Beige Goby updates - W=1 Fixes - Aldebaran fixes - Misc display fixes - ACPI ATCS/ATIF handling rework - SR-IOV fixes - RAS fixes - 16bpc fixed point format support - Initial smartshift support - RV/PCO power tuning fixes for suspend/resume - More buffer object subclassing work - Add new INFO query for additional vbios information - Add new placement for preemptable SG buffers amdkfd: - Misc fixes radeon: - W=1 Fixes - Misc cleanups UAPI: - Add new INFO query for additional vbios information Useful for debugging vbios related issues. Proposed umr patch: https://patchwork.freedesktop.org/patch/433297/ - 16bpc fixed point format support IGT test: https://lists.freedesktop.org/archives/igt-dev/2021-May/031507.html Proposed Vulkan patch: https://github.com/kleinerm/pal/commit/a25d4802074b13a8d5f7edc96ae45469ecbac3c4 - Add a new GEM flag which is only used internally in the kernel driver. Userspace is not allowed to set it. drm: - 16bpc fixed point format fourcc Signed-off-by: Dave Airlie <airlied@redhat.com> From: Alex Deucher <alexander.deucher@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20210602214009.4553-1-alexander.deucher@amd.com
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c63
1 files changed, 55 insertions, 8 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c2c791ca00f4..9dfc1ebb41a9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -27,6 +27,7 @@
#include <linux/uaccess.h>
#include <linux/reboot.h>
#include <linux/syscalls.h>
+#include <linux/pm_runtime.h>
#include "amdgpu.h"
#include "amdgpu_ras.h"
@@ -1043,29 +1044,36 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
}
/* get the total error counts on all IPs */
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
- bool is_ce)
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+ unsigned long *ce_count,
+ unsigned long *ue_count)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
- struct ras_err_data data = {0, 0};
+ unsigned long ce, ue;
if (!adev->ras_enabled || !con)
- return 0;
+ return;
+ ce = 0;
+ ue = 0;
list_for_each_entry(obj, &con->head, node) {
struct ras_query_if info = {
.head = obj->head,
};
if (amdgpu_ras_query_error_status(adev, &info))
- return 0;
+ return;
- data.ce_count += info.ce_count;
- data.ue_count += info.ue_count;
+ ce += info.ce_count;
+ ue += info.ue_count;
}
- return is_ce ? data.ce_count : data.ue_count;
+ if (ce_count)
+ *ce_count = ce;
+
+ if (ue_count)
+ *ue_count = ue;
}
/* query/inject/cure end */
@@ -2109,6 +2117,30 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
adev->ras_hw_enabled & amdgpu_ras_mask;
}
+static void amdgpu_ras_counte_dw(struct work_struct *work)
+{
+ struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
+ ras_counte_delay_work.work);
+ struct amdgpu_device *adev = con->adev;
+ struct drm_device *dev = &adev->ddev;
+ unsigned long ce_count, ue_count;
+ int res;
+
+ res = pm_runtime_get_sync(dev->dev);
+ if (res < 0)
+ goto Out;
+
+ /* Cache new values.
+ */
+ amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);
+ atomic_set(&con->ras_ce_count, ce_count);
+ atomic_set(&con->ras_ue_count, ue_count);
+
+ pm_runtime_mark_last_busy(dev->dev);
+Out:
+ pm_runtime_put_autosuspend(dev->dev);
+}
+
int amdgpu_ras_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -2123,6 +2155,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
if (!con)
return -ENOMEM;
+ con->adev = adev;
+ INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
+ atomic_set(&con->ras_ce_count, 0);
+ atomic_set(&con->ras_ue_count, 0);
+
con->objs = (struct ras_manager *)(con + 1);
amdgpu_ras_set_context(adev, con);
@@ -2226,6 +2263,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
struct ras_fs_if *fs_info,
struct ras_ih_if *ih_info)
{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ unsigned long ue_count, ce_count;
int r;
/* disable RAS feature per IP block if it is not supported */
@@ -2266,6 +2305,12 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
if (r)
goto sysfs;
+ /* Those are the cached values at init.
+ */
+ amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);
+ atomic_set(&con->ras_ce_count, ce_count);
+ atomic_set(&con->ras_ue_count, ue_count);
+
return 0;
cleanup:
amdgpu_ras_sysfs_remove(adev, ras_block);
@@ -2384,6 +2429,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
if (con->features)
amdgpu_ras_disable_all_features(adev, 1);
+ cancel_delayed_work_sync(&con->ras_counte_delay_work);
+
amdgpu_ras_set_context(adev, NULL);
kfree(con);