summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Stanley.Yang <Stanley.Yang@amd.com> 2026-05-11 14:44:16 +0300
committer: Alex Deucher <alexander.deucher@amd.com> 2026-05-27 17:50:57 +0300
commit: 82502db448aa9eca2c8d8ee8b424f37aa9ce3ac0 (patch)
tree: be870fd9734a48c6ec226f59cb5e2032eb35f2e2
parent: a71db9b8f77bab7b730b9f00a769d8f7a9e3dafd (diff)
download: linux-82502db448aa9eca2c8d8ee8b424f37aa9ce3ac0.tar.xz
drm/amd/ras: cap pending_ecc_list size
Drop new entries once pending_ecc_count hits RAS_UMC_PENDING_ECC_MAX (8192) so an ECC storm or repeated UMC error injection cannot exhaust kernel memory. Dropped events are counted and reported via a rate-limited warning.

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r--  drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h   9
-rw-r--r--  drivers/gpu/drm/amd/ras/rascore/ras_umc.c  35
-rw-r--r--  drivers/gpu/drm/amd/ras/rascore/ras_umc.h  12
3 files changed, 56 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
index 8156531a7b63..f34dda7ce87b 100644
--- a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
@@ -46,6 +46,15 @@
printk(KERN_WARNING fmt, ##__VA_ARGS__); \
} while (0)
+#define RAS_DEV_WARN_RATELIMITED(device, fmt, ...) \
+ do { \
+ if (device) \
+ dev_warn_ratelimited(((struct amdgpu_device *)device)->dev, \
+ fmt, ##__VA_ARGS__); \
+ else \
+ printk_ratelimited(KERN_WARNING fmt, ##__VA_ARGS__); \
+ } while (0)
+
#define RAS_DEV_INFO(device, fmt, ...) \
do { \
if (device) \
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
index 91dd730de3ce..f32ee2fecf53 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
@@ -193,12 +193,29 @@ static void ras_umc_reserve_eeprom_record(struct ras_core_context *ras_core,
}
/* When gpu reset is ongoing, ecc logging operations will be pended.
+ *
+ * The pending list is bounded by RAS_UMC_PENDING_ECC_MAX so that an ECC
+ * storm or repeated UMC error injection cannot make this list (and the
+ * kernel allocations behind it) grow without bound. Once the limit is
+ * reached, additional events are dropped and counted in
+ * pending_ecc_dropped, with a rate-limited warning emitted.
*/
int ras_umc_log_bad_bank_pending(struct ras_core_context *ras_core, struct ras_bank_ecc *bank)
{
struct ras_umc *ras_umc = &ras_core->ras_umc;
struct ras_bank_ecc_node *ecc_node;
+ mutex_lock(&ras_umc->pending_ecc_lock);
+ if (ras_umc->pending_ecc_count >= RAS_UMC_PENDING_ECC_MAX) {
+ ras_umc->pending_ecc_dropped++;
+ mutex_unlock(&ras_umc->pending_ecc_lock);
+ RAS_DEV_WARN_RATELIMITED(ras_core->dev,
+ "pending ECC list full (%u), dropping bad bank event (total dropped:%u)\n",
+ RAS_UMC_PENDING_ECC_MAX, ras_umc->pending_ecc_dropped);
+ return -ENOSPC;
+ }
+ mutex_unlock(&ras_umc->pending_ecc_lock);
+
ecc_node = kzalloc_obj(*ecc_node);
if (!ecc_node)
return -ENOMEM;
@@ -206,7 +223,15 @@ int ras_umc_log_bad_bank_pending(struct ras_core_context *ras_core, struct ras_b
memcpy(&ecc_node->ecc, bank, sizeof(ecc_node->ecc));
mutex_lock(&ras_umc->pending_ecc_lock);
+ /* re-check under the lock to honor the cap across concurrent callers */
+ if (ras_umc->pending_ecc_count >= RAS_UMC_PENDING_ECC_MAX) {
+ ras_umc->pending_ecc_dropped++;
+ mutex_unlock(&ras_umc->pending_ecc_lock);
+ kfree(ecc_node);
+ return -ENOSPC;
+ }
list_add_tail(&ecc_node->node, &ras_umc->pending_ecc_list);
+ ras_umc->pending_ecc_count++;
mutex_unlock(&ras_umc->pending_ecc_lock);
return 0;
@@ -225,8 +250,16 @@ int ras_umc_log_pending_bad_bank(struct ras_core_context *ras_core)
if (!ras_umc_log_bad_bank(ras_core, &ecc_node->ecc)) {
list_del(&ecc_node->node);
kfree(ecc_node);
+ if (ras_umc->pending_ecc_count)
+ ras_umc->pending_ecc_count--;
}
}
+ if (ras_umc->pending_ecc_dropped) {
+ RAS_DEV_WARN(ras_core->dev,
+ "%u pending ECC bad-bank events were dropped during GPU reset\n",
+ ras_umc->pending_ecc_dropped);
+ ras_umc->pending_ecc_dropped = 0;
+ }
mutex_unlock(&ras_umc->pending_ecc_lock);
return 0;
@@ -609,6 +642,8 @@ int ras_umc_sw_fini(struct ras_core_context *ras_core)
list_del(&ecc_node->node);
kfree(ecc_node);
}
+ ras_umc->pending_ecc_count = 0;
+ ras_umc->pending_ecc_dropped = 0;
mutex_unlock(&ras_umc->pending_ecc_lock);
mutex_destroy(&ras_umc->tree_lock);
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
index 1d3026be509b..237525b46b9b 100644
--- a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
@@ -139,8 +139,20 @@ struct ras_umc {
struct mutex pending_ecc_lock;
struct ras_umc_err_data umc_err_data;
struct list_head pending_ecc_list;
+ /* number of entries currently queued on pending_ecc_list */
+ u32 pending_ecc_count;
+ /* number of entries dropped because pending_ecc_list was full */
+ u32 pending_ecc_dropped;
};
+/*
+ * Upper bound on entries that can be queued on pending_ecc_list while a
+ * GPU reset is in progress. Beyond this, new ECC events are dropped to
+ * prevent unbounded kernel memory growth in case of an ECC storm or
+ * malicious/repeated UMC error injection.
+ */
+#define RAS_UMC_PENDING_ECC_MAX 8192
+
int ras_umc_sw_init(struct ras_core_context *ras);
int ras_umc_sw_fini(struct ras_core_context *ras);
int ras_umc_hw_init(struct ras_core_context *ras);