diff options
author | Eric Huang <JinhuiEric.Huang@amd.com> | 2019-01-11 22:38:51 +0300 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2019-03-19 23:36:51 +0300 |
commit | 9b54d2017687df9fa827faf9e4022973b87fc0ff (patch) | |
tree | bbe1df5476b9ba2c9292279a0fa23c75262e2d04 /drivers/gpu/drm/amd/amdkfd/kfd_events.c | |
parent | 0dee45a25a63f7ded9f0f0bec41fe40e397aa438 (diff) | |
download | linux-9b54d2017687df9fa827faf9e4022973b87fc0ff.tar.xz |
drm/amdkfd: add RAS ECC event support (v3)
RAS ECC event will combine with GPU reset event, due to
ECC interrupts are caused by uncorrectable error that triggers
GPU reset.
v2: Fix misleading-indentation warning
v3: fix build with CONFIG_HSA_AMD disabled
Signed-off-by: Eric Huang <JinhuiEric.Huang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_events.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_events.c | 18 |
1 files changed, 17 insertions, 1 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index e9f0e0a1b41c..6e1d41c5bf86 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -1011,25 +1011,41 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, void kfd_signal_reset_event(struct kfd_dev *dev) { struct kfd_hsa_hw_exception_data hw_exception_data; + struct kfd_hsa_memory_exception_data memory_exception_data; struct kfd_process *p; struct kfd_event *ev; unsigned int temp; uint32_t id, idx; + int reset_cause = atomic_read(&dev->sram_ecc_flag) ? + KFD_HW_EXCEPTION_ECC : + KFD_HW_EXCEPTION_GPU_HANG; /* Whole gpu reset caused by GPU hang and memory is lost */ memset(&hw_exception_data, 0, sizeof(hw_exception_data)); hw_exception_data.gpu_id = dev->id; hw_exception_data.memory_lost = 1; + hw_exception_data.reset_cause = reset_cause; + + memset(&memory_exception_data, 0, sizeof(memory_exception_data)); + memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC; + memory_exception_data.gpu_id = dev->id; + memory_exception_data.failure.imprecise = true; idx = srcu_read_lock(&kfd_processes_srcu); hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { mutex_lock(&p->event_mutex); id = KFD_FIRST_NONSIGNAL_EVENT_ID; - idr_for_each_entry_continue(&p->event_idr, ev, id) + idr_for_each_entry_continue(&p->event_idr, ev, id) { if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) { ev->hw_exception_data = hw_exception_data; set_event(ev); } + if (ev->type == KFD_EVENT_TYPE_MEMORY && + reset_cause == KFD_HW_EXCEPTION_ECC) { + ev->memory_exception_data = memory_exception_data; + set_event(ev); + } + } mutex_unlock(&p->event_mutex); } srcu_read_unlock(&kfd_processes_srcu, idx); |