summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlex Sierra <alex.sierra@amd.com>2021-10-29 21:30:40 +0300
committerAlex Deucher <alexander.deucher@amd.com>2021-11-05 21:10:58 +0300
commita6283010e2907a5576f96b839e1a1c82659f137c (patch)
tree91350b0345f7beb39849c6e8b3d289237eae1216
parent25a1a08fe79be6ef00e1393b1f5545f6ba62919f (diff)
downloadlinux-a6283010e2907a5576f96b839e1a1c82659f137c.tar.xz
drm/amdkfd: avoid recursive lock in migrations back to RAM
[Why]: When we call hmm_range_fault to map memory after a migration, we don't expect memory to be migrated again as a result of hmm_range_fault. The driver ensures that all memory is in GPU-accessible locations so that no migration should be needed. However, there is one corner case where hmm_range_fault can unexpectedly cause a migration from DEVICE_PRIVATE back to system memory due to a write-fault when a system memory page in the same range was mapped read-only (e.g. COW). Ranges with individual pages in different locations are usually the result of failed page migrations (e.g. page lock contention). The unexpected migration back to system memory causes a deadlock from recursive locking in our driver. [How]: Creating a task reference new member under svm_range_list struct. Setting this with "current" reference, right before the hmm_range_fault is called. This member is checked against "current" reference at svm_migrate_to_ram callback function. If equal, the migration will be ignored. Signed-off-by: Alex Sierra <alex.sierra@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_migrate.c5
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_priv.h1
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_svm.c2
3 files changed, 8 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index cc1525095937..aeade32ec298 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -861,6 +861,11 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
pr_debug("failed find process at fault address 0x%lx\n", addr);
return VM_FAULT_SIGBUS;
}
+ if (READ_ONCE(p->svms.faulting_task) == current) {
+ pr_debug("skipping ram migration\n");
+ kfd_unref_process(p);
+ return 0;
+ }
addr >>= PAGE_SHIFT;
pr_debug("CPU page fault svms 0x%p address 0x%lx\n", &p->svms, addr);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 4104b167e721..98f7503bc106 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -768,6 +768,7 @@ struct svm_range_list {
atomic_t evicted_ranges;
struct delayed_work restore_work;
DECLARE_BITMAP(bitmap_supported, MAX_GPU_INSTANCE);
+ struct task_struct *faulting_task;
};
/* Process data */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index a2000a12bed4..381d8d27a55a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1496,9 +1496,11 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
next = min(vma->vm_end, end);
npages = (next - addr) >> PAGE_SHIFT;
+ WRITE_ONCE(p->svms.faulting_task, current);
r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
addr, npages, &hmm_range,
readonly, true, owner);
+ WRITE_ONCE(p->svms.faulting_task, NULL);
if (r) {
pr_debug("failed %d to get svm range pages\n", r);
goto unreserve_out;