From 663b0f1e141dc60ce6c09ae6afc5f213b22d13ca Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Fri, 16 Feb 2024 11:00:10 -0500 Subject: drm/amdkfd: Document and define SVM events message macro Document how to use SMI system management interface to enable and receive SVM events. Document SVM event triggers. Define SVM events message string format macro that could be used by user mode for sscanf to parse the event. Add it to uAPI header file to make it obvious that is changing uAPI in future. No functional changes. Signed-off-by: Philip Yang Reviewed-by: James Zhu Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 100 +++++++++++++++++++++++++++++++++++------ 1 file changed, 87 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 71a7ce5f2d4c..717307d6b5b7 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -540,26 +540,29 @@ enum kfd_smi_event { KFD_SMI_EVENT_ALL_PROCESS = 64 }; +/* The reason of the page migration event */ enum KFD_MIGRATE_TRIGGERS { - KFD_MIGRATE_TRIGGER_PREFETCH, - KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, - KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU, - KFD_MIGRATE_TRIGGER_TTM_EVICTION + KFD_MIGRATE_TRIGGER_PREFETCH, /* Prefetch to GPU VRAM or system memory */ + KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, /* GPU page fault recover */ + KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU, /* CPU page fault recover */ + KFD_MIGRATE_TRIGGER_TTM_EVICTION /* TTM eviction */ }; +/* The reason of user queue evition event */ enum KFD_QUEUE_EVICTION_TRIGGERS { - KFD_QUEUE_EVICTION_TRIGGER_SVM, - KFD_QUEUE_EVICTION_TRIGGER_USERPTR, - KFD_QUEUE_EVICTION_TRIGGER_TTM, - KFD_QUEUE_EVICTION_TRIGGER_SUSPEND, - KFD_QUEUE_EVICTION_CRIU_CHECKPOINT, - KFD_QUEUE_EVICTION_CRIU_RESTORE + KFD_QUEUE_EVICTION_TRIGGER_SVM, /* SVM buffer migration */ + KFD_QUEUE_EVICTION_TRIGGER_USERPTR, /* userptr movement */ + KFD_QUEUE_EVICTION_TRIGGER_TTM, /* TTM move buffer */ + KFD_QUEUE_EVICTION_TRIGGER_SUSPEND, /* GPU suspend */ + KFD_QUEUE_EVICTION_CRIU_CHECKPOINT, /* CRIU checkpoint */ + KFD_QUEUE_EVICTION_CRIU_RESTORE /* CRIU restore */ }; +/* The reason of unmap buffer from GPU event */ enum KFD_SVM_UNMAP_TRIGGERS { - KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY, - KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE, - KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU + KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY, /* MMU notifier CPU buffer movement */ + KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE,/* MMU notifier page migration */ + KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU /* Unmap to free the buffer */ }; #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1)) @@ -570,6 +573,77 @@ struct kfd_ioctl_smi_events_args { __u32 anon_fd; /* from KFD */ }; +/* + * SVM event tracing via SMI system management interface + * + * Open event file descriptor + * use ioctl AMDKFD_IOC_SMI_EVENTS, pass in gpuid and return a anonymous file + * descriptor to receive SMI events. + * If calling with sudo permission, then file descriptor can be used to receive + * SVM events from all processes, otherwise, to only receive SVM events of same + * process. + * + * To enable the SVM event + * Write event file descriptor with KFD_SMI_EVENT_MASK_FROM_INDEX(event) bitmap + * mask to start record the event to the kfifo, use bitmap mask combination + * for multiple events. New event mask will overwrite the previous event mask. + * KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_ALL_PROCESS) bit requires sudo + * permisson to receive SVM events from all process. + * + * To receive the event + * Application can poll file descriptor to wait for the events, then read event + * from the file into a buffer. Each event is one line string message, starting + * with the event id, then the event specific information. + * + * To decode event information + * The following event format string macro can be used with sscanf to decode + * the specific event information. + * event triggers: the reason to generate the event, defined as enum for unmap, + * eviction and migrate events. + * node, from, to, prefetch_loc, preferred_loc: GPU ID, or 0 for system memory. + * addr: user mode address, in pages + * size: in pages + * pid: the process ID to generate the event + * ns: timestamp in nanosecond-resolution, starts at system boot time but + * stops during suspend + * migrate_update: GPU page fault is recovered by 'M' for migrate, 'U' for update + * rw: 'W' for write page fault, 'R' for read page fault + * rescheduled: 'R' if the queue restore failed and rescheduled to try again + */ +#define KFD_EVENT_FMT_UPDATE_GPU_RESET(reset_seq_num, reset_cause)\ + "%x %s\n", (reset_seq_num), (reset_cause) + +#define KFD_EVENT_FMT_THERMAL_THROTTLING(bitmask, counter)\ + "%llx:%llx\n", (bitmask), (counter) + +#define KFD_EVENT_FMT_VMFAULT(pid, task_name)\ + "%x:%s\n", (pid), (task_name) + +#define KFD_EVENT_FMT_PAGEFAULT_START(ns, pid, addr, node, rw)\ + "%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr), (node), (rw) + +#define KFD_EVENT_FMT_PAGEFAULT_END(ns, pid, addr, node, migrate_update)\ + "%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr), (node), (migrate_update) + +#define KFD_EVENT_FMT_MIGRATE_START(ns, pid, start, size, from, to, prefetch_loc,\ + preferred_loc, migrate_trigger)\ + "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", (ns), (pid), (start), (size),\ + (from), (to), (prefetch_loc), (preferred_loc), (migrate_trigger) + +#define KFD_EVENT_FMT_MIGRATE_END(ns, pid, start, size, from, to, migrate_trigger)\ + "%lld -%d @%lx(%lx) %x->%x %d\n", (ns), (pid), (start), (size),\ + (from), (to), (migrate_trigger) + +#define KFD_EVENT_FMT_QUEUE_EVICTION(ns, pid, node, evict_trigger)\ + "%lld -%d %x %d\n", (ns), (pid), (node), (evict_trigger) + +#define KFD_EVENT_FMT_QUEUE_RESTORE(ns, pid, node, rescheduled)\ + "%lld -%d %x %c\n", (ns), (pid), (node), (rescheduled) + +#define KFD_EVENT_FMT_UNMAP_FROM_GPU(ns, pid, addr, size, node, unmap_trigger)\ + "%lld -%d @%lx(%lx) %x %d\n", (ns), (pid), (addr), (size),\ + (node), (unmap_trigger) + /************************************************************************************************** * CRIU IOCTLs (Checkpoint Restore In Userspace) * -- cgit v1.2.3 From b2d4da31a1f40b05a61076efd4c79b88439003b7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 2 Aug 2024 09:56:28 -0400 Subject: drm: new helper: drm_gem_prime_handle_to_dmabuf() Once something had been put into descriptor table, the only thing you can do with it is returning descriptor to userland - you can't withdraw it on subsequent failure exit, etc. You certainly can't count upon it staying in the same slot of descriptor table - another thread could've played with close(2)/dup2(2)/whatnot. drm_gem_prime_handle_to_fd() creates a dmabuf, allocates a descriptor and attaches dmabuf's file to it (the last two steps are done in dma_buf_fd()). That's nice when all you are going to do is passing a descriptor to userland. If you just need to work with the resulting object or have something else to be done that might fail, drm_gem_prime_handle_to_fd() is racy. The problem is analogous to one with anon_inode_getfd(), and solution is similar to what anon_inode_getfile() provides. Add drm_gem_prime_handle_to_dmabuf() - the "set dmabuf up" parts of drm_gem_prime_handle_to_fd() without the descriptor-related ones. Instead of inserting into descriptor table and returning the file descriptor it just returns the struct file. drm_gem_prime_handle_to_fd() becomes a wrapper for it. Other users will be introduced in the next commit. Acked-by: Thomas Zimmermann Signed-off-by: Al Viro Signed-off-by: Alex Deucher --- drivers/gpu/drm/drm_prime.c | 84 +++++++++++++++++++++++++++++---------------- include/drm/drm_prime.h | 3 ++ 2 files changed, 57 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c index 03bd3c7bd0dc..0e3f8adf162f 100644 --- a/drivers/gpu/drm/drm_prime.c +++ b/drivers/gpu/drm/drm_prime.c @@ -410,22 +410,30 @@ static struct dma_buf *export_and_register_object(struct drm_device *dev, } /** - * drm_gem_prime_handle_to_fd - PRIME export function for GEM drivers + * drm_gem_prime_handle_to_dmabuf - PRIME export function for GEM drivers * @dev: dev to export the buffer from * @file_priv: drm file-private structure * @handle: buffer handle to export * @flags: flags like DRM_CLOEXEC - * @prime_fd: pointer to storage for the fd id of the create dma-buf * * This is the PRIME export function which must be used mandatorily by GEM * drivers to ensure correct lifetime management of the underlying GEM object. * The actual exporting from GEM object to a dma-buf is done through the * &drm_gem_object_funcs.export callback. + * + * Unlike drm_gem_prime_handle_to_fd(), it returns the struct dma_buf it + * has created, without attaching it to any file descriptors. The difference + * between those two is similar to that between anon_inode_getfile() and + * anon_inode_getfd(); insertion into descriptor table is something you + * can not revert if any cleanup is needed, so the descriptor-returning + * variants should only be used when you are past the last failure exit + * and the only thing left is passing the new file descriptor to userland. + * When all you need is the object itself or when you need to do something + * else that might fail, use that one instead. */ -int drm_gem_prime_handle_to_fd(struct drm_device *dev, +struct dma_buf *drm_gem_prime_handle_to_dmabuf(struct drm_device *dev, struct drm_file *file_priv, uint32_t handle, - uint32_t flags, - int *prime_fd) + uint32_t flags) { struct drm_gem_object *obj; int ret = 0; @@ -434,14 +442,14 @@ int drm_gem_prime_handle_to_fd(struct drm_device *dev, mutex_lock(&file_priv->prime.lock); obj = drm_gem_object_lookup(file_priv, handle); if (!obj) { - ret = -ENOENT; + dmabuf = ERR_PTR(-ENOENT); goto out_unlock; } dmabuf = drm_prime_lookup_buf_by_handle(&file_priv->prime, handle); if (dmabuf) { get_dma_buf(dmabuf); - goto out_have_handle; + goto out; } mutex_lock(&dev->object_name_lock); @@ -463,7 +471,6 @@ int drm_gem_prime_handle_to_fd(struct drm_device *dev, /* normally the created dma-buf takes ownership of the ref, * but if that fails then drop the ref */ - ret = PTR_ERR(dmabuf); mutex_unlock(&dev->object_name_lock); goto out; } @@ -478,34 +485,51 @@ out_have_obj: ret = drm_prime_add_buf_handle(&file_priv->prime, dmabuf, handle); mutex_unlock(&dev->object_name_lock); - if (ret) - goto fail_put_dmabuf; - -out_have_handle: - ret = dma_buf_fd(dmabuf, flags); - /* - * We must _not_ remove the buffer from the handle cache since the newly - * created dma buf is already linked in the global obj->dma_buf pointer, - * and that is invariant as long as a userspace gem handle exists. - * Closing the handle will clean out the cache anyway, so we don't leak. - */ - if (ret < 0) { - goto fail_put_dmabuf; - } else { - *prime_fd = ret; - ret = 0; + if (ret) { + dma_buf_put(dmabuf); + dmabuf = ERR_PTR(ret); } - - goto out; - -fail_put_dmabuf: - dma_buf_put(dmabuf); out: drm_gem_object_put(obj); out_unlock: mutex_unlock(&file_priv->prime.lock); + return dmabuf; +} +EXPORT_SYMBOL(drm_gem_prime_handle_to_dmabuf); - return ret; +/** + * drm_gem_prime_handle_to_fd - PRIME export function for GEM drivers + * @dev: dev to export the buffer from + * @file_priv: drm file-private structure + * @handle: buffer handle to export + * @flags: flags like DRM_CLOEXEC + * @prime_fd: pointer to storage for the fd id of the create dma-buf + * + * This is the PRIME export function which must be used mandatorily by GEM + * drivers to ensure correct lifetime management of the underlying GEM object. + * The actual exporting from GEM object to a dma-buf is done through the + * &drm_gem_object_funcs.export callback. + */ +int drm_gem_prime_handle_to_fd(struct drm_device *dev, + struct drm_file *file_priv, uint32_t handle, + uint32_t flags, + int *prime_fd) +{ + struct dma_buf *dmabuf; + int fd = get_unused_fd_flags(flags); + + if (fd < 0) + return fd; + + dmabuf = drm_gem_prime_handle_to_dmabuf(dev, file_priv, handle, flags); + if (IS_ERR(dmabuf)) { + put_unused_fd(fd); + return PTR_ERR(dmabuf); + } + + fd_install(fd, dmabuf->file); + *prime_fd = fd; + return 0; } EXPORT_SYMBOL(drm_gem_prime_handle_to_fd); diff --git a/include/drm/drm_prime.h b/include/drm/drm_prime.h index 2a1d01e5b56b..fa085c44d4ca 100644 --- a/include/drm/drm_prime.h +++ b/include/drm/drm_prime.h @@ -69,6 +69,9 @@ void drm_gem_dmabuf_release(struct dma_buf *dma_buf); int drm_gem_prime_fd_to_handle(struct drm_device *dev, struct drm_file *file_priv, int prime_fd, uint32_t *handle); +struct dma_buf *drm_gem_prime_handle_to_dmabuf(struct drm_device *dev, + struct drm_file *file_priv, uint32_t handle, + uint32_t flags); int drm_gem_prime_handle_to_fd(struct drm_device *dev, struct drm_file *file_priv, uint32_t handle, uint32_t flags, int *prime_fd); -- cgit v1.2.3