diff options
author | Tal Cohen <talcohen@habana.ai> | 2022-04-28 13:45:18 +0300 |
---|---|---|
committer | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2022-05-22 22:01:20 +0300 |
commit | 422ef171038d4855ffe938137039a8f3b3e84293 (patch) | |
tree | a061758a6048e6ed4eda70d954dd8a8a34293b6f | |
parent | f2daa2d97ec16766bc36fc2b6218d0acb47885ed (diff) | |
download | linux-422ef171038d4855ffe938137039a8f3b3e84293.tar.xz |
habanalabs: add support for notification via eventfd
The driver will be able to send notification events towards
a user process, using user's registered event file descriptor.
The driver uses the notification mechanism to inform the
user about an occurred event.
A user thread can wait until a notification is received from
the driver.
The driver stores the occurred event until the user reads it,
using HL_INFO_GET_EVENTS - new ioctl opcode in the INFO ioctl.
Gaudi specific implementation includes sending a notification
on a TPC assertion event that is received from f/w.
Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r-- | drivers/misc/habanalabs/common/device.c | 52 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs.h | 40 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs_drv.c | 9 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs_ioctl.c | 65 | ||||
-rw-r--r-- | drivers/misc/habanalabs/gaudi/gaudi.c | 14 | ||||
-rw-r--r-- | include/uapi/misc/habanalabs.h | 15 |
6 files changed, 182 insertions, 13 deletions
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 15df89b31e1b..315510aaca35 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -285,6 +285,14 @@ static void hpriv_release(struct kref *ref) hdev->compute_ctx_in_release = 0; + /* release the eventfd */ + if (hpriv->notifier_event.eventfd) { + eventfd_ctx_put(hpriv->notifier_event.eventfd); + hpriv->notifier_event.eventfd = 0; + } + + mutex_destroy(&hpriv->notifier_event.lock); + kfree(hpriv); } @@ -355,6 +363,13 @@ static int hl_device_release_ctrl(struct inode *inode, struct file *filp) list_del(&hpriv->dev_node); mutex_unlock(&hdev->fpriv_ctrl_list_lock); out: + /* release the eventfd */ + if (hpriv->notifier_event.eventfd) { + eventfd_ctx_put(hpriv->notifier_event.eventfd); + hpriv->notifier_event.eventfd = 0; + } + + mutex_destroy(&hpriv->notifier_event.lock); put_pid(hpriv->taskpid); kfree(hpriv); @@ -1506,6 +1521,43 @@ out_err: return rc; } +static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event) +{ + mutex_lock(¬ifier_event->lock); + notifier_event->events_mask |= event; + if (notifier_event->eventfd) + eventfd_signal(notifier_event->eventfd, 1); + + mutex_unlock(¬ifier_event->lock); +} + +/* + * hl_notifier_event_send_all - notify all user processes via eventfd + * + * @hdev: pointer to habanalabs device structure + * @event: the occurred event + * Returns 0 for success or an error on failure. + */ +void hl_notifier_event_send_all(struct hl_device *hdev, u64 event) +{ + struct hl_fpriv *hpriv; + + mutex_lock(&hdev->fpriv_list_lock); + + list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) + hl_notifier_event_send(&hpriv->notifier_event, event); + + mutex_unlock(&hdev->fpriv_list_lock); + + /* control device */ + mutex_lock(&hdev->fpriv_ctrl_list_lock); + + list_for_each_entry(hpriv, &hdev->fpriv_ctrl_list, dev_node) + hl_notifier_event_send(&hpriv->notifier_event, event); + + mutex_unlock(&hdev->fpriv_ctrl_list_lock); +} + /* * hl_device_init - main initialization function for habanalabs device * diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 918e8a04acab..8977ec67dba7 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -21,6 +21,7 @@ #include <linux/hashtable.h> #include <linux/debugfs.h> #include <linux/rwsem.h> +#include <linux/eventfd.h> #include <linux/bitfield.h> #include <linux/genalloc.h> #include <linux/sched/signal.h> @@ -1932,6 +1933,18 @@ struct hl_debug_params { bool enable; }; +/** + * struct hl_notifier_event - holds the notifier data structure + * @eventfd: the event file descriptor to raise the notifications + * @lock: mutex lock to protect the notifier data flows + * @events_mask: indicates the bitmap events + */ +struct hl_notifier_event { + struct eventfd_ctx *eventfd; + struct mutex lock; + u64 events_mask; +}; + /* * FILE PRIVATE STRUCTURE */ @@ -1943,24 +1956,25 @@ struct hl_debug_params { * @taskpid: current process ID. * @ctx: current executing context. TODO: remove for multiple ctx per process * @ctx_mgr: context manager to handle multiple context for this FD. - * @cb_mgr: command buffer manager to handle multiple buffers for this FD. * @mem_mgr: manager descriptor for memory exportable via mmap + * @notifier_event: notifier eventfd towards user process * @debugfs_list: list of relevant ASIC debugfs. * @dev_node: node in the device list of file private data * @refcount: number of related contexts. * @restore_phase_mutex: lock for context switch and restore phase. */ struct hl_fpriv { - struct hl_device *hdev; - struct file *filp; - struct pid *taskpid; - struct hl_ctx *ctx; - struct hl_ctx_mgr ctx_mgr; - struct hl_mem_mgr mem_mgr; - struct list_head debugfs_list; - struct list_head dev_node; - struct kref refcount; - struct mutex restore_phase_mutex; + struct hl_device *hdev; + struct file *filp; + struct pid *taskpid; + struct hl_ctx *ctx; + struct hl_ctx_mgr ctx_mgr; + struct hl_mem_mgr mem_mgr; + struct hl_notifier_event notifier_event; + struct list_head debugfs_list; + struct list_head dev_node; + struct kref refcount; + struct mutex restore_phase_mutex; }; @@ -2676,8 +2690,8 @@ struct hl_reset_info { * @state_dump_specs: constants and dictionaries needed to dump system state. * @multi_cs_completion: array of multi-CS completion. * @clk_throttling: holds information about current/previous clock throttling events - * @reset_info: holds current device reset information. * @last_error: holds information about last session in which CS timeout or razwi error occurred. + * @reset_info: holds current device reset information. * @stream_master_qid_arr: pointer to array with QIDs of master streams. * @fw_major_version: major version of current loaded preboot * @dram_used_mem: current DRAM memory consumption. @@ -3071,6 +3085,8 @@ int hl_device_utilization(struct hl_device *hdev, u32 *utilization); int hl_build_hwmon_channel_info(struct hl_device *hdev, struct cpucp_sensor *sensors_arr); +void hl_notifier_event_send_all(struct hl_device *hdev, u64 event); + int hl_sysfs_init(struct hl_device *hdev); void hl_sysfs_fini(struct hl_device *hdev); diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index 1210de39d661..c97173e9507d 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -134,6 +134,10 @@ int hl_device_open(struct inode *inode, struct file *filp) hpriv->hdev = hdev; filp->private_data = hpriv; hpriv->filp = filp; + hpriv->notifier_event.events_mask = 0; + hpriv->notifier_event.eventfd = 0; + + mutex_init(&hpriv->notifier_event.lock); mutex_init(&hpriv->restore_phase_mutex); kref_init(&hpriv->refcount); nonseekable_open(inode, filp); @@ -208,6 +212,7 @@ out_err: hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr); filp->private_data = NULL; mutex_destroy(&hpriv->restore_phase_mutex); + mutex_destroy(&hpriv->notifier_event.lock); put_pid(hpriv->taskpid); kfree(hpriv); @@ -241,6 +246,10 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp) hpriv->hdev = hdev; filp->private_data = hpriv; hpriv->filp = filp; + hpriv->notifier_event.events_mask = 0; + hpriv->notifier_event.eventfd = 0; + + mutex_init(&hpriv->notifier_event.lock); nonseekable_open(inode, filp); hpriv->taskpid = get_task_pid(current, PIDTYPE_PID); diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index bfb5cfe68110..d1ef56a8d3ac 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -116,6 +116,25 @@ static int hw_events_info(struct hl_device *hdev, bool aggregate, return copy_to_user(out, arr, min(max_size, size)) ? -EFAULT : 0; } +static int events_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + int rc; + u32 max_size = args->return_size; + u64 events_mask; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((max_size < sizeof(u64)) || (!out)) + return -EINVAL; + + mutex_lock(&hpriv->notifier_event.lock); + events_mask = hpriv->notifier_event.events_mask; + hpriv->notifier_event.events_mask = 0; + mutex_unlock(&hpriv->notifier_event.lock); + + rc = copy_to_user(out, &events_mask, sizeof(u64)); + return rc; +} + static int dram_usage_info(struct hl_fpriv *hpriv, struct hl_info_args *args) { struct hl_device *hdev = hpriv->hdev; @@ -614,6 +633,43 @@ static int dev_mem_alloc_page_sizes_info(struct hl_fpriv *hpriv, struct hl_info_ return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0; } +static int eventfd_register(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + int rc; + + /* check if there is already a registered on that process */ + mutex_lock(&hpriv->notifier_event.lock); + if (hpriv->notifier_event.eventfd) { + mutex_unlock(&hpriv->notifier_event.lock); + return -EINVAL; + } + + hpriv->notifier_event.eventfd = eventfd_ctx_fdget(args->eventfd); + if (IS_ERR(hpriv->notifier_event.eventfd)) { + rc = PTR_ERR(hpriv->notifier_event.eventfd); + hpriv->notifier_event.eventfd = 0; + mutex_unlock(&hpriv->notifier_event.lock); + return rc; + } + + mutex_unlock(&hpriv->notifier_event.lock); + return 0; +} + +static int eventfd_unregister(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + mutex_lock(&hpriv->notifier_event.lock); + if (!hpriv->notifier_event.eventfd) { + mutex_unlock(&hpriv->notifier_event.lock); + return -EINVAL; + } + + eventfd_ctx_put(hpriv->notifier_event.eventfd); + hpriv->notifier_event.eventfd = 0; + mutex_unlock(&hpriv->notifier_event.lock); + return 0; +} + static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, struct device *dev) { @@ -667,6 +723,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES: return dev_mem_alloc_page_sizes_info(hpriv, args); + case HL_INFO_GET_EVENTS: + return events_info(hpriv, args); + default: break; } @@ -717,6 +776,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_DRAM_PENDING_ROWS: return dram_pending_rows_info(hpriv, args); + case HL_INFO_REGISTER_EVENTFD: + return eventfd_register(hpriv, args); + + case HL_INFO_UNREGISTER_EVENTFD: + return eventfd_unregister(hpriv, args); + default: dev_err(dev, "Invalid request %d\n", args->op); rc = -EINVAL; diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 08cd60300b4f..1c388537de33 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -7879,7 +7879,6 @@ static void gaudi_handle_eqe(struct hl_device *hdev, case GAUDI_EVENT_MMU_PAGE_FAULT: case GAUDI_EVENT_MMU_WR_PERM: case GAUDI_EVENT_RAZWI_OR_ADC: - case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM: case GAUDI_EVENT_MME0_QM ... GAUDI_EVENT_MME2_QM: case GAUDI_EVENT_DMA0_QM ... GAUDI_EVENT_DMA7_QM: fallthrough; @@ -7899,6 +7898,19 @@ static void gaudi_handle_eqe(struct hl_device *hdev, hl_fw_unmask_irq(hdev, event_type); break; + case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM: + gaudi_print_irq_info(hdev, event_type, true); + gaudi_handle_qman_err(hdev, event_type); + hl_fw_unmask_irq(hdev, event_type); + + /* In TPC QM event, notify on TPC assertion. While there isn't + * a specific event for assertion yet, the FW generates QM event. + * The SW upper layer will inspect an internal mapped area to indicate + * if the event is a tpc assertion or tpc QM. + */ + hl_notifier_event_send_all(hdev, HL_NOTIFIER_EVENT_TPC_ASSERT); + break; + case GAUDI_EVENT_RAZWI_OR_ADC_SW: gaudi_print_irq_info(hdev, event_type, true); goto reset_device; diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 3576bf2b4841..52540d5b4fc9 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -349,6 +349,9 @@ enum hl_server_type { * Razwi initiator. * Razwi cause, was it a page fault or MMU access error. * HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES - Retrieve valid page sizes for device memory allocation + * HL_INFO_REGISTER_EVENTFD - Register eventfd for event notifications. + * HL_INFO_UNREGISTER_EVENTFD - Unregister eventfd + * HL_INFO_GET_EVENTS - Retrieve the last occurred events */ #define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_EVENTS 1 @@ -374,6 +377,9 @@ enum hl_server_type { #define HL_INFO_CS_TIMEOUT_EVENT 24 #define HL_INFO_RAZWI_EVENT 25 #define HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES 26 +#define HL_INFO_REGISTER_EVENTFD 28 +#define HL_INFO_UNREGISTER_EVENTFD 29 +#define HL_INFO_GET_EVENTS 30 #define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_CARD_NAME_MAX_LEN 16 @@ -679,6 +685,7 @@ enum gaudi_dcores { * @period_ms: Period value, in milliseconds, for utilization rate in range 100ms - 1000ms in 100 ms * resolution. Currently not in use. * @pll_index: Index as defined in hl_<asic type>_pll_index enumeration. + * @eventfd: event file descriptor for event notifications. * @pad: Padding to 64 bit. */ struct hl_info_args { @@ -691,6 +698,7 @@ struct hl_info_args { __u32 ctx_id; __u32 period_ms; __u32 pll_index; + __u32 eventfd; }; __u32 pad; @@ -1391,6 +1399,13 @@ struct hl_debug_args { }; /* + * Notifier event values - for the notification mechanism and the HL_INFO_GET_EVENTS command + * + * HL_NOTIFIER_EVENT_TPC_ASSERT - Indicates TPC assert event + */ +#define HL_NOTIFIER_EVENT_TPC_ASSERT (1 << 0) + +/* * Various information operations such as: * - H/W IP information * - Current dram usage |