diff options
author | farah kassabri <fkassabri@habana.ai> | 2021-10-24 19:02:32 +0300 |
---|---|---|
committer | Oded Gabbay <ogabbay@kernel.org> | 2021-12-26 09:59:05 +0300 |
commit | 49c052dad691ba1a3dc3559b74e99f2ec2fa0319 (patch) | |
tree | 6499d85f1144eb3786a64e4da547a4102386536f | |
parent | d4194f21400e9b2caef2d48c63ec5ef102eead22 (diff) | |
download | linux-49c052dad691ba1a3dc3559b74e99f2ec2fa0319.tar.xz |
habanalabs: add new opcodes for INFO IOCTL
Add implementation for new opcodes in the INFO IOCTL:
1. Retrieve the replaced DRAM rows from f/w.
2. Retrieve the pending DRAM rows from f/w.
Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
-rw-r--r-- | drivers/misc/habanalabs/common/firmware_if.c | 66 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs.h | 3 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs_ioctl.c | 43 | ||||
-rw-r--r-- | drivers/misc/habanalabs/include/common/cpucp_if.h | 33 | ||||
-rw-r--r-- | include/uapi/misc/habanalabs.h | 4 |
5 files changed, 148 insertions, 1 deletions
diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 9addcfba6a8b..70e992bdbde7 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -972,6 +972,72 @@ int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power) return rc; } +int hl_fw_dram_replaced_row_get(struct hl_device *hdev, + struct cpucp_hbm_row_info *info) +{ + struct cpucp_hbm_row_info *cpucp_repl_rows_info_cpu_addr; + dma_addr_t cpucp_repl_rows_info_dma_addr; + struct cpucp_packet pkt = {}; + u64 result; + int rc; + + cpucp_repl_rows_info_cpu_addr = + hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, + sizeof(struct cpucp_hbm_row_info), + &cpucp_repl_rows_info_dma_addr); + if (!cpucp_repl_rows_info_cpu_addr) { + dev_err(hdev->dev, + "Failed to allocate DMA memory for CPU-CP replaced rows info packet\n"); + return -ENOMEM; + } + + memset(cpucp_repl_rows_info_cpu_addr, 0, sizeof(struct cpucp_hbm_row_info)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + pkt.addr = cpu_to_le64(cpucp_repl_rows_info_dma_addr); + pkt.data_max_size = cpu_to_le32(sizeof(struct cpucp_hbm_row_info)); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), + HL_CPUCP_INFO_TIMEOUT_USEC, &result); + if (rc) { + dev_err(hdev->dev, + "Failed to handle CPU-CP replaced rows info pkt, error %d\n", rc); + goto out; + } + + memcpy(info, cpucp_repl_rows_info_cpu_addr, sizeof(*info)); + +out: + hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, + sizeof(struct cpucp_hbm_row_info), + cpucp_repl_rows_info_cpu_addr); + + return rc; +} + +int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num) +{ + struct cpucp_packet pkt; + u64 result; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.ctl = cpu_to_le32(CPUCP_PACKET_HBM_PENDING_ROWS_STATUS << CPUCP_PKT_CTL_OPCODE_SHIFT); + + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result); + if (rc) { + dev_err(hdev->dev, + "Failed to handle CPU-CP pending rows info pkt, error %d\n", rc); + goto out; + } + + *pend_rows_num = (u32) result; +out: + return rc; +} + void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev) { struct static_fw_load_mgr *static_loader = diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index fc201537f7a9..a19563c416ac 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -3012,6 +3012,9 @@ int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev, struct fw_load_mgr *fw_loader, enum comms_cmd cmd, unsigned int size, bool wait_ok, u32 timeout); +int hl_fw_dram_replaced_row_get(struct hl_device *hdev, + struct cpucp_hbm_row_info *info); +int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num); int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3], bool is_wc[3]); int hl_pci_elbi_read(struct hl_device *hdev, u64 addr, u32 *data); diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 19726c6b642a..68c655acdec8 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -503,6 +503,43 @@ static int open_stats_info(struct hl_fpriv *hpriv, struct hl_info_args *args) min((size_t) max_size, sizeof(open_stats_info))) ? -EFAULT : 0; } +static int dram_pending_rows_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + u32 pend_rows_num = 0; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + int rc; + + if ((!max_size) || (!out)) + return -EINVAL; + + rc = hl_fw_dram_pending_row_get(hdev, &pend_rows_num); + if (rc) + return rc; + + return copy_to_user(out, &pend_rows_num, + min_t(size_t, max_size, sizeof(pend_rows_num))) ? -EFAULT : 0; +} + +static int dram_replaced_rows_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + struct cpucp_hbm_row_info info = {0}; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + int rc; + + if ((!max_size) || (!out)) + return -EINVAL; + + rc = hl_fw_dram_replaced_row_get(hdev, &info); + if (rc) + return rc; + + return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0; +} + static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, struct device *dev) { @@ -589,6 +626,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_OPEN_STATS: return open_stats_info(hpriv, args); + case HL_INFO_DRAM_REPLACED_ROWS: + return dram_replaced_rows_info(hpriv, args); + + case HL_INFO_DRAM_PENDING_ROWS: + return dram_pending_rows_info(hpriv, args); + default: dev_err(dev, "Invalid request %d\n", args->op); rc = -ENOTTY; diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h index 17927968e19a..5e19c763f3f0 100644 --- a/drivers/misc/habanalabs/include/common/cpucp_if.h +++ b/drivers/misc/habanalabs/include/common/cpucp_if.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 * - * Copyright 2020 HabanaLabs, Ltd. + * Copyright 2021 HabanaLabs, Ltd. * All Rights Reserved. * */ @@ -377,6 +377,13 @@ enum pq_init_status { * a different engine or QMAN according to enum cpucp_idle_mask. * The bit will be 1 if the engine is NOT idle. * + * CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET - + * Fetch all HBM replaced-rows and prending to be replaced rows data. + * + * CPUCP_PACKET_HBM_PENDING_ROWS_STATUS - + * Fetch status of HBM rows pending replacement and need a reboot to + * be replaced. + * * CPUCP_PACKET_POWER_SET - * Resets power history of device to 0 */ @@ -424,6 +431,8 @@ enum cpucp_packet_id { CPUCP_PACKET_NIC_STAT_REGS_CLR, /* internal */ CPUCP_PACKET_NIC_STAT_REGS_ALL_GET, /* internal */ CPUCP_PACKET_IS_IDLE_CHECK, /* internal */ + CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET,/* internal */ + CPUCP_PACKET_HBM_PENDING_ROWS_STATUS, /* internal */ CPUCP_PACKET_POWER_SET, /* internal */ }; @@ -692,6 +701,7 @@ struct eq_generic_event { #define CPUCP_MAX_NIC_LANES (CPUCP_MAX_NICS * CPUCP_LANES_PER_NIC) #define CPUCP_NIC_MASK_ARR_LEN ((CPUCP_MAX_NICS + 63) / 64) #define CPUCP_NIC_POLARITY_ARR_LEN ((CPUCP_MAX_NIC_LANES + 63) / 64) +#define CPUCP_HBM_ROW_REPLACE_MAX 32 struct cpucp_sensor { __le32 type; @@ -837,4 +847,25 @@ struct cpucp_nic_status { __le32 high_ber_cnt; }; +enum cpucp_hbm_row_replace_cause { + REPLACE_CAUSE_DOUBLE_ECC_ERR, + REPLACE_CAUSE_MULTI_SINGLE_ECC_ERR, +}; + +struct cpucp_hbm_row_info { + __u8 hbm_idx; + __u8 pc; + __u8 sid; + __u8 bank_idx; + __le16 row_addr; + __u8 replaced_row_cause; /* enum cpucp_hbm_row_replace_cause */ + __u8 pad; +}; + +struct cpucp_hbm_row_replaced_rows_info { + __le16 num_replaced_rows; + __u8 pad[6]; + struct cpucp_hbm_row_info replaced_rows[CPUCP_HBM_ROW_REPLACE_MAX]; +}; + #endif /* CPUCP_IF_H */ diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 257b9630773e..9b4d72897061 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -334,6 +334,8 @@ enum hl_server_type { * HL_INFO_TOTAL_ENERGY - Retrieve total energy consumption * HL_INFO_PLL_FREQUENCY - Retrieve PLL frequency * HL_INFO_OPEN_STATS - Retrieve info regarding recent device open calls + * HL_INFO_DRAM_REPLACED_ROWS - Retrieve DRAM replaced rows info + * HL_INFO_DRAM_PENDING_ROWS - Retrieve DRAM pending rows num */ #define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_EVENTS 1 @@ -353,6 +355,8 @@ enum hl_server_type { #define HL_INFO_PLL_FREQUENCY 16 #define HL_INFO_POWER 17 #define HL_INFO_OPEN_STATS 18 +#define HL_INFO_DRAM_REPLACED_ROWS 21 +#define HL_INFO_DRAM_PENDING_ROWS 22 #define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_CARD_NAME_MAX_LEN 16 |