/*
 * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Descriptor layouts and constants for the DLA firmware interface:
 * processor/operation identifiers, per-processor surface, operation and
 * statistics descriptors, and the containers (unions) that hold them.
 *
 * All descriptor structs are declared __packed __aligned(4) and carry
 * explicit reserved fields, so field order and width must not be changed
 * casually.
 * NOTE(review): presumably these layouts are shared with firmware/hardware
 * and are therefore ABI -- confirm before touching any field.
 */

#ifndef __FIRMWARE_DLA_INTERFACE_H_
#define __FIRMWARE_DLA_INTERFACE_H_

/*
 * NOTE(review): the header name that should follow this #include is missing
 * in this copy of the file (it looks like an "<...>" span was stripped).
 * The fixed-width integer types and the __packed/__aligned macros used below
 * must come from it; restore the original include target before building.
 */
#include

/**
 * @ingroup Processors
 * @name DLA Processors
 * Processor modules in DLA engine. Each processor has its
 * own operation a.k.a. HW layer. Network is formed using
 * graph of these operations
 * @{
 */
#define DLA_OP_BDMA		0
#define DLA_OP_CONV		1
#define DLA_OP_SDP		2
#define DLA_OP_PDP		3
#define DLA_OP_CDP		4
#define DLA_OP_RUBIK		5
/** @} */

/**
 * @ingroup Processors
 * @name Maximum number of processors
 * @brief DLA has 6 processors
 * @{
 */
#define DLA_OP_NUM		6
/** @} */

/**
 * @ingroup Processors
 * @name Number of groups
 * @brief Each processor has 2 groups of registers
 * @{
 */
#define DLA_NUM_GROUPS		2
/** @} */

/**
 * Network descriptor
 *
 * Contains all information to execute a network
 *
 * @op_head: Index of first operation of each type in operations list
 *           (one entry per processor, DLA_OP_NUM entries)
 * @num_rois: Number of ROIs
 * @num_operations: Number of operations in one list
 * @num_luts: Number of LUTs
 *
 * NOTE(review): the remaining *_index fields are not documented here;
 * presumably they are indices into the task's address list, as stated for
 * dla_data_cube.address -- confirm against the firmware sources.
 */
struct dla_network_desc {
	int16_t operation_desc_index;
	int16_t surface_desc_index;

	int16_t dependency_graph_index;
	int16_t lut_data_index;

	int16_t roi_array_index;
	int16_t surface_index;

	int16_t stat_list_index;
	int16_t reserved1;

	int16_t op_head[DLA_OP_NUM];

	uint16_t num_rois;
	uint16_t num_operations;

	uint16_t num_luts;
	uint16_t num_addresses;

	int16_t input_layer;
	uint8_t dynamic_roi;
	uint8_t reserved0;
} __packed __aligned(4);

/**
 * @name Memory types
 * @brief DLA engine can read/write to/from 3 memory types
 * @{
 */
#define DLA_MEM_MC			0 /* External DRAM */
#define DLA_MEM_CV			1 /* CV-SRAM */
#define DLA_MEM_HW			2 /* DLA sub-module */
/** @} */

/**
 * @ingroup Events
 * @name Operation events
 * @brief Different events triggered by an operation
 * @{
 */
#define DLA_EVENT_OP_COMPLETED		1
#define DLA_EVENT_OP_PROGRAMMED		2
#define DLA_EVENT_OP_ENABLED		3
#define DLA_EVENT_CDMA_WT_DONE		4
#define DLA_EVENT_CDMA_DT_DONE		5
/** @} */

/*
 * One consumer edge of an operation: which operation depends on this one
 * and on which event (see the DLA_EVENT_* values).
 */
struct dla_consumer {
	int16_t index; /* the index of dla_common_op_desc in dep_graph_addr */
	uint8_t event;
	uint8_t res;
} __packed __aligned(4);

/*
 * Per-operation dependency information. Holds one consumer slot per
 * processor type (DLA_OP_NUM) plus a fused parent.
 */
struct dla_common_op_desc {
	int16_t index; /* set by ucode */
	int8_t roi_index;
	uint8_t op_type;

	uint8_t dependency_count;
	uint8_t reserved0[3];

	struct dla_consumer consumers[DLA_OP_NUM];
	struct dla_consumer fused_parent;
} __packed __aligned(4);

/* Header of an ROI array: element count plus a reserved word. */
struct dla_roi_array_desc {
	uint32_t array_length;

	uint32_t array_reserved;
} __packed __aligned(4);

/* One region of interest, given by its four edges. */
struct dla_roi_desc {
	uint32_t left;

	uint32_t top;

	uint32_t right;

	uint32_t bottom;
} __packed __aligned(4);

/**
 * @ingroup BDMA
 * @name Maximum BDMA transfers
 * @brief BDMA supports multiple transfers in operation. This indicates
 *        maximum number of transfers possible in one operation.
 * @{
 */
#define NUM_MAX_BDMA_OPS	20
/** @} */

/* Parameters of a single BDMA transfer. */
struct dla_bdma_transfer_desc {
	int16_t source_address;
	int16_t destination_address;

	uint32_t line_size;

	uint32_t line_repeat;

	uint32_t source_line;

	uint32_t destination_line;

	uint32_t surface_repeat;

	uint32_t source_surface;

	uint32_t destination_surface;
} __packed __aligned(4);

/*
 * BDMA surface descriptor: source/destination types and a fixed-size table
 * of up to NUM_MAX_BDMA_OPS transfers; num_transfers gives the count.
 * NOTE(review): source_type/destination_type presumably take DLA_MEM_*
 * values -- confirm.
 */
struct dla_bdma_surface_desc {
	uint8_t source_type;
	uint8_t destination_type;
	uint16_t num_transfers;

	struct dla_bdma_transfer_desc transfers[NUM_MAX_BDMA_OPS];
} __packed __aligned(4);

/* BDMA operation descriptor. */
struct dla_bdma_op_desc {
	uint16_t num_transfers;
	uint16_t reserved0;
} __packed __aligned(4);

/* BDMA statistics counters. */
struct dla_bdma_stat_desc {
	uint32_t read_stall;
	uint32_t write_stall;
	uint32_t runtime;
} __packed __aligned(4);

/**
 * @ingroup Convolution
 * @name Convolution mode
 * @brief Convolution modes supported by DLA
 * @{
 */
#define CONV_MODE_DIRECT	0
#define CONV_MODE_WINOGRAD	1
/** @} */

/**
 * @ingroup Processors
 * @name Precision BPE mapping
 * @brief Precision formats and Bit Per Elements mapping
 *
 * NOTE(review): the values (INT8 -> 1, INT16/FP16 -> 2) look like bytes
 * per element rather than bits -- confirm and reword if so.
 * @{
 */
#define BPE_PRECISION_INT8		1
#define BPE_PRECISION_INT16		2
#define BPE_PRECISION_FP16		2
/** @} */

/**
 * @ingroup Processors
 * @name Precision types
 * @brief Precision formats supported by DLA engine
 * @{
 */
#define PRECISION_INT8		0
#define PRECISION_INT16		1
#define PRECISION_FP16		2
/** @} */

/**
 * @ingroup Processors
 * @name Data formats
 * @brief Data formats supported by DLA engine
 * @{
 */
#define FORMAT_T_R8			0
#define FORMAT_T_R10			1
#define FORMAT_T_R12			2
#define FORMAT_T_R16			3
#define FORMAT_T_R16_I			4
#define FORMAT_T_R16_F			5
#define FORMAT_T_A16B16G16R16		6
#define FORMAT_T_X16B16G16R16		7
#define FORMAT_T_A16B16G16R16_F		8
#define FORMAT_T_A16Y16U16V16		9
#define FORMAT_T_V16U16Y16A16		10
#define FORMAT_T_A16Y16U16V16_F		11
#define FORMAT_T_A8B8G8R8		12
#define FORMAT_T_A8R8G8B8		13
#define FORMAT_T_B8G8R8A8		14
#define FORMAT_T_R8G8B8A8		15
#define FORMAT_T_X8B8G8R8		16
#define FORMAT_T_X8R8G8B8		17
#define FORMAT_T_B8G8R8X8		18
#define FORMAT_T_R8G8B8X8		19
#define FORMAT_T_A2B10G10R10		20
#define FORMAT_T_A2R10G10B10		21
#define FORMAT_T_B10G10R10A2		22
#define FORMAT_T_R10G10B10A2		23
#define FORMAT_T_A2Y10U10V10		24
#define FORMAT_T_V10U10Y10A2		25
#define FORMAT_T_A8Y8U8V8			26
#define FORMAT_T_V8U8Y8A8			27
#define FORMAT_T_Y8___U8V8_N444		28
#define FORMAT_T_Y8___V8U8_N444		29
#define FORMAT_T_Y10___U10V10_N444	30
#define FORMAT_T_Y10___V10U10_N444	31
#define FORMAT_T_Y12___U12V12_N444	32
#define FORMAT_T_Y12___V12U12_N444	33
#define FORMAT_T_Y16___U16V16_N444	34
#define FORMAT_T_Y16___V16U16_N444	35
#define FORMAT_FEATURE			36
/** @} */

/**
 * @ingroup Convolution
 * @name Pixel mapping
 * @brief Pixel mapping formats supported for image input in Convolution
 * @{
 */
#define MAP_PITCH_LINEAR		0
/** @} */

/**
 * @ingroup Convolution
 * @name Weight formats
 * @brief Weight data formats supported in Convolution
 * @{
 */
#define WEIGHT_FORMAT_UNCOMPRESSED	0
#define WEIGHT_FORMAT_COMPRESSED	1
/** @} */

/**
 * @ingroup Convolution
 * @name Mean data format
 * @brief Mean data formats supported in Convolution
 * @{
 */
#define MEAN_FORMAT_DISABLE     0
#define MEAN_FORMAT_ENABLE      1
/** @} */

/*
 * Converter parameters (scale, truncate, enable flag, offset). Used by the
 * in_cvt/out_cvt members of the convolution and CDP operation descriptors.
 */
struct dla_cvt_param {
	int16_t  scale;
	uint8_t  truncate;
	uint8_t  enable;

	int32_t  offset;
} __packed __aligned(4);

/*
 * Description of one data cube: where it lives (memory type, address-list
 * entry and offset), its size and dimensions, and its strides. Every
 * surface descriptor in this file is built from these.
 */
struct dla_data_cube {
	uint16_t type; /* dla_mem_type */
	int16_t address; /* offset to the actual IOVA in task.address_list */

	uint32_t offset; /* offset within address */
	uint32_t size;

	/* cube dimensions */
	uint16_t width;
	uint16_t height;

	uint16_t channel;
	uint16_t reserved0;

	/* stride information */
	uint32_t line_stride;
	uint32_t surf_stride;

	/* For Rubik only */
	uint32_t plane_stride;
} __packed __aligned(4);

/*
 * Values for dla_conv_op_desc.pixel_override.
 * NOTE(review): association inferred from the names -- confirm.
 */
#define PIXEL_OVERRIDE_UINT 0
#define PIXEL_OVERRIDE_INT  1

/* Data cubes consumed and produced by one convolution operation. */
struct dla_conv_surface_desc {
	/* Data cube */
	struct dla_data_cube weight_data;
	struct dla_data_cube wmb_data;
	struct dla_data_cube wgs_data;
	struct dla_data_cube src_data;
	struct dla_data_cube dst_data;

	/**
	 * u_addr = input_data.source_addr + offset_u
	 * this field should be set when YUV is not in interleaved format
	 *
	 */
	int64_t offset_u;

	/* line stride for 2nd plane, must be 32bytes aligned */
	uint32_t in_line_uv_stride;
} __packed __aligned(4);

/*
 * Convolution operation descriptor, grouped into performance, algorithm
 * and precision parameters. The "dla_xxx" comments next to a field name
 * the constant group its value is taken from.
 */
struct dla_conv_op_desc {
	/* Performance parameters */

	/* dla_conv_mode */
	uint8_t conv_mode;
	uint8_t data_reuse;
	uint8_t weight_reuse;
	uint8_t skip_data_rls;

	uint8_t skip_weight_rls;
	uint8_t reserved0;
	uint16_t entry_per_slice;

	/* dla_data_format */
	uint8_t data_format;
	/* dla_pixel_mapping */
	uint8_t pixel_mapping;
	/* number of free slices before fetch */
	uint16_t fetch_grain;

	uint8_t reserved_b[8];

	/* batch_num */
	uint8_t batch;
	/* dla_weight_format */
	uint8_t weight_format;
	uint8_t data_bank;
	uint8_t weight_bank;

	/* the offset in bytes of each data cube in a batch */
	uint32_t batch_stride;

	uint8_t post_extension;
	uint8_t pixel_override;
	/* number of slices need to be released */
	uint16_t release;

	 /* The input cube dimension for CSC */
	uint16_t input_width_csc;
	uint16_t input_height_csc;

	uint16_t input_channel_csc;
	uint16_t kernel_width_csc;

	uint16_t kernel_height_csc;
	uint16_t kernel_channel_csc;

	/* The input cube dimension for CMAC */
	uint16_t input_width_cmac;
	uint16_t input_height_cmac;

	/* actual size in bytes */
	uint32_t bytes_per_kernel;

	/* Algorithm parameters */

	int16_t mean_ry; /* mean value for red in RGB or Y in YUV */
	int16_t mean_gu; /* mean value for green in RGB or U in YUV */

	int16_t mean_bv; /* mean value for blue in RGB or V in YUV */
	int16_t mean_ax;

	uint8_t mean_format; /* dla_mean_format */
	uint8_t conv_stride_x;
	uint8_t conv_stride_y;
	uint8_t pad_x_left;

	uint8_t pad_x_right;
	uint8_t pad_y_top;
	uint8_t pad_y_bottom;
	uint8_t dilation_x;

	uint8_t dilation_y;
	uint8_t reserved2[2];

	/* Precision parameters */
	uint8_t pra_truncate;

	uint8_t in_precision;
	/* The output precision from CONV, it's the MAC processing precision */
	uint8_t out_precision;
	int16_t pad_val;

	/* input converter parameters */
	struct dla_cvt_param in_cvt;
	/* output converter parameters, support truncate only */
	struct dla_cvt_param out_cvt;

} __packed __aligned(4);

/* Convolution statistics counters. */
struct dla_conv_stat_desc {
	uint32_t data_read_stall;
	uint32_t weight_read_stall;
	uint32_t data_read_latency;
	uint32_t weight_read_latency;
	uint32_t saturation_count;
	uint32_t nan_data_num;
	uint32_t nan_weight_num;
	uint32_t inf_data_num;
	uint32_t inf_weight_num;
	uint32_t runtime;
} __packed __aligned(4);

/**
 * @ingroup SDP
 * @name Activation functions
 * @brief Activation functions supported in SDP
 * @{
 */
#define ACTIVATION_NONE		0
#define ACTIVATION_RELU		1
#define ACTIVATION_LUT		2
#define ACTIVATION_PRELU	3
/** @} */

/**
 * @ingroup LUT
 * @name LUT size
 * @brief LUT sizes (as log2 of the entry count) for the linear-exp and
 *        linear-only LUT
 * @{
 */
#define LUT_LINEAR_EXP_TABLE_ENTRY_LOG2		6
#define LUT_LINEAR_ONLY_TABLE_ENTRY_LOG2	8
/** @} */

/**
 * @ingroup LUT
 * @name LUT types
 * @brief DLA supports two types of LUT, linear and exponential
 * @{
 */
#define LUT_LINEAR_EXP_TABLE		0
#define LUT_LINEAR_ONLY_TABLE		1
/** @} */

/**
 * @ingroup LUT
 * @name LUT methods
 * @brief DLA supports two LUT methods, exponential and linear
 * @{
 */
#define LUT_METHOD_EXPONENTIAL		0
#define LUT_METHOD_LINEAR		1
/** @} */

/**
 * @ingroup LUT
 * @name LUT
 * @brief Identifiers for the linear-exp and linear-only LUT.
 *        NOTE(review): "PRI" presumably means priority, i.e. which table
 *        wins when both apply -- confirm.
 * @{
 */
#define LUT_PRI_LINEAR_EXP		0
#define LUT_PRI_LINEAR_ONLY		1
/** @} */

/*
 * Per-table offset. Being a union, exp_offset and frac_bits share storage;
 * which member is meaningful depends on the table it is used with.
 */
union dla_lut_offset {
	/**
	 * Number should be subtracted on log domain before look up
	 * exponential table it has the same definition as hardware
	 * thus input scaling should also take into account when
	 * set this field.
	 */
	int8_t exp_offset;
	/**
	 * Number of bits should be right shift before looking
	 * up linear table
	 */
	int8_t frac_bits;
	uint16_t reserved0;
};

/**
 * This struct is used to represent floating point values by INT
 * suppose we have a float point number fp_x, it will be represented
 * as:
 *
 * fp_x = scale_int_x>>(shifter_x)
 *
 * This is very useful for INT pipeline;
 */
struct dla_float_data {
	int16_t scale;
	int8_t shifter;
	uint8_t reserved0;
} __packed __aligned(4);

/**
 * For INT pipeline, we use the struct above to represent a floating number;
 * For FP16 pipeline, we should store the FP16 encoded value into a uint16_t
 * container
 */
union dla_slope {
	struct dla_float_data data_i;

	uint16_t data_f;
};

/*
 * NOTE(review): this definition appears to be damaged in this copy of the
 * file. The original comment breaks off after
 *     "value of expression ((1<"
 * and the text resumes at "1", so everything in between seems to have been
 * lost (it looks like a span between '<' and '>' was stripped).
 *
 * Consequences visible in this file:
 *  - the members below are of type struct dla_sdp_op and sit under an
 *    "Algorithm parameters" heading, which suggests they belong to a
 *    different (SDP operation) descriptor rather than to a LUT parameter
 *    struct;
 *  - struct dla_sdp_op, struct dla_sdp_surface_desc and
 *    struct dla_sdp_op_desc are referenced in this file but no definition
 *    of them is visible.
 *
 * Restore this region from a pristine copy of the header; do not rely on
 * the layout as written here.
 */
struct dla_lut_param {
	/* Algorithm parameters */
	struct dla_sdp_op x1_op;
	struct dla_sdp_op x2_op;
	struct dla_sdp_op y_op;
} __packed __aligned(4);

/* SDP statistics counters. */
struct dla_sdp_stat_desc {
	uint32_t nan_input_num;
	uint32_t inf_input_num;
	uint32_t nan_output_num;
	uint32_t wdma_write_stall;
	uint32_t lut_underflow;
	uint32_t lut_overflow;
	uint32_t lut_hybrid;
	uint32_t lut_le_hit;
	uint32_t lut_lo_hit;
	uint32_t saturation_count;
	uint32_t runtime;
} __packed __aligned(4);

/* Pooling modes (see dla_pdp_op_desc.pool_mode). */
#define POOL_MODE_AVG		0
#define POOL_MODE_MAX		1
#define POOL_MODE_MIN		2

/*
 * Pooling window sizes: POOL_SIZE_<n> is encoded as n - 1.
 * NOTE(review): presumably the encoding used by pool_width/pool_height in
 * dla_pdp_op_desc -- confirm.
 */
#define POOL_SIZE_1		0
#define POOL_SIZE_2		1
#define POOL_SIZE_3		2
#define POOL_SIZE_4		3
#define POOL_SIZE_5		4
#define POOL_SIZE_6		5
#define POOL_SIZE_7		6
#define POOL_SIZE_8		7

/* Number of entries in dla_pdp_op_desc.padding_value. */
#define PDP_PAD_VAL_NUM	7

/* Source and destination data cubes of one PDP operation. */
struct dla_pdp_surface_desc {
	/* Data cube */
	struct dla_data_cube src_data;

	struct dla_data_cube dst_data;
} __packed __aligned(4);

/*
 * PDP operation descriptor, grouped into performance, algorithm and
 * precision parameters.
 */
struct dla_pdp_op_desc {
	/* Performance parameters */
	uint16_t  partial_in_width_first;
	uint16_t  partial_in_width_mid;

	uint16_t  partial_in_width_last;
	uint16_t  partial_width_first;

	uint16_t  partial_width_mid;
	uint16_t  partial_width_last;

	uint8_t   split_num;

	/* Algorithm parameters */
	uint8_t  pool_mode; /* dla_pool_mode */
	uint8_t  pool_width; /* dla_pool_width */
	uint8_t  pool_height; /* dla_pool_height */

	uint8_t  stride_x;
	uint8_t  stride_y;

	/**
	 * The left/right padding size,
	 * pad_right might be less than pad_left
	 */
	uint8_t  pad_left;
	uint8_t  pad_right;

	/* The top/bottom padding size */
	uint8_t  pad_top;
	uint8_t  pad_bottom;

	/* Precision parameters */
	uint8_t  precision; /* dla_precision */
	uint8_t  reserved0;
	/**
	 * if input has non-zero "offset", this value should be set
	 * There'll be 7 different padding values, the relationship between
	 * those versions are:
	 * padding_value[0] = -offset*scaling;
	 * padding_value[1] = 2*padding_value[0]
	 * padding_value[2] = 3*padding_value[0]
	 * ...
	 * The purpose is to avoid ucode implement FP16
	 * multiplier(for FP16 mode)
	 */
	int32_t  padding_value[PDP_PAD_VAL_NUM];
} __packed __aligned(4);

/* PDP statistics counters. */
struct dla_pdp_stat_desc {
	uint32_t inf_input_num;
	uint32_t nan_input_num;
	uint32_t nan_output_num;
	uint32_t write_stall;
	uint32_t runtime;
} __packed __aligned(4);

/* Source and destination data cubes of one CDP operation. */
struct dla_cdp_surface_desc {
	/* Data cube */
	struct dla_data_cube src_data;

	struct dla_data_cube dst_data;
} __packed __aligned(4);

/* CDP operation descriptor. */
struct dla_cdp_op_desc {
	/* Precision parameters */

	/* dla_precision */
	uint8_t  in_precision;
	uint8_t  out_precision;
	int16_t  lut_index;

	struct dla_cvt_param in_cvt;
	struct dla_cvt_param out_cvt;

	/* Performance parameters */

	/* Algorithm parameters */
	uint8_t  local_size;
	uint8_t  bypass_sqsum;
	uint8_t  bypass_out_mul;
	uint8_t  reserved0;
} __packed __aligned(4);

/* CDP statistics counters. */
struct dla_cdp_stat_desc {
	uint32_t nan_input_num;
	uint32_t inf_input_num;
	uint32_t nan_output_num;
	uint32_t write_stall;
	uint32_t lut_uflow;
	uint32_t lut_oflow;
	uint32_t lut_hybrid;
	uint32_t lut_le_hit;
	uint32_t lut_lo_hit;
	uint32_t saturation_count;
	uint32_t runtime;
} __packed __aligned(4);

/* Source and destination data cubes of one Rubik operation. */
struct dla_rubik_surface_desc {
	/* Data cube */
	struct dla_data_cube src_data;

	struct dla_data_cube dst_data;
} __packed __aligned(4);

/* rubik mode */
#define RUBIK_MODE_CONTRACT	0
#define RUBIK_MODE_SPLIT	1
#define RUBIK_MODE_MERGE	2

/*
 * Rubik operation descriptor.
 * NOTE(review): mode presumably takes a RUBIK_MODE_* value and precision a
 * PRECISION_* value -- confirm.
 */
struct dla_rubik_op_desc {
	/* Precision parameters */
	uint8_t mode;
	uint8_t precision;
	uint8_t stride_x;
	uint8_t stride_y;
} __packed __aligned(4);

/* Rubik statistics counters. */
struct dla_rubik_stat_desc {
	uint32_t read_stall;
	uint32_t write_stall;
	uint32_t runtime;
} __packed __aligned(4);

/*
 * Storage large enough for the surface descriptor of any processor type.
 * NOTE(review): struct dla_sdp_surface_desc is not defined in the visible
 * part of this file (see the note on struct dla_lut_param).
 */
union dla_surface_container {
	struct dla_bdma_surface_desc bdma_surface;
	struct dla_conv_surface_desc conv_surface;
	struct dla_sdp_surface_desc sdp_surface;
	struct dla_pdp_surface_desc pdp_surface;
	struct dla_cdp_surface_desc cdp_surface;
	struct dla_rubik_surface_desc rubik_surface;
};

/*
 * Storage large enough for the operation descriptor of any processor type.
 * NOTE(review): struct dla_sdp_op_desc is not defined in the visible part
 * of this file (see the note on struct dla_lut_param).
 */
union dla_operation_container {
	struct dla_bdma_op_desc bdma_op;
	struct dla_conv_op_desc conv_op;
	struct dla_sdp_op_desc sdp_op;
	struct dla_pdp_op_desc pdp_op;
	struct dla_cdp_op_desc cdp_op;
	struct dla_rubik_op_desc rubik_op;
};

/* Storage large enough for the statistics of any processor type. */
union dla_stat_container {
	struct dla_bdma_stat_desc bdma_stat;
	struct dla_conv_stat_desc conv_stat;
	struct dla_sdp_stat_desc sdp_stat;
	struct dla_pdp_stat_desc pdp_stat;
	struct dla_cdp_stat_desc cdp_stat;
	struct dla_rubik_stat_desc rubik_stat;
};

/**
 * status notifier structure
 *
 * @timestamp: 64-bit timestamp representing the time at which
 *	       the notifier was written
 * @status_engine: status word captured from HW engine
 * @subframe: NA
 * @status_task: status word as configured from an action list
 */
struct dla_task_status {
	uint64_t timestamp;

	uint32_t status_engine;

	uint16_t subframe;
	uint16_t status_task;
} __packed __aligned(4);

#endif