From c9e99bd603ea358eaa1c54505fd2d26faa3d9d4e Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 24 Jun 2024 22:11:28 +0800 Subject: [PATCH] split qnn ops into file --- ggml-qnn.cpp | 723 +--------------------------------- ggml-qnn/backend-ops.cpp | 675 +++++++++++++++++++++++++++++++ ggml-qnn/backend-ops.hpp | 17 + ggml-qnn/backend.hpp | 5 - ggml-qnn/qnn.hpp | 13 +- ggml-qnn/tensor.hpp | 1 + ggml-qnn/utils.cpp | 126 ++++++ ggml-qnn/utils.hpp | 172 +++----- tests/ggml-qnn/CMakeLists.txt | 2 + 9 files changed, 889 insertions(+), 845 deletions(-) create mode 100644 ggml-qnn/backend-ops.cpp create mode 100644 ggml-qnn/backend-ops.hpp create mode 100644 ggml-qnn/utils.cpp diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index ffa43718410ab..750d5ff91c3d3 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1,22 +1,14 @@ #include #include -#include #include #include -#include -#include -#include #include #include -#include -#include #include -#include #include #include #include -#include #include #include #include @@ -28,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -40,8 +31,9 @@ #include "ggml-qnn/logger.hpp" #include "ggml-qnn/utils.hpp" -#include "ggml-qnn/backend.hpp" #include "ggml-qnn/tensor.hpp" +#include "ggml-qnn/backend.hpp" +#include "ggml-qnn/backend-ops.hpp" // ================================================================================================= // @@ -63,11 +55,6 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor); #define QNN_BACKEND_NAME "qnn" -typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, - ggml_tensor * dst); - static struct qnn::qcom_socinfo g_qnn_soc_info_table[] = { /* Qualcomm SnapDragon 8 Gen 1 */ [qnn::SM8450] = { @@ -183,78 +170,6 @@ struct ggml_backend_qnn_buffer_type_context { // QNN backend internal helper functions // // ================================================================================================= -static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { - QNN_LOG_WARN("invalid params\n"); - return false; - } - - qnn::qnn_instance *instance = nullptr; - Qnn_Tensor_t *tensor_0 = nullptr; - Qnn_Tensor_t *tensor_1 = nullptr; - Qnn_Tensor_t *tensor_2 = nullptr; - tensor_0 = (Qnn_Tensor_t *) src0->extra; - tensor_1 = (Qnn_Tensor_t *) src1->extra; - tensor_2 = (Qnn_Tensor_t *) dst->extra; - instance = ctx->instance; - if ((nullptr == instance) || (nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { - QNN_LOG_WARN("invalid params\n"); - return false; - } - - return true; -} - -#ifndef NDEBUG -#define CHECK_PARAMS(ctx, src0, src1, dst) \ - do { \ - if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ - return; \ - } \ - } while (0) - -#else -#define CHECK_PARAMS(ctx, src0, src1, dst) -#endif - -#if ENABLE_QNNBACKEND_PERF -class qnn_perf { -public: - qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; - qnn_perf() = delete; - qnn_perf(const qnn_perf & ) = delete; - qnn_perf & operator= (const qnn_perf & ) = delete; - - void start() { - _begin_time = ggml_time_us(); - } - - void info() { - _end_time = ggml_time_us(); - _duration = (_end_time - _begin_time); - QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); - } - -private: - int64_t _begin_time = 0LL; - int64_t _end_time = 0LL; - int64_t 
_duration = 0LL; - std::string _perf_name; -}; -#else -class qnn_perf { -public: - qnn_perf(const std::string & perf_name) {} - qnn_perf() = delete; - qnn_perf(const qnn_perf & ) = delete; - qnn_perf & operator= (const qnn_perf & ) = delete; - - void start() {} - void info() {} -}; -#endif - static size_t memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { if (!dst || !src || !dst_size || !copy_size) return 0; @@ -354,100 +269,10 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor) { // implementation of QNN backend for GGML // // ================================================================================================= -static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst); -static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); - -static ggml_qnn_func_t s_op_table[GGML_OP_COUNT] = { - nullptr, // GGML_OP_NONE - nullptr, // GGML_OP_DUP - ggml_qnn_add, // GGML_OP_ADD - nullptr, // GGML_OP_ADD1 - nullptr, // GGML_OP_ACC - nullptr, // GGML_OP_SUB - nullptr, // GGML_OP_MUL - nullptr, // GGML_OP_DIV - nullptr, // GGML_OP_SQR - nullptr, // GGML_OP_SQRT - nullptr, // GGML_OP_LOG - nullptr, // GGML_OP_SUM - nullptr, // GGML_OP_SUM_ROWS - nullptr, // GGML_OP_MEAN - nullptr, // GGML_OP_ARGMAX - nullptr, // GGML_OP_REPEAT - nullptr, // GGML_OP_REPEAT_BACK - nullptr, // GGML_OP_CONCAT - nullptr, // GGML_OP_SILU_BACK - nullptr, // GGML_OP_NORM - nullptr, // GGML_OP_RMS_NORM - nullptr, // GGML_OP_RMS_NORM_BACK - nullptr, // GGML_OP_GROUP_NORM - - ggml_qnn_mul_mat, // GGML_OP_MUL_MAT - nullptr, // GGML_OP_MUL_MAT_ID - nullptr, // GGML_OP_OUT_PROD - - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - nullptr, // GGML_OP_VIEW - nullptr, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - nullptr, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU - - nullptr, // GGML_OP_FLASH_ATTN_EXT - nullptr, // GGML_OP_FLASH_ATTN_BACK - nullptr, // GGML_OP_SSM_CONV - nullptr, // GGML_OP_SSM_SCAN - nullptr, // GGML_OP_WIN_PART - nullptr, // GGML_OP_WIN_UNPART - nullptr, // GGML_OP_GET_REL_POS - nullptr, // GGML_OP_ADD_REL_POS - - nullptr, // GGML_OP_UNARY - - nullptr, // GGML_OP_MAP_UNARY - nullptr, // GGML_OP_MAP_BINARY - - nullptr, // GGML_OP_MAP_CUSTOM1_F32 - nullptr, // GGML_OP_MAP_CUSTOM2_F32 - nullptr, // GGML_OP_MAP_CUSTOM3_F32 - - nullptr, // GGML_OP_MAP_CUSTOM1 - nullptr, // GGML_OP_MAP_CUSTOM2 - nullptr, // GGML_OP_MAP_CUSTOM3 - - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS - nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK -}; - static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, const struct ggml_tensor * tensor, bool b_dump_tensor_info) { - if (ggml_is_empty(tensor) || !s_op_table[tensor->op]) { + if (ggml_is_empty(tensor) || 
!qnn::ggml_qnn_op_array()[tensor->op]) { return false; } @@ -496,550 +321,10 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, return true; } - -//TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat -// keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC -static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn::qnn_instance *instance = nullptr; - std::string graph_name = "ggml_op_qnn_add"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; - - CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - auto qnn_raw_interface = ctx->raw_interface; - - qnn_perf perf("ggml_qnn_add"); - perf.start(); - - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != - instance->_qnn_graph_map.end()) { - graph_initialized = true; - auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - } - - if (!graph_initialized) { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + - "_" + src0->name + "_" + src1->name; - QNN_LOG_INFO("graph name %s", graph_name.c_str()); - if (ctx->device == QNN_BACKEND_NPU) { - QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 8; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; - - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; - - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; // 1 / 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; - - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; - - const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, - &graph_dlbc_config, - &graph_vtcm_config, - &graph_opt_config, - NULL}; - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, - &graph_handle); - } else { - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); - } - - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " - "error = %d\n", - graph_name.c_str(), error); - goto failure; - } else { - QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); - } - - qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, 
ctx); - if (!tensor_input0.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); - if (!tensor_input1.is_valid()) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); - if (!tensor_output.is_valid()) { - goto failure; - } - - Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, - .v1 = {"ggml_op_add", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_ELEMENT_WISE_ADD, - 0, qnn_params, - 2, tensor_inputs, - 1,tensor_outputs} - }; - error = qnn_raw_interface.graphAddNode(graph_handle, op_config); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphFinalize(graph_handle, - nullptr, nullptr); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - - auto graph_item = std::make_tuple(graph_handle, - tensor_input0.get_qnn_tensor(), - tensor_input1.get_qnn_tensor(), - tensor_output.get_qnn_tensor()); - instance->_qnn_graph_map[map_entry] = graph_item; - } else { - auto & graph_item = instance->_qnn_graph_map[map_entry]; - qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - - Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs,2, - tensor_outputs,1, - nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - } - -failure: - if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), - dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - } - - perf.info(); -} - -/* - * ggml_qnn_mul_mat was re-added as a standalone function because - * the following comments came from https://github.com/ggerganov/llama.cpp/pull/1632 - * MUL_MAT take most of the compute time (about 95%). 
- * So to speed up llama, we have to focus on MUL_MAT. - * - * We have three kinds of MUL_MAT to compute: - * mul_mat_f32: both src0 and src1 are F32. - * mul_mat_f16_f32: src0 is F16 and src1 is F32. - * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. - */ -static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn::qnn_instance *instance = nullptr; - std::string graph_name = "ggml_op_qnn_mul_mat"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_MUL_MAT; - - CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - auto qnn_raw_interface = ctx->raw_interface; - - qnn_perf perf("ggml_qnn_mul_mat"); - perf.start(); - - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != - instance->_qnn_graph_map.end()) { - graph_initialized = true; - auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - } - - //TODO: for scenarios of quantized data in src0 - // pass-1: dequantize src0 to FP32 - // pass-2: dq-src0 * src1 - // the performance gains is worth although there is performance loss in pass-1 - - if (!graph_initialized) { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + - "_" + src0->name + "_" + src1->name; - QNN_LOG_INFO("graph name %s", graph_name.c_str()); - if (ctx->device == QNN_BACKEND_NPU) { - QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 8; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; - - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; - - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; //1 / 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; - - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; - - const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, - &graph_dlbc_config, - &graph_vtcm_config, - &graph_opt_config, - NULL}; - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, - &graph_handle); - } else { - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " - "error = %d\n", - graph_name.c_str(), error); - goto failure; - } - - 
qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); - if (!tensor_input0.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); - if (!tensor_input1.is_valid()) { - goto failure; - } - qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); - if (!tensor_output.is_valid()) { - goto failure; - } - - Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, - .v1 = {"ggml_op_mul_mat", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, - 0, qnn_params, - 2, tensor_inputs, - 1, tensor_outputs} - }; - error = qnn_raw_interface.graphAddNode(graph_handle, op_config); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphFinalize(graph_handle, - nullptr, nullptr); - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - - auto graph_item = std::make_tuple(graph_handle, - tensor_input0.get_qnn_tensor(), - tensor_input1.get_qnn_tensor(), - tensor_output.get_qnn_tensor()); - instance->_qnn_graph_map[map_entry] = graph_item; - } else { - auto & graph_item= instance->_qnn_graph_map[map_entry]; - qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); - qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); - qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); - - Qnn_Tensor_t tensor_inputs[] = {*tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor()}; - Qnn_Tensor_t tensor_outputs[] = {*tensor_output.get_qnn_tensor()}; - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); - if (ctx->device == QNN_BACKEND_NPU) { - if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { - QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); - } - } - if (QNN_SUCCESS != error) { - QNN_LOG_INFO("error = %d\n", error); - goto failure; - } - } - -failure: - if (QNN_SUCCESS != error) { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], - dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - } - - perf.info(); -} - -static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_acc(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_div(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_gelu(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_silu(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_gelu_quick(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_tanh(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_relu(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_hardswish(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_norm(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_concat(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_upscale(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_pad(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void 
ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_dup(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - ggml_qnn_cpy(ctx, src0, dst, nullptr); - (void) src1; -} - -static void ggml_qnn_mul_mat_id(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_scale(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { -} - -static void ggml_qnn_soft_max(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_rope(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); -} - -static void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { -} - -static void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); -} - -static void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); -} - -static void ggml_qnn_nop(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - (void)src0; - (void)src1; - (void)dst; -} - bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_compute_params * params, struct ggml_tensor * tensor) { - ggml_qnn_func_t func = s_op_table[tensor->op]; + auto func = qnn::ggml_qnn_op_array()[tensor->op]; if (!func) { QNN_LOG_WARN("unsupported op %d", tensor->op); return false; diff --git a/ggml-qnn/backend-ops.cpp b/ggml-qnn/backend-ops.cpp new file mode 100644 index 0000000000000..a9c94a6df3102 --- /dev/null +++ b/ggml-qnn/backend-ops.cpp @@ -0,0 +1,675 @@ + +#include "backend-ops.hpp" + +#include "utils.hpp" +#include "logger.hpp" +#include "tensor.hpp" + + +static bool qnn_is_valid_params(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { + if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("invalid params\n"); + return false; + } + + qnn::qnn_instance* instance = nullptr; + Qnn_Tensor_t* tensor_0 = nullptr; + Qnn_Tensor_t* tensor_1 = nullptr; + Qnn_Tensor_t* tensor_2 = nullptr; + tensor_0 = (Qnn_Tensor_t*)src0->extra; + tensor_1 = (Qnn_Tensor_t*)src1->extra; + tensor_2 = (Qnn_Tensor_t*)dst->extra; + instance = ctx->instance; + if ((nullptr == instance) || (nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("invalid params\n"); + return false; + } + + return 
true; +} + +#ifndef NDEBUG +#define CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } while (0) + +#else +#define CHECK_PARAMS(ctx, src0, src1, dst) +#endif + +//TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat +// keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC +static void ggml_qnn_add(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn::qnn_instance* instance = nullptr; + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; + + CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + auto qnn_raw_interface = ctx->raw_interface; + + qnn::qnn_perf perf("ggml_qnn_add"); + perf.start(); + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != + instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto& graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + if (!graph_initialized) { + graph_name = graph_name + "_" + std::to_string(ctx->threads) + + "_" + src0->name + "_" + src1->name; + QNN_LOG_INFO("graph name %s", graph_name.c_str()); + if (ctx->device == QNN_BACKEND_NPU) { + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; // 1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + const QnnGraph_Config_t* p_graphconfig[] = { &graph_hvx_config, + &graph_dlbc_config, + &graph_vtcm_config, + &graph_opt_config, + NULL }; + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, + &graph_handle); + } + else { + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); + } + + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " + "error = %d\n", + graph_name.c_str(), error); + goto failure; + } + else { + QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", 
graph_name.c_str()); + } + + qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); + if (!tensor_input0.is_valid()) { + goto failure; + } + qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); + if (!tensor_input1.is_valid()) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); + if (!tensor_output.is_valid()) { + goto failure; + } + + Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; + Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t)1, + .v1 = {"ggml_op_add", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_ELEMENT_WISE_ADD, + 0, qnn_params, + 2, tensor_inputs, + 1,tensor_outputs} + }; + error = qnn_raw_interface.graphAddNode(graph_handle, op_config); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + error = qnn_raw_interface.graphFinalize(graph_handle, + nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + if (ctx->device == QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); + } + } + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + + auto graph_item = std::make_tuple(graph_handle, + tensor_input0.get_qnn_tensor(), + tensor_input1.get_qnn_tensor(), + tensor_output.get_qnn_tensor()); + instance->_qnn_graph_map[map_entry] = graph_item; + } + else { + auto& graph_item = instance->_qnn_graph_map[map_entry]; + qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); + + Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; + Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + if (ctx->device == QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); + } + } + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + } + +failure: + if (QNN_SUCCESS != error) { + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), + dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + } + + perf.info(); +} + +/* + * ggml_qnn_mul_mat was re-added as a standalone function because + * the following comments came from https://github.com/ggerganov/llama.cpp/pull/1632 + * MUL_MAT take most of the compute time (about 95%). + * So to speed up llama, we have to focus on MUL_MAT. + * + * We have three kinds of MUL_MAT to compute: + * mul_mat_f32: both src0 and src1 are F32. + * mul_mat_f16_f32: src0 is F16 and src1 is F32. + * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. + */ +static void ggml_qnn_mul_mat(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn::qnn_instance* instance = nullptr; + std::string graph_name = "ggml_op_qnn_mul_mat"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_MUL_MAT; + + CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + auto qnn_raw_interface = ctx->raw_interface; + + qnn::qnn_perf perf("ggml_qnn_mul_mat"); + perf.start(); + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != + instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto& graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + //TODO: for scenarios of quantized data in src0 + // pass-1: dequantize src0 to FP32 + // pass-2: dq-src0 * src1 + // the performance gains is worth although there is performance loss in pass-1 + + if (!graph_initialized) { + graph_name = graph_name + "_" + std::to_string(ctx->threads) + + "_" + src0->name + "_" + src1->name; + QNN_LOG_INFO("graph name %s", graph_name.c_str()); + if (ctx->device == QNN_BACKEND_NPU) { + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = 
QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; //1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + const QnnGraph_Config_t* p_graphconfig[] = { &graph_hvx_config, + &graph_dlbc_config, + &graph_vtcm_config, + &graph_opt_config, + NULL }; + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, + &graph_handle); + } + else { + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); + } + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " + "error = %d\n", + graph_name.c_str(), error); + goto failure; + } + + qnn::ggml_qnn_tensor_input tensor_input0(src0, graph_handle, ctx); + if (!tensor_input0.is_valid()) { + goto failure; + } + qnn::ggml_qnn_tensor_input tensor_input1(src1, graph_handle, ctx); + if (!tensor_input1.is_valid()) { + goto failure; + } + qnn::ggml_qnn_tensor_output tensor_output(dst, graph_handle, ctx); + if (!tensor_output.is_valid()) { + goto failure; + } + + Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; + Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t)1, + .v1 = {"ggml_op_mul_mat", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, + 0, qnn_params, + 2, tensor_inputs, + 1, tensor_outputs} + }; + error = qnn_raw_interface.graphAddNode(graph_handle, op_config); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + error = qnn_raw_interface.graphFinalize(graph_handle, + nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + if (ctx->device == QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. 
Caused QNN graph execute error\n"); + } + } + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + + auto graph_item = std::make_tuple(graph_handle, + tensor_input0.get_qnn_tensor(), + tensor_input1.get_qnn_tensor(), + tensor_output.get_qnn_tensor()); + instance->_qnn_graph_map[map_entry] = graph_item; + } + else { + auto& graph_item = instance->_qnn_graph_map[map_entry]; + qnn::ggml_qnn_tensor_input tensor_input0(src0, std::get<1>(graph_item), ctx); + qnn::ggml_qnn_tensor_input tensor_input1(src1, std::get<2>(graph_item), ctx); + qnn::ggml_qnn_tensor_output tensor_output(dst, std::get<3>(graph_item), ctx); + + Qnn_Tensor_t tensor_inputs[] = { *tensor_input0.get_qnn_tensor(), *tensor_input1.get_qnn_tensor() }; + Qnn_Tensor_t tensor_outputs[] = { *tensor_output.get_qnn_tensor() }; + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + if (ctx->device == QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); + } + } + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + goto failure; + } + } + +failure: + if (QNN_SUCCESS != error) { + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], + dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); + } + + perf.info(); +} + +static void ggml_qnn_repeat(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_get_rows(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_acc(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_div(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_gelu(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_silu(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_gelu_quick(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_tanh(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_relu(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_hardswish(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* 
dst) { +} + +static void ggml_qnn_leaky_relu(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_sqr(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_norm(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_group_norm(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_concat(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_upscale(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_pad(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_rms_norm(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_cpy(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_dup(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { + ggml_qnn_cpy(ctx, src0, dst, nullptr); + (void)src1; +} + +static void ggml_qnn_mul_mat_id(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_scale(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_clamp(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { +} + +static void ggml_qnn_soft_max(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_rope(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); +} + +static void ggml_qnn_pool2d(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_im2col(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { +} + +static void ggml_qnn_sum_rows(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); +} + +static void ggml_qnn_argsort(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, const ggml_tensor* src1, + ggml_tensor* dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); +} + +static void ggml_qnn_nop(ggml_backend_qnn_context* ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst) { + (void)src0; + (void)src1; + (void)dst; +} + +qnn::ggml_qnn_op_array_t qnn::ggml_qnn_op_array() { + static constexpr const qnn::ggml_qnn_op_t kQnnOpsTable[GGML_OP_COUNT] = { + nullptr, // GGML_OP_NONE + nullptr, // GGML_OP_DUP + ggml_qnn_add, // GGML_OP_ADD + nullptr, // GGML_OP_ADD1 + nullptr, // GGML_OP_ACC + nullptr, // GGML_OP_SUB + nullptr, // GGML_OP_MUL + nullptr, // GGML_OP_DIV + nullptr, // GGML_OP_SQR + nullptr, // GGML_OP_SQRT + nullptr, 
// GGML_OP_LOG + nullptr, // GGML_OP_SUM + nullptr, // GGML_OP_SUM_ROWS + nullptr, // GGML_OP_MEAN + nullptr, // GGML_OP_ARGMAX + nullptr, // GGML_OP_REPEAT + nullptr, // GGML_OP_REPEAT_BACK + nullptr, // GGML_OP_CONCAT + nullptr, // GGML_OP_SILU_BACK + nullptr, // GGML_OP_NORM + nullptr, // GGML_OP_RMS_NORM + nullptr, // GGML_OP_RMS_NORM_BACK + nullptr, // GGML_OP_GROUP_NORM + + ggml_qnn_mul_mat, // GGML_OP_MUL_MAT + nullptr, // GGML_OP_MUL_MAT_ID + nullptr, // GGML_OP_OUT_PROD + + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + nullptr, // GGML_OP_RESHAPE + nullptr, // GGML_OP_VIEW + nullptr, // GGML_OP_PERMUTE + nullptr, // GGML_OP_TRANSPOSE + nullptr, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU + + nullptr, // GGML_OP_FLASH_ATTN_EXT + nullptr, // GGML_OP_FLASH_ATTN_BACK + nullptr, // GGML_OP_SSM_CONV + nullptr, // GGML_OP_SSM_SCAN + nullptr, // GGML_OP_WIN_PART + nullptr, // GGML_OP_WIN_UNPART + nullptr, // GGML_OP_GET_REL_POS + nullptr, // GGML_OP_ADD_REL_POS + + nullptr, // GGML_OP_UNARY + + nullptr, // GGML_OP_MAP_UNARY + nullptr, // GGML_OP_MAP_BINARY + + nullptr, // GGML_OP_MAP_CUSTOM1_F32 + nullptr, // GGML_OP_MAP_CUSTOM2_F32 + nullptr, // GGML_OP_MAP_CUSTOM3_F32 + + nullptr, // GGML_OP_MAP_CUSTOM1 + nullptr, // GGML_OP_MAP_CUSTOM2 + nullptr, // GGML_OP_MAP_CUSTOM3 + + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS + nullptr, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + }; + + return kQnnOpsTable; +} diff --git a/ggml-qnn/backend-ops.hpp b/ggml-qnn/backend-ops.hpp new file mode 100644 index 0000000000000..c3dd5de302289 --- /dev/null +++ b/ggml-qnn/backend-ops.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "ggml.h" +#include "backend.hpp" + +namespace qnn { + + typedef void (*ggml_qnn_op_t)(ggml_backend_qnn_context* ctx, + const ggml_tensor* src0, + const ggml_tensor* src1, + ggml_tensor* dst); + + typedef const ggml_qnn_op_t(&ggml_qnn_op_array_t)[GGML_OP_COUNT]; + + ggml_qnn_op_array_t ggml_qnn_op_array(); + +} diff --git a/ggml-qnn/backend.hpp b/ggml-qnn/backend.hpp index b5aacf57c1aa0..dc40090ee6114 100644 --- a/ggml-qnn/backend.hpp +++ b/ggml-qnn/backend.hpp @@ -1,11 +1,6 @@ #pragma once -#include "QnnTypes.h" -#include "QnnCommon.h" -#include "QnnContext.h" -#include "QnnBackend.h" - #include "ggml.h" #include "ggml-backend.h" diff --git a/ggml-qnn/qnn.hpp b/ggml-qnn/qnn.hpp index 212b6f8521745..6caefb75644f7 100644 --- a/ggml-qnn/qnn.hpp +++ b/ggml-qnn/qnn.hpp @@ -1,21 +1,27 @@ #pragma once +#include #include +#include +#include +#include // header file of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK // https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct #include "QnnTypes.h" #include "QnnCommon.h" +#include "QnnInterface.h" #include "QnnContext.h" #include "QnnBackend.h" #include "QnnGraph.h" #include "QnnProperty.h" #include "QnnTensor.h" +#include 
"System/QnnSystemInterface.h" #include "HTP/QnnHtpDevice.h" #include "HTP/QnnHtpGraph.h" +#include "qnn-types.hpp" #include "utils.hpp" -#include "logger.hpp" namespace qnn { @@ -864,9 +870,8 @@ namespace qnn { const qnn::qcom_socinfo& get_soc_info() { return _soc_info; } public: - std::map> - _qnn_graph_map; + std::map> _qnn_graph_map; private: int load_system() { diff --git a/ggml-qnn/tensor.hpp b/ggml-qnn/tensor.hpp index 687ebd8905ef4..de0d1dc2dbbef 100644 --- a/ggml-qnn/tensor.hpp +++ b/ggml-qnn/tensor.hpp @@ -4,6 +4,7 @@ #include "QnnTensor.h" #include "System/QnnSystemInterface.h" +#include "ggml-qnn.h" #include "backend.hpp" #include "qnn.hpp" diff --git a/ggml-qnn/utils.cpp b/ggml-qnn/utils.cpp new file mode 100644 index 0000000000000..798445c02fd76 --- /dev/null +++ b/ggml-qnn/utils.cpp @@ -0,0 +1,126 @@ + +#include "utils.hpp" + +#include "ggml-qnn.h" +#include "qnn-types.hpp" + +namespace qnn { + + // TODO: mapping more ggml data type to QNN data type + // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 + Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype) { + switch (ggmltype) { + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_SFIXED_POINT_4; + default: + break; + } + return QNN_DATATYPE_UNDEFINED; + } + + + uint32_t get_ggml_tensor_rank(const ggml_tensor* tensor) { + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; + } + } + return rank; + } + + + const char* get_backend_name(int n_backend_type) { + switch (n_backend_type) { + case QNN_BACKEND_CPU: + return "QNN-CPU"; + case QNN_BACKEND_GPU: + return "QNN-GPU"; + case QNN_BACKEND_NPU: + return "QNN-NPU"; + case QNN_BACKEND_GGML: + return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML + default: + return "unknown"; + } + } + + const char* get_chipset_desc(uint32_t chipset_id) { + switch (chipset_id) { + case SM8450: + return "SM8450"; + case SM8475: + return "SM8475"; + case SM8550: + return "SM8550"; + case SM8650: + return "SM8650"; + default: + return "unknown"; + } + } + + const char* get_htparch_desc(size_t htp_arch) { + switch (htp_arch) { + case V68: + return "QCOM_HTP_V68"; + case V69: + return "QCOM_HTP_V69"; + case V73: + return "QCOM_HTP_V73"; + case V75: + return "QCOM_HTP_V75"; + default: + return "unknown"; + } + } + + intptr_t align_to(size_t alignment, intptr_t offset) { + return offset % alignment == 0 + ? 
offset + : offset + (static_cast(alignment) - + offset % static_cast(alignment)); + } + + uint32_t get_ggml_tensor_data_size(const ggml_tensor* tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = qnn_get_ggml_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + return data_size; + */ + return ggml_nbytes(tensor); + } + + // ================================================================================================= + // + // QNN backend internal helper functions + // + // ================================================================================================= + // TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT + const char* opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL: + return QNN_OP_ELEMENT_WISE_MULTIPLY; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; + } + return nullptr; + } + +} diff --git a/ggml-qnn/utils.hpp b/ggml-qnn/utils.hpp index 2ec7c0f13f0ce..4889c6dc8601c 100644 --- a/ggml-qnn/utils.hpp +++ b/ggml-qnn/utils.hpp @@ -1,135 +1,34 @@ #pragma once +#include +#include +#include +#include +#include +#include + #include "QnnTypes.h" #include "ggml.h" -#include "qnn-types.hpp" +#include "logger.hpp" namespace qnn { - // TODO: mapping more ggml data type to QNN data type - // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 - Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype) { - switch (ggmltype) { - case GGML_TYPE_F16: - return QNN_DATATYPE_FLOAT_16; - case GGML_TYPE_F32: - return QNN_DATATYPE_FLOAT_32; - case GGML_TYPE_I8: - return QNN_DATATYPE_INT_8; - case GGML_TYPE_Q8_0: - return QNN_DATATYPE_SFIXED_POINT_8; - case GGML_TYPE_Q4_0: - return QNN_DATATYPE_SFIXED_POINT_4; - default: - break; - } - return QNN_DATATYPE_UNDEFINED; - } - - - uint32_t get_ggml_tensor_rank(const ggml_tensor* tensor) { - uint32_t rank = 0; - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { - rank++; - } - } - return rank; - } + Qnn_DataType_t datatype_from_ggml_datatype(enum ggml_type ggmltype); + uint32_t get_ggml_tensor_rank(const ggml_tensor* tensor); + const char* get_backend_name(int n_backend_type); + const char* get_chipset_desc(uint32_t chipset_id); + const char* get_htparch_desc(size_t htp_arch); + intptr_t align_to(size_t alignment, intptr_t offset); + uint32_t get_ggml_tensor_data_size(const ggml_tensor* tensor); - - const char* get_backend_name(int n_backend_type) { - switch (n_backend_type) { - case QNN_BACKEND_CPU: - return "QNN-CPU"; - case QNN_BACKEND_GPU: - return "QNN-GPU"; - case QNN_BACKEND_NPU: - return "QNN-NPU"; - case QNN_BACKEND_GGML: - return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML - default: - return "unknown"; - } - } - - const char* get_chipset_desc(uint32_t chipset_id) { - switch (chipset_id) { - case SM8450: - return "SM8450"; - case SM8475: - return "SM8475"; - case SM8550: - return "SM8550"; - case SM8650: - return "SM8650"; - default: - return "unknown"; - } - } - - const char* get_htparch_desc(size_t htp_arch) { - switch (htp_arch) { - case V68: - return "QCOM_HTP_V68"; - case V69: - return "QCOM_HTP_V69"; - case V73: - return "QCOM_HTP_V73"; - case V75: - return "QCOM_HTP_V75"; - default: - return "unknown"; - } - } + const char* opname_from_ggmlop(enum ggml_op ggmlop); template Fn 
load_qnn_functionpointers(void* handle, const char* function_name) { return reinterpret_cast(dlsym(handle, function_name)); } - intptr_t align_to(size_t alignment, intptr_t offset) { - return offset % alignment == 0 - ? offset - : offset + (static_cast(alignment) - - offset % static_cast(alignment)); - } - - uint32_t get_ggml_tensor_data_size(const ggml_tensor* tensor) { - /* - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = qnn_get_ggml_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } - - return data_size; - */ - return ggml_nbytes(tensor); - } - - - // ================================================================================================= - // - // QNN backend internal helper functions - // - // ================================================================================================= - // TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT - const char* opname_from_ggmlop(enum ggml_op ggmlop) { - switch (ggmlop) { - case GGML_OP_ADD: - return QNN_OP_ELEMENT_WISE_ADD; - case GGML_OP_MUL: - return QNN_OP_ELEMENT_WISE_MULTIPLY; - case GGML_OP_MUL_MAT: - return QNN_OP_MAT_MUL; - default: - break; - } - return nullptr; - } - inline int validate_tensor_version(Qnn_Tensor_t tensor) { if (tensor.version != QNN_TENSOR_VERSION_1) { QNN_LOG_WARN( @@ -272,6 +171,45 @@ namespace qnn { tensor.v1.memHandle = handle; } } + + +#if ENABLE_QNNBACKEND_PERF + class qnn_perf { + public: + qnn_perf(const std::string& perf_name) : _perf_name(std::move(perf_name)) {}; + qnn_perf() = delete; + qnn_perf(const qnn_perf&) = delete; + qnn_perf& operator= (const qnn_perf&) = delete; + + void start() { + _begin_time = ggml_time_us(); + } + + void info() { + _end_time = ggml_time_us(); + _duration = (_end_time - _begin_time); + QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); + } + + private: + int64_t _begin_time = 0LL; + int64_t _end_time = 0LL; + int64_t _duration = 0LL; + std::string _perf_name; + }; +#else + class qnn_perf { + public: + qnn_perf(const std::string& perf_name) {} + qnn_perf() = delete; + qnn_perf(const qnn_perf&) = delete; + qnn_perf& operator= (const qnn_perf&) = delete; + + void start() {} + void info() {} + }; +#endif + } diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt index 77a2059ed0f0c..66e8c077a1d3a 100644 --- a/tests/ggml-qnn/CMakeLists.txt +++ b/tests/ggml-qnn/CMakeLists.txt @@ -21,6 +21,8 @@ set(SOURCE_FILES ../../ggml-backend.c ../../ggml-quants.c ../../ggml-qnn/logger.cpp + ../../ggml-qnn/utils.cpp + ../../ggml-qnn/backend-ops.cpp ../../ggml-qnn.cpp ggml-qnn-ut.cpp )
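
Reviewer note (not part of the patch above): after this split, qnn::ggml_qnn_op_array() in ggml-qnn/backend-ops.cpp is the single table mapping GGML ops to QNN handlers, and ggml_qnn_compute_forward() simply indexes it with tensor->op. Below is a minimal, hypothetical sketch of how a future op (for example GGML_OP_MUL) could be wired in under that scheme; the handler name is illustrative only and does not exist in this patch.

    // Hypothetical sketch, not part of this patch: adding a handler to the new op table.
    // It would live in ggml-qnn/backend-ops.cpp next to ggml_qnn_add / ggml_qnn_mul_mat.
    #include "backend-ops.hpp"
    #include "logger.hpp"

    static void ggml_qnn_mul_example(ggml_backend_qnn_context* ctx, const ggml_tensor* src0,
                                     const ggml_tensor* src1, ggml_tensor* dst) {
        // A real handler would mirror ggml_qnn_add, building and caching a QNN graph around
        // QNN_OP_ELEMENT_WISE_MULTIPLY; this stub only demonstrates the signature
        // required by qnn::ggml_qnn_op_t.
        QNN_LOG_WARN("GGML_OP_MUL handler not implemented yet\n");
        (void)ctx; (void)src0; (void)src1; (void)dst;
    }

    // The only other change would be replacing the corresponding nullptr slot inside
    // kQnnOpsTable in qnn::ggml_qnn_op_array():
    //     ggml_qnn_mul_example, // GGML_OP_MUL
    // ggml_qnn_compute_forward() then dispatches to it automatically via
    // qnn::ggml_qnn_op_array()[tensor->op], with no further changes on the caller side.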