From b0c3013f2ea2c82a43248e43a0abfaebd5bb105a Mon Sep 17 00:00:00 2001
From: "zhou.weiguo"
Date: Wed, 24 Apr 2024 16:28:18 +0800
Subject: [PATCH 01/16] ggml: add Qualcomm QNN (Qualcomm Neural Network, aka Qualcomm AI Engine Direct) backend

---
 ggml-qnn.cpp | 4874 ++++++++++++++++++++++++++++++++++++++++++++++++++
 ggml-qnn.h   |   55 +
 ggml.c       |    3 +-
 llama.cpp    |   30 +-
 4 files changed, 4960 insertions(+), 2 deletions(-)
 create mode 100644 ggml-qnn.cpp
 create mode 100644 ggml-qnn.h

diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp
new file mode 100644
index 0000000000000..5d698f184c25d
--- /dev/null
+++ b/ggml-qnn.cpp
@@ -0,0 +1,4874 @@
+/*
+ * MIT license
+ * Copyright (C) 2024 GGML Authors
+ * SPDX-License-Identifier: MIT
+ *
+ * this is the implementation of the ggml QNN (Qualcomm Neural Network, aka Qualcomm AI Engine Direct) backend
+ *
+ * status:
+ *
+ * 1. core implementation: the data path works as expected with whisper.cpp using the QNN CPU/GPU backend on a low-end phone with a Qualcomm SoC
+ *
+ * 2. core implementation: the data path works as expected with whisper.cpp using the QNN HTP (aka DSP) backend on a high-end phone with a Qualcomm SoC
+ *
+ * 3. core implementation: the data path works as expected with llama.cpp using the QNN CPU/GPU/HTP (aka DSP) backend on a high-end phone with a Qualcomm SoC
+ *
+ * 4. GGML_OP_MUL_MAT, GGML_OP_MUL and GGML_OP_ADD using the QNN API are implemented
+ *
+ * todo:
+ *
+ * 1. implementations of the other GGML OPs using the QNN API are still missing
+ *
+ * 2. only FP32 / FP16 are supported, and the input and output tensors must be of the same data type
+ *
+ * 3. QNN's RPC feature (useful for the QNN HTP (aka DSP) backend) is not used yet
+ *
+ * 4. running multiple QNN backends (CPU/GPU/DSP) simultaneously is not supported
+ *
+ * 5. multithreading does not work with the QNN GPU/HTP (aka DSP) backend
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdarg.h>
+#include <inttypes.h>
+#include <math.h>
+#include <time.h>
+#include <errno.h>
+#include <unistd.h>
+#include <dlfcn.h>
+#include <pthread.h>
+
+#include <string>
+#include <vector>
+#include <thread>
+#include <mutex>
+#include <map>
+#include <set>
+#include <tuple>
+#include <queue>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <iomanip>
+#include <chrono>
+#include <memory>
+#include <regex>
+#include <random>
+#include <functional>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <condition_variable>
+#include <atomic>
+
+#include "QnnTypes.h"
+#include "QnnCommon.h"
+#include "QnnContext.h"
+#include "QnnBackend.h"
+#include "QnnGraph.h"
+#include "QnnProperty.h"
+#include "QnnTensor.h"
+#include "QnnInterface.h"
+#include "Saver/QnnSaver.h"
+#include "System/QnnSystemInterface.h"
+#include "HTP/QnnHtpDevice.h"
+
+#include "ggml-qnn.h"
+
+#include "ggml-backend-impl.h"
+
+
+// =================================================================================================
+//
+//  forward/external/helper declaration
+//
+// =================================================================================================
+class qnn_instance;
+
+//TODO: should be removed because this is a workaround method during the development stage
+extern "C" void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+
+#if (defined __ANDROID__) || (defined ANDROID) // Qualcomm's QNN can also run on Windows on ARM (aka WoA)
+extern "C" int __android_log_print(int prio, const char * tag, const char * fmt, ...)
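+// the __format__ attribute below makes the compiler type-check the
+// printf-style arguments of this logging entry point at every call site;
+// e.g. the (hypothetical) misuse __android_log_print(prio, "tag", "%s", 42)
+// would produce a -Wformat diagnostic instead of undefined behavior.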
+__attribute__((__format__(printf, 3, 4)));
+#endif
+
+static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...);
+
+
+
+// =================================================================================================
+//
+//  self-defined macro / data structure
+//
+// =================================================================================================
+#define RPCMEM_DEFAULT_FLAGS                            1
+#define RPCMEM_HEAP_ID_SYSTEM                           25
+
+#define GGML_DUMP_TENSOR(tensor)                        ggml_dump_tensor(tensor, #tensor)
+
+#define GGML_QNN_LOGBUF_LEN                             4096
+#define GGML_QNN_MAX_BUFFERS                            128
+#define MATRIX_ROW_PADDING                              512
+
+#define BUF_MAJOR_MASK                                  0xFF000000
+#define BUF_CONTROL_BASE                                0xEE000000
+
+#define GGML_QNN_DEBUG                                  1
+
+#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
+#define QNN_LOG_WARN(...)  ggml_qnn_log_internal(GGML_LOG_LEVEL_WARN,  __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
+#define QNN_LOG_INFO(...)  ggml_qnn_log_internal(GGML_LOG_LEVEL_INFO,  __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
+
+#if GGML_QNN_DEBUG
+#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
+#else
+#define QNN_LOG_DEBUG(...)
+#endif
+
+
+#define VALIDATE(value, status)                                     \
+    do {                                                            \
+        status = value;                                             \
+        if (status != QNN_SUCCESS) {                                \
+            QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value);      \
+            return status;                                          \
+        }                                                           \
+    } while (0)
+
+#define VALIDATE_TENSOR_VERSION(tensor, err)            VALIDATE(validate_tensor_version(tensor), err)
+
+#define VALIDATE_OP_CONFIG_VERSION(op, err)             VALIDATE(validate_opconfig_version(op), err)
+
+#define QNN_VER_PTR(x)                                  (&((x).v1))
+#define QNN_OP_CFG_VALID(opConfig)                      ((opConfig).version == QNN_OPCONFIG_VERSION_1)
+
+#define QNN_OP_CFG_GET_NAME(opConfig)                   get_qnn_oponfig_name(opConfig)
+#define QNN_OP_CFG_GET_PACKAGE_NAME(opConfig)           get_qnn_opconfig_packagename(opConfig)
+#define QNN_OP_CFG_GET_TYPE_NAME(opConfig)              get_qnn_opconfig_typename(opConfig)
+#define QNN_OP_CFG_GET_NUM_PARAMS(opConfig)             get_qnn_opconfig_numparams(opConfig)
+#define QNN_OP_CFG_GET_PARAMS(opConfig)                 get_qnn_opconfig_params(opConfig)
+#define QNN_OP_CFG_GET_NUM_INPUTS(opConfig)             get_qnn_opconfig_numinputs(opConfig)
+#define QNN_OP_CFG_GET_INPUTS(opConfig)                 get_qnn_opconfig_inputs(opConfig)
+#define QNN_OP_CFG_GET_NUM_OUTPUTS(opConfig)            get_qnn_opconfig_numoutputs(opConfig)
+#define QNN_OP_CFG_GET_OUTPUTS(opConfig)                get_qnn_opconfig_outputs(opConfig)
+
+#define QNN_OP_CFG_SET_NAME(opConfig, value)            set_qnn_opconfig_name(opConfig, value)
+#define QNN_OP_CFG_SET_PACKAGE_NAME(opConfig, value)    set_qnn_opconfig_packagename(opConfig, value)
+#define QNN_OP_CFG_SET_TYPE_NAME(opConfig, value)       set_qnn_opconfig_typename(opConfig, value)
+
+#define QNN_OP_CFG_SET_PARAMS(opConfig, numOfParams, params) \
+    set_qnn_opconfig_params(opConfig, numOfParams, params)
+
+#define QNN_OP_CFG_SET_INPUTS(opConfig, numOfInputs, inputTensors) \
+    set_qnn_opconfig_inputs(opConfig, numOfInputs, inputTensors)
+
+#define QNN_OP_CFG_SET_OUTPUTS(opConfig, numOfOutputs, outputTensors) \
+    set_qnn_opconfig_outputs(opConfig, numOfOutputs, outputTensors)
+
+#define QNN_TENSOR_GET_ID(tensor)                       get_qnn_tensorid(tensor)
+#define QNN_TENSOR_GET_NAME(tensor)                     get_qnn_tensorname(tensor)
+#define QNN_TENSOR_GET_TYPE(tensor)                     get_qnn_tensortype(tensor)
+#define QNN_TENSOR_GET_DATA_FORMAT(tensor)              get_qnn_tensor_dataformat(tensor)
+#define QNN_TENSOR_GET_DATA_TYPE(tensor)                
get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) +#define QNN_TENSOR_GET_CLIENT_BUF(tensor) get_qnn_tensor_clientbuf(tensor) +#define QNN_TENSOR_GET_MEM_HANDLE(tensor) get_qnn_tensor_memhandle(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) + + + +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); + +using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); +using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); +using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); + + + +typedef struct qnn_buf_s qnn_buf_t; +typedef struct qnn_buf_s qnn_buf_buffer_t; +typedef struct buf_element_s buf_element_t; +typedef void (*ggml_qnn_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +typedef void (*ggml_qnn_func_common_t)(const ggml_op ggmlop, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +enum class ggml_qnn_profile_level { + profile_off = 0, + profile_basic = 1, + profile_detail = 2 +}; + + +struct buf_element_s { + buf_element_t * next; + + unsigned char * mem; + unsigned char * content; /* start of raw content in mem */ + + uint32_t size ; /* size of content */ + int32_t max_size; /* size of pre-allocated memory pointed to by mem */ + uint32_t type; + void (*free_buffer) (buf_element_t * buf); + void * source; /* CPU, GPU, DSP, ... 
*/ + int id; +} ; + + +struct qnn_buf_s { + buf_element_t * first, * last; + + size_t qnn_buf_size; + uint32_t qnn_buf_data_size; + void * qnn_buf_empty_cb_data; + const char * name; + + pthread_mutex_t mutex; + pthread_cond_t not_empty; + + void (*put) (qnn_buf_t * fifo, buf_element_t * buf); + + buf_element_t *(*get) (qnn_buf_t * fifo); + + void (*clear) (qnn_buf_t * fifo) ; + + int (*size) (qnn_buf_t * fifo); + + int (*num_free) (qnn_buf_t * fifo); + + uint32_t (*data_size) (qnn_buf_t * fifo); + + void (*destroy) (qnn_buf_t * fifo); + + buf_element_t * (*buffer_alloc) (qnn_buf_t * self); + + buf_element_t * (*buffer_try_alloc) (qnn_buf_t * self); + + buf_element_t * buffer_pool_top; + pthread_mutex_t buffer_pool_mutex; + pthread_cond_t buffer_pool_cond_not_empty; + int buffer_pool_num_free; + int buffer_pool_capacity; + int buffer_pool_buf_size; + void * buffer_pool_base; /* used to free mem pool */ +} ; + + +struct ggml_backend_qnn_context { + int device; + int threads; + char name[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; + qnn_buf_t * buffer_pool; + struct ggml_backend * backend; + QNN_INTERFACE_VER_TYPE raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; +} ; + + +// ================================================================================================= +// +// static global variables +// +// ================================================================================================= +//TODO: should be removed for support multi QNN backend simultaneously +static ggml_backend_t g_qnn_backend = nullptr; + +//TODO: should be removed for support multi QNN backend simultaneously +static int g_current_device = 3; // 3 is the default ggml backend + +static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 }; +static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 }; +static void ggml_setup_op_has_task_pass(void) { + { // INIT + bool * p = GGML_OP_HAS_INIT; + + p[GGML_OP_ACC ] = true; + p[GGML_OP_MUL_MAT ] = true; + p[GGML_OP_MUL_MAT_ID ] = true; + p[GGML_OP_OUT_PROD ] = true; + p[GGML_OP_SET ] = true; + p[GGML_OP_GET_ROWS_BACK ] = true; + p[GGML_OP_DIAG_MASK_INF ] = true; + p[GGML_OP_DIAG_MASK_ZERO ] = true; + p[GGML_OP_CONV_TRANSPOSE_1D ] = true; + p[GGML_OP_CONV_TRANSPOSE_2D ] = true; + p[GGML_OP_FLASH_ATTN_BACK ] = true; + p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + p[GGML_OP_ADD_REL_POS ] = true; + } + + { // FINALIZE + bool * p = GGML_OP_HAS_FINALIZE; + + p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + } +} + + +//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/HTP(aka DSP) backend currently +static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { + [QNN_CPU] = {.device = 0, .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, + [QNN_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, + [QNN_HTP] = {.device = 2, .threads = 1, .name = "qnn-htp(aka dsp)", .lib = "libQnnHtp.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, +}; + + + +// ================================================================================================= +// +// internal helper functions +// +// 
================================================================================================= +static inline int validate_tensor_version(Qnn_Tensor_t tensor) { + if (tensor.version != QNN_TENSOR_VERSION_1) { + QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", + tensor.v1.name, + tensor.version); + return 1; + } + return 0; +} + + +static inline int validate_opconfig_version(Qnn_OpConfig_t opConfig) { + if (opConfig.version != QNN_OPCONFIG_VERSION_1) { + QNN_LOG_WARN("validate_opconfig_version() op %s, got unsupported version %d\n", + opConfig.v1.name, + opConfig.version); + return 1; + } + return 0; +} + + +static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.name; + } + return nullptr; +} + + +static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t * opConfig) { + return get_qnn_oponfig_name(*opConfig); +} + + +static inline const char * get_qnn_opconfig_packagename(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.packageName; + } + return nullptr; +} + + +static inline const char * get_qnn_opconfig_packagename(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_packagename(*opConfig); +} + + +static inline const char * get_qnn_opconfig_typename(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.typeName; + } + return nullptr; +} + + +static inline const char * get_qnn_opconfig_typename(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_typename(*opConfig); +} + + +static inline uint32_t get_qnn_opconfig_numparams(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.numOfParams; + } + return 0u; +} + + +static inline uint32_t get_qnn_opconfig_numparams(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_numparams(*opConfig); +} + + +static inline const Qnn_Param_t * get_qnn_opconfig_params(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.params; + } + return nullptr; +} + + +static inline const Qnn_Param_t * get_qnn_opconfig_params(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_params(*opConfig); +} + + +static inline uint32_t get_qnn_opconfig_numinputs(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.numOfInputs; + } + return 0u; +} + + +static inline uint32_t get_qnn_opconfig_numinputs(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_numinputs(*opConfig); +} + + +static inline const Qnn_Tensor_t * get_qnn_opconfig_inputs(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.inputTensors; + } + return nullptr; +} + + +static inline const Qnn_Tensor_t * get_qnn_opconfig_inputs(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_inputs(*opConfig); +} + + +static inline uint32_t get_qnn_opconfig_numoutputs(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return opConfig.v1.numOfOutputs; + } + return 0u; +} + + +static inline uint32_t get_qnn_opconfig_numoutputs(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_numoutputs(*opConfig); +} + + +static inline const Qnn_Tensor_t * get_qnn_opconfig_outputs(const Qnn_OpConfig_t & opConfig) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + return 
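+        // all accessors in this block share the same version-guarded shape:
+        // dispatch on the struct's version tag and fall back to a neutral
+        // default for unknown versions. A hypothetical caller, using the macro
+        // wrappers defined near the top of this file:
+        //   Qnn_OpConfig_t op_config = QNN_OPCONFIG_INIT;
+        //   if (QNN_OP_CFG_VALID(op_config)) {
+        //       uint32_t             n_outputs = QNN_OP_CFG_GET_NUM_OUTPUTS(op_config);
+        //       const Qnn_Tensor_t * outputs   = QNN_OP_CFG_GET_OUTPUTS(op_config);
+        //   }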
opConfig.v1.outputTensors; + } + return nullptr; +} + + +static inline const Qnn_Tensor_t * get_qnn_opconfig_outputs(const Qnn_OpConfig_t * opConfig) { + return get_qnn_opconfig_outputs(*opConfig); +} + + +static inline void set_qnn_opconfig_name(Qnn_OpConfig_t & opConfig, const char * name) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + opConfig.v1.name = name; + } +} + + +static inline void set_qnn_opconfig_name(Qnn_OpConfig_t * opConfig, const char * name) { + set_qnn_opconfig_name(*opConfig, name); +} + + +static inline void set_qnn_opconfig_packagename(Qnn_OpConfig_t & opConfig, const char * packageName) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + opConfig.v1.packageName = packageName; + } +} + + +static inline void set_qnn_opconfig_packagename(Qnn_OpConfig_t * opConfig, const char * packageName) { + set_qnn_opconfig_packagename(*opConfig, packageName); +} + + +static inline void set_qnn_opconfig_typename(Qnn_OpConfig_t & opConfig, const char * typeName) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + opConfig.v1.typeName = typeName; + } +} + + +static inline void set_qnn_opconfig_typename(Qnn_OpConfig_t * opConfig, const char * typeName) { + set_qnn_opconfig_typename(*opConfig, typeName); +} + + +static inline void set_qnn_opconfig_params(Qnn_OpConfig_t & opConfig, + uint32_t numOfParams, + Qnn_Param_t * params) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + opConfig.v1.numOfParams = numOfParams; + opConfig.v1.params = params; + } +} + + +static inline void set_qnn_opconfig_params(Qnn_OpConfig_t * opConfig, + uint32_t numOfParams, + Qnn_Param_t * params) { + set_qnn_opconfig_params(*opConfig, numOfParams, params); +} + + +static inline void set_qnn_opconfig_inputs(Qnn_OpConfig_t & opConfig, + uint32_t numOfInputs, + Qnn_Tensor_t * inputTensors) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + opConfig.v1.numOfInputs = numOfInputs; + opConfig.v1.inputTensors = inputTensors; + } +} + + +static inline void set_qnn_opconfig_inputs(Qnn_OpConfig_t * opConfig, + uint32_t numOfInputs, + Qnn_Tensor_t * inputTensors) { + set_qnn_opconfig_inputs(*opConfig, numOfInputs, inputTensors); +} + + +static inline void set_qnn_opconfig_outputs(Qnn_OpConfig_t & opConfig, + uint32_t numOfOutputs, + Qnn_Tensor_t * outputTensors) { + if (opConfig.version == QNN_OPCONFIG_VERSION_1) { + opConfig.v1.numOfOutputs = numOfOutputs; + opConfig.v1.outputTensors = outputTensors; + } +} + + +static inline void set_qnn_opconfig_outputs(Qnn_OpConfig_t * opConfig, + uint32_t numOfOutputs, + Qnn_Tensor_t * outputTensors) { + set_qnn_opconfig_outputs(*opConfig, numOfOutputs, outputTensors); +} + + +static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; + } + return 0u; +} + + +static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t * tensor) { return get_qnn_tensorid(*tensor); } + + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; + } + return nullptr; +} + + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t * tensor) { + return get_qnn_tensorname(*tensor); +} + + +static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; + } + return QNN_TENSOR_TYPE_UNDEFINED; +} + + +static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t * tensor) { + return 
get_qnn_tensortype(*tensor); +} + + +static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} + + +static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_dataformat(*tensor); +} + + +static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; + } + return QNN_DATATYPE_UNDEFINED; +} + + +static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_datatype(*tensor); +} + + +static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; +} + + +static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_quantparams(*tensor); +} + + +static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; + } + return 0u; +} + + +static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t * tensor) { return get_qnn_tensor_rank(*tensor); } + + +static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; + } + return nullptr; +} + + +static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_dimensions(*tensor); +} + + +static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; +} + + +static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_memtype(*tensor); +} + + +static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.clientBuf; + } + return QNN_CLIENT_BUFFER_INIT; +} + + +static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_clientbuf(*tensor); +} + + +static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memHandle; + } + return nullptr; +} + + +static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_memhandle(*tensor); +} + + +static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; + } +} + + +static inline void set_qnn_tensor_id(Qnn_Tensor_t * tensor, uint32_t id) { set_qnn_tensor_id(*tensor, id); } + + +static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; + } +} + + +static inline void set_qnn_tensor_name(Qnn_Tensor_t * tensor, const char * name) { + set_qnn_tensor_name(*tensor, name); +} + + +static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.type = type; + } +} + + +static inline void 
set_qnn_tensor_type(Qnn_Tensor_t * tensor, Qnn_TensorType_t type) { + set_qnn_tensor_type(*tensor, type); +} + + +static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataFormat = format; + } +} + + +static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t * tensor, Qnn_TensorDataFormat_t format) { + set_qnn_tensor_dataformat(*tensor, format); +} + + +static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataType = dataType; + } +} + + +static inline void set_qnn_tensor_datatype(Qnn_Tensor_t * tensor, Qnn_DataType_t dataType) { + set_qnn_tensor_datatype(*tensor, dataType); +} + + +static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.quantizeParams = params; + } +} + + +static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t * tensor, Qnn_QuantizeParams_t params) { + set_qnn_tensor_quantparams(*tensor, params); +} + + +static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.rank = rank; + } +} + + +static inline void set_qnn_tensor_rank(Qnn_Tensor_t * tensor, uint32_t rank) { + set_qnn_tensor_rank(*tensor, rank); +} + + +static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dimensions = dims; + } +} + + +static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t * tensor, uint32_t * dims) { + set_qnn_tensor_dimensions(*tensor, dims); +} + + +static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memType = memType; + } +} + + +static inline void set_qnn_tensor_memtype(Qnn_Tensor_t * tensor, Qnn_TensorMemType_t memType) { + set_qnn_tensor_memtype(*tensor, memType); +} + + +static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.clientBuf = clientBuf; + } +} + + +static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t * tensor, Qnn_ClientBuffer_t clientBuf) { + set_qnn_tensor_clientbuf(*tensor, clientBuf); +} + + +static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memHandle = handle; + } +} + + +static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t * tensor, Qnn_MemHandle_t handle) { + set_qnn_tensor_memhandle(*tensor, handle); +} + + + +static size_t memscpy(void * dst, size_t dstSize, const void * src, size_t copySize) { + if (!dst || !src || !dstSize || !copySize) + return 0; + + size_t minSize = dstSize < copySize ? 
dstSize : copySize; + + memcpy(dst, src, minSize); + + return minSize; +} + + +static char * ggml_qnn_strndup(const char * source, size_t maxlen) { + return ::strndup(source, maxlen); +} + + +static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { + int err = 0; + VALIDATE_TENSOR_VERSION(src, err); + + dst.version = src.version; + QNN_TENSOR_SET_NAME( + dst, ggml_qnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); + if (QNN_TENSOR_GET_NAME(dst) == nullptr) { + return 1; + } + QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); + QNN_TENSOR_SET_TYPE(dst, QNN_TENSOR_GET_TYPE(src)); + QNN_TENSOR_SET_DATA_FORMAT(dst, QNN_TENSOR_GET_DATA_FORMAT(src)); + QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); + QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); + + // Only metadata (i.e. non-static data) is copied from source to destination. The union still + // must be initialized so that the clientBuf/memHandle do not contain garbage data + if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { + Qnn_ClientBuffer_t clientBuf = {nullptr, 0}; + QNN_TENSOR_SET_CLIENT_BUF(dst, clientBuf); + } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { + QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); + } else { + return 1; + } + + Qnn_QuantizeParams_t srcQParam = QNN_TENSOR_GET_QUANT_PARAMS(src); + Qnn_QuantizationEncoding_t encoding = srcQParam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + // need to allocate and copy memory for scaleOffset as it is a pointer array + Qnn_QuantizeParams_t srcQParamCpy = srcQParam; + Qnn_AxisScaleOffset_t &axisScaleOffset = srcQParamCpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t **scaleOffset = &axisScaleOffset.scaleOffset; + size_t scaleOffsetSize = axisScaleOffset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scaleOffset = (Qnn_ScaleOffset_t *)malloc(scaleOffsetSize); + memscpy(*scaleOffset, + scaleOffsetSize, + srcQParam.axisScaleOffsetEncoding.scaleOffset, + scaleOffsetSize); + QNN_TENSOR_SET_QUANT_PARAMS(dst, srcQParamCpy); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + // need to allocate and copy memory for scaleOffset as it is a pointer array + Qnn_QuantizeParams_t srcQParamCpy = srcQParam; + Qnn_BwAxisScaleOffset_t &bwAxisScaleOffset = srcQParamCpy.bwAxisScaleOffsetEncoding; + size_t scaleSize = bwAxisScaleOffset.numElements * sizeof(float); + float **scales = &bwAxisScaleOffset.scales; + int32_t **offsets = &bwAxisScaleOffset.offsets; + *scales = (float *)malloc(scaleSize); + memscpy(*scales, scaleSize, srcQParam.bwAxisScaleOffsetEncoding.scales, scaleSize); + + // Only copy offsets if present, nullptr implies all offsets are 0 + if (bwAxisScaleOffset.offsets != nullptr) { + size_t offsetSize = bwAxisScaleOffset.numElements * sizeof(int32_t); + *offsets = (int32_t *)malloc(offsetSize); + memscpy(*offsets, offsetSize, srcQParam.bwAxisScaleOffsetEncoding.offsets, offsetSize); + } + QNN_TENSOR_SET_QUANT_PARAMS(dst, srcQParamCpy); + } else { + QNN_TENSOR_SET_QUANT_PARAMS(dst, srcQParam); + } + + // need to allocate and copy memory for all the pointer members + uint32_t rank = QNN_TENSOR_GET_RANK(src); + QNN_TENSOR_SET_RANK(dst, rank); + size_t dim_size = rank * sizeof(uint32_t); + uint32_t * dimensions = (uint32_t *)malloc(dim_size); + if (dimensions == nullptr) { + QNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", QNN_TENSOR_GET_NAME(src)); + return 1; + } + 
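+    // memscpy() (defined above) is a bounded memcpy: it copies
+    // min(dstSize, copySize) bytes and returns the number of bytes actually
+    // copied, so an undersized destination truncates instead of overflowing.
+    // A hypothetical defensive check for the call below:
+    //   if (memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size) != dim_size) {
+    //       QNN_LOG_WARN("dimension copy truncated\n");
+    //   }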
memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size);
+    QNN_TENSOR_SET_DIMENSIONS(dst, dimensions);
+
+    return err;
+}
+
+
+static int free_qnn_tensor(Qnn_Tensor_t & tensor) {
+    int err = 0;
+    VALIDATE_TENSOR_VERSION(tensor, err);
+
+    if (nullptr == QNN_TENSOR_GET_NAME(tensor)) {
+        QNN_LOG_INFO("it should not happen, please check\n");
+    } else {
+        //QNN_LOG_DEBUG("QNN tensor name %s", QNN_TENSOR_GET_NAME(tensor));
+        free((void *) QNN_TENSOR_GET_NAME(tensor));
+    }
+    if (nullptr == QNN_TENSOR_GET_DIMENSIONS(tensor)) {
+        QNN_LOG_INFO("it should not happen, please check\n");
+    } else {
+        //TODO: why does this crash here? why does the pointer change with mul_mat?
+        //      leaving the free() below commented out leaks the dimensions array
+        //free(QNN_TENSOR_GET_DIMENSIONS(tensor));
+    }
+
+    return err;
+}
+
+
+static int free_qnn_tensors(Qnn_Tensor_t *& tensors, uint32_t numTensors) {
+    int err = 0;
+
+    // free all pointer allocations in struct
+    for (size_t i = 0; i < numTensors; i++) {
+        free_qnn_tensor(tensors[i]);
+    }
+    free(tensors);
+
+    return err;
+}
+
+
+static float ggml_tensor_sum_elements(const ggml_tensor * tensor) {
+    double sum   = 0;
+    float  value = 0;
+    std::ostringstream tmposs;
+    if (tensor->type == GGML_TYPE_F32) {
+        for (int h = 0; h < tensor->ne[3]; h++) {
+            for (int i = 0; i < tensor->ne[2]; i++) {
+                for (int j = 0; j < tensor->ne[1]; j++) {
+                    for (int k = 0; k < tensor->ne[0]; k++) {
+                        value = ((float *) tensor->data)[((h * tensor->ne[2] + i) * tensor->ne[1] + j) * tensor->ne[0] + k];
+                        sum += value;
+                        //QNN_LOG_DEBUG("[%d][%d][%d][%d]%.2f \t", h, i, j, k, value);
+                        tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << "\t";
+                    }
+                    if (strlen(tmposs.str().c_str()) <= 4000) {
+                        QNN_LOG_DEBUG("%s", tmposs.str().c_str());
+                    }
+                    tmposs.clear();
+                    tmposs.str("");
+                    QNN_LOG_DEBUG("\n");
+                }
+            }
+        }
+    }
+    QNN_LOG_DEBUG("\n");
+    return (float) sum;
+}
+
+
+static void ggml_dump_tensor(const ggml_tensor * tensor, const char * name) {
+    QNN_LOG_DEBUG("dump ggml tensor %s\n", name);
+    QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", name,
+          tensor->type, ggml_type_name(tensor->type),
+          tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
+    float sum = ggml_tensor_sum_elements(tensor);
+    QNN_LOG_DEBUG("sum of tensor %s is %6.2f\n", name, sum);
+}
+
+
+static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) {
+    uint32_t rank = 0;
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) {
+            rank++;
+        }
+    }
+    return rank;
+}
+
+
+//TODO:
+//ref: explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684
+static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) {
+    switch (ggmltype) {
+        case GGML_TYPE_Q4_0:
+            return QNN_DATATYPE_UFIXED_POINT_4;
+        case GGML_TYPE_Q4_1:
+            return QNN_DATATYPE_SFIXED_POINT_4;
+        case GGML_TYPE_Q8_0:
+            return QNN_DATATYPE_UFIXED_POINT_8;
+        case GGML_TYPE_Q8_1:
+            return QNN_DATATYPE_SFIXED_POINT_8;
+        case GGML_TYPE_F16:
+            return QNN_DATATYPE_FLOAT_16;
+        case GGML_TYPE_F32:
+            return QNN_DATATYPE_FLOAT_32;
+        default:
+            break;
+    }
+    return QNN_DATATYPE_FLOAT_32;
+}
+
+
+//TODO:
+static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) {
+    switch (ggmlop) {
+        case GGML_OP_ADD:
+            return QNN_OP_ELEMENT_WISE_ADD;
+        case GGML_OP_MUL:
+            return QNN_OP_ELEMENT_WISE_MULTIPLY;
+        case GGML_OP_MUL_MAT:
+            return QNN_OP_MAT_MUL;
+        default:
+            break;
+    }
+
+    return nullptr;
+}
+
+
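+// sketch of how the two mappings above are meant to be consumed when a QNN
+// graph node is built later in this backend (hypothetical fragment; the
+// package name is the stock QNN op package):
+//   Qnn_OpConfig_t op_config = QNN_OPCONFIG_INIT;
+//   op_config.v1.packageName = QNN_OP_PACKAGE_NAME_QTI_AISW;
+//   op_config.v1.typeName    = qnn_opname_from_ggmlop(GGML_OP_ADD); // QNN_OP_ELEMENT_WISE_ADD
+//   op_config.v1.name        = "ggml_op_add_0";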
+static uint32_t ggml_get_tensor_data_size(const ggml_tensor * tensor) {
+    /*
+    size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]);
+    size_t n_dims = ggml_get_tensor_rank(tensor);
+    for (int i = 1; i < n_dims; i++) {
+        data_size *= tensor->ne[i];
+    }
+
+    return data_size;
+    */
+    return ggml_nbytes(tensor);
+}
+
+
+template<typename Fn>
+Fn load_qnn_functionpointers(void * handle, const char * function_name) {
+    return reinterpret_cast<Fn>(dlsym(handle, function_name));
+}
+
+
+static void qnn_xfree(void * ptr) {
+    if (nullptr != ptr) {
+        free(ptr);
+    }
+}
+
+
+static void * qnn_xmalloc(size_t size) {
+    void * ptr;
+
+    if (!size)
+        size++;
+
+    if ((ptr = calloc(1, size)) == nullptr) {
+        QNN_LOG_WARN("calloc(%zu) failed: %s\n", size, strerror(errno));
+        return nullptr;
+    }
+
+    return ptr;
+}
+
+
+static void * qnn_xmalloc_aligned(size_t alignment, size_t size, void ** base) {
+    char * ptr;
+
+    *base = ptr = static_cast<char *>(qnn_xmalloc(size + alignment));
+
+    while ((size_t) ptr % alignment)
+        ptr++;
+
+    return ptr;
+}
+
+
+static void buffer_pool_free (buf_element_t * element) {
+    qnn_buf_t * self = (qnn_buf_t *) element->source;
+
+    pthread_mutex_lock(&self->buffer_pool_mutex);
+
+    element->next         = self->buffer_pool_top;
+    self->buffer_pool_top = element;
+
+    self->buffer_pool_num_free++;
+    if (self->buffer_pool_num_free > self->buffer_pool_capacity) {
+        QNN_LOG_DEBUG("TOO MANY FREE\n");
+    }
+
+    pthread_cond_signal (&self->buffer_pool_cond_not_empty);
+
+    pthread_mutex_unlock (&self->buffer_pool_mutex);
+}
+
+
+static buf_element_t * buffer_pool_alloc (qnn_buf_t * self) {
+    buf_element_t * buf = nullptr;
+
+    pthread_mutex_lock (&self->buffer_pool_mutex);
+
+    while (self->buffer_pool_num_free < 2) {
+        pthread_cond_wait (&self->buffer_pool_cond_not_empty, &self->buffer_pool_mutex);
+    }
+
+    buf                   = self->buffer_pool_top;
+    self->buffer_pool_top = self->buffer_pool_top->next;
+    self->buffer_pool_num_free--;
+
+    buf->content = buf->mem;
+    buf->size    = 0;
+    buf->type    = 0;
+
+    pthread_mutex_unlock (&self->buffer_pool_mutex);
+
+    return buf;
+}
+
+
+static buf_element_t * buffer_pool_try_alloc (qnn_buf_t * self) {
+    buf_element_t * buf = nullptr;
+
+    pthread_mutex_lock (&self->buffer_pool_mutex);
+
+    if (self->buffer_pool_top) {
+        buf                   = self->buffer_pool_top;
+        self->buffer_pool_top = self->buffer_pool_top->next;
+        self->buffer_pool_num_free--;
+    } else {
+        buf = nullptr;
+    }
+
+    pthread_mutex_unlock (&self->buffer_pool_mutex);
+
+    if (buf) {
+        buf->content = buf->mem;
+        buf->size    = 0;
+    }
+
+    return buf;
+}
+
+
+static void qnn_buf_buffer_put(qnn_buf_t * fifo, buf_element_t * element) {
+    pthread_mutex_lock (&fifo->mutex);
+
+    if (fifo->last)
+        fifo->last->next = element;
+    else
+        fifo->first = element;
+
+    fifo->last    = element;
+    element->next = nullptr;
+    fifo->qnn_buf_size++;
+    fifo->qnn_buf_data_size += element->size;
+
+    QNN_LOG_DEBUG("put: index %d, fifo->size is %zu, self->buffer_pool_num_free %d\n", element->id, fifo->qnn_buf_size, fifo->buffer_pool_num_free);
+    pthread_cond_signal (&fifo->not_empty);
+
+    pthread_mutex_unlock (&fifo->mutex);
+}
+
+
+static buf_element_t * qnn_buf_buffer_get (qnn_buf_t * fifo) {
+    buf_element_t * buf = nullptr;
+
+    pthread_mutex_lock (&fifo->mutex);
+#if 0
+    while (fifo->first == nullptr) {
+        pthread_cond_wait (&fifo->not_empty, &fifo->mutex);
+    }
+#else
+    if (fifo->first == nullptr) {
+        pthread_mutex_unlock (&fifo->mutex);
+        return nullptr;
+    }
+#endif
+
+    buf = fifo->first;
+
+    fifo->first = fifo->first->next;
+    if (fifo->first == nullptr)
+        fifo->last = nullptr;
+
+    
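+    // the active #else branch above makes get() non-blocking: an empty FIFO
+    // yields nullptr (the disabled #if 0 variant would block on the condition
+    // variable instead), so callers must tolerate a null result, e.g.:
+    //   buf_element_t * e = fifo->get(fifo);
+    //   if (nullptr == e) { /* nothing queued yet */ }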
fifo->qnn_buf_size--; + fifo->qnn_buf_data_size -= buf->size; + + pthread_mutex_unlock (&fifo->mutex); + + return buf; +} + + +static void qnn_buf_buffer_clear (qnn_buf_t * fifo) { + buf_element_t * buf, * next, * prev; + + pthread_mutex_lock (&fifo->mutex); + + buf = fifo->first; + prev = nullptr; + + while (buf != nullptr) { + next = buf->next; + if ((buf->type & BUF_MAJOR_MASK) != BUF_CONTROL_BASE) { + if (prev) + prev->next = next; + else + fifo->first = next; + + if (!next) + fifo->last = prev; + + fifo->qnn_buf_size--; + fifo->qnn_buf_data_size -= buf->size; + + buf->free_buffer(buf); + } else { + prev = buf; + } + + buf = next; + } + + QNN_LOG_DEBUG("free buffers after clear: %d\n", fifo->buffer_pool_num_free); + pthread_mutex_unlock (&fifo->mutex); +} + + +static int qnn_buf_buffer_size (qnn_buf_t * self) { + int size = 0; + + pthread_mutex_lock(&self->mutex); + size = self->qnn_buf_size; + pthread_mutex_unlock(&self->mutex); + + return size; +} + + +static uint32_t qnn_buf_buffer_data_size (qnn_buf_t * self) { + uint32_t data_size; + + pthread_mutex_lock(&self->mutex); + data_size = self->qnn_buf_data_size; + pthread_mutex_unlock(&self->mutex); + + return data_size; +} + + +static int qnn_buf_buffer_num_free (qnn_buf_t * self) { + int buffer_pool_num_free = 0; + + pthread_mutex_lock(&self->mutex); + buffer_pool_num_free = self->buffer_pool_num_free; + pthread_mutex_unlock(&self->mutex); + + return buffer_pool_num_free; +} + + +static void qnn_buf_buffer_dispose (qnn_buf_t * self) { + buf_element_t * buf, * next; + int received = 0; + + self->clear( self ); + buf = self->buffer_pool_top; + + while (buf != nullptr) { + next = buf->next; + qnn_xfree(buf); + received++; + + buf = next; + } + + while (received < self->buffer_pool_capacity) { + buf = self->get(self); + qnn_xfree(buf); + received++; + } + + qnn_xfree(self->buffer_pool_base); + pthread_mutex_destroy(&self->mutex); + pthread_cond_destroy(&self->not_empty); + pthread_mutex_destroy(&self->buffer_pool_mutex); + pthread_cond_destroy(&self->buffer_pool_cond_not_empty); + qnn_xfree((void *)self->name); + qnn_xfree (self); +} + + +static qnn_buf_t * qnn_buf_new(const char * name, int num_buffers, uint32_t buf_size) { + int i = 0; + int alignment = 4; + qnn_buf_t * self = nullptr; + uint8_t * multi_buffer = nullptr; + + self = (qnn_buf_t*)qnn_xmalloc(sizeof(qnn_buf_t)); + if (nullptr == self) { + QNN_LOG_WARN("malloc memory failed\n"); + return nullptr; + } + + self->name = strdup(name); + self->first = nullptr; + self->last = nullptr; + self->qnn_buf_size = 0; + self->put = qnn_buf_buffer_put; + self->get = qnn_buf_buffer_get; + self->clear = qnn_buf_buffer_clear; + self->size = qnn_buf_buffer_size; + self->num_free = qnn_buf_buffer_num_free; + self->data_size = qnn_buf_buffer_data_size; + self->destroy = qnn_buf_buffer_dispose; + pthread_mutex_init (&self->mutex, nullptr); + pthread_cond_init (&self->not_empty, nullptr); + + + if (buf_size % alignment != 0) + buf_size += alignment - (buf_size % alignment); + + QNN_LOG_INFO("[%s]allocating %d Mbytes memory(alignment = %d)\n", name, (num_buffers * buf_size) / (1 << 20), alignment); + + multi_buffer = (uint8_t *)qnn_xmalloc_aligned (alignment, num_buffers * buf_size, &self->buffer_pool_base); + if (nullptr == multi_buffer) { + QNN_LOG_WARN("malloc memory failed\n"); + free(self); + return nullptr; + } + + self->buffer_pool_top = nullptr; + + pthread_mutex_init (&self->buffer_pool_mutex, nullptr); + pthread_cond_init (&self->buffer_pool_cond_not_empty, nullptr); + + 
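+    // lifecycle sketch for the pool assembled here, using the function
+    // pointers wired up in this constructor (sizes are hypothetical):
+    //   qnn_buf_t * pool = qnn_buf_new("qnn-cpu", 32, 1 << 20); // 32 buffers of 1 MiB
+    //   buf_element_t * e = pool->buffer_alloc(pool); // blocks while fewer than 2 are free
+    //   pool->put(pool, e);                           // queue for a consumer
+    //   e = pool->get(pool);                          // non-blocking dequeue
+    //   e->free_buffer(e);                            // return the element to the pool
+    //   pool->destroy(pool);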
self->buffer_pool_num_free  = 0;
+    self->buffer_pool_capacity  = num_buffers;
+    self->buffer_pool_buf_size  = buf_size;
+    self->buffer_alloc          = buffer_pool_alloc;
+    self->buffer_try_alloc      = buffer_pool_try_alloc;
+
+    for (i = 0; i < num_buffers; i++) {
+        buf_element_t * buf = nullptr;
+
+        buf = (buf_element_t *)qnn_xmalloc(sizeof (buf_element_t));
+        if (nullptr == buf) {
+            QNN_LOG_WARN("malloc memory failed\n");
+            free(multi_buffer);
+            free(self);
+            return nullptr;
+        }
+
+        buf->id          = i;
+        buf->mem         = multi_buffer;
+        multi_buffer    += buf_size;
+
+        buf->max_size    = buf_size;
+        buf->free_buffer = buffer_pool_free;
+        buf->source      = self;
+
+        buffer_pool_free(buf);
+    }
+
+    return self;
+}
+
+
+static const char * get_qnn_backend_name(int n_backend_type) {
+    switch (n_backend_type) {
+        case 0:
+            return "QNN-CPU";
+        case 1:
+            return "QNN-GPU";
+        case 2:
+            return "QNN-HTP(DSP)";
+        case 3:
+            return "ggml"; // the default GGML backend, used to compare performance between the QNN backends and the default GGML backend
+
+#if 0 // the QNN cDSP and HTA backends are not used currently; the focus is on the QNN CPU/GPU/HTP (aka DSP) backends
+        case 3:
+            return "QNN-cDSP";
+        case 4:
+            return "QNN-HTA";
+#endif
+
+        default:
+            return "unknown";
+    }
+}
+
+
+static intptr_t align_to(size_t alignment, intptr_t offset) {
+    return offset % alignment == 0 ? offset
+                                   : offset +
+                                     (static_cast<intptr_t>(alignment) -
+                                      offset % static_cast<intptr_t>(alignment));
+}
+
+
+static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) {
+    static std::mutex ggml_qnn_log_internal_mutex;
+    static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN];
+
+    {
+        std::lock_guard<std::mutex> lock(ggml_qnn_log_internal_mutex);
+        va_list args;
+        va_start(args, format);
+        int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line);
+        int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args);
+        if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) {
+#if (defined __ANDROID__) || (defined ANDROID)
+            __android_log_print(level, "llamacpp", "%s", s_ggml_qnn_log_internal_buf);
+#else
+            printf("%s", s_ggml_qnn_log_internal_buf); // Qualcomm's QNN can also run on Windows on ARM
+#endif
+        }
+        va_end(args);
+    }
+}
+
+
+// =================================================================================================
+//
+//  wrapper class of Qualcomm QNN (Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK
+//
+// =================================================================================================
+class qnn_interface {
+
+#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name)                     \
+    template <typename... Args>                                             \
+    inline auto qnn_##F(Args... args) const {                               \
+        return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)(       \
+            std::forward<Args>(args)...);                                   \
+    }
+
+
+#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name)                 \
+    template <typename... Args>                                             \
+    inline auto qnn_##F(Args... 
args) const { \ + return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + friend class qnn_instance; + +public: + qnn_interface() = default; + + // QnnBackend + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); + + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); + + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); + + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); + + // QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); + + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); + + // QnnMem + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); + + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); + + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); + + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); + + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); + + void set_qnn_interface(const QnnInterface_t * qnn_interface) { + _qnn_interface = qnn_interface; + } + + void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { + _qnn_sys_interface = qnn_sys_interface; + } + + uint32_t get_backend_id() const { + return _qnn_interface->backendId; + } + + bool is_loaded() const { + return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); + } + +private: + const QnnInterface_t *_qnn_interface = nullptr; + + const QnnSystemInterface_t *_qnn_sys_interface 
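+    // e.g. DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute) above
+    // expands to a variadic qnn_graph_execute(...) forwarder that resolves
+    // graphExecute through the loaded QnnInterface_t, so a call site reads
+    // (hypothetical): _qnn_interface.qnn_graph_execute(graph_handle,
+    //     inputs, n_inputs, outputs, n_outputs, profile_handle, nullptr);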
= nullptr; +}; + + + +// ================================================================================================= +// +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// +// and +// +// resource management of QNN resources for GGML's QNN backend +// ================================================================================================= +class qnn_instance { +public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, + const std::string & model_name) : + _lib_path(std::move(lib_path)), + _backend_name(std::move(backend_name)), + _model_name(std::move(model_name)) {}; + + ~qnn_instance() { + } + + int qnn_init(const QnnSaver_Config_t ** saver_config); + + int qnn_finalize(); + + const qnn_interface &get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + + const QNN_INTERFACE_VER_TYPE &get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE &get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + + const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + + const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + + const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + + const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + + const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + + + int init_qnn_graph(const char * graph_name, + bool debug, + uint8_t do_node_validation = 1, + const QnnGraph_Config_t ** graph_configs = nullptr + ); + + int finalize_qnn_graph(); + + int init_htp_perfinfra() { + QnnDevice_Infrastructure_t device_infra = nullptr; + int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get qnn device infra\n"); + return 1; + } + + QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t *htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; + htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + _qnn_htp_perfinfra = htp_perfinfra; + _qnn_power_configid = power_configid; + + return 0; + } + + + int set_rpc_polling() { + if (_qnn_rpc_pollingtime > 0) { + QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingTime; + memset(&rpc_pollingTime, 0, sizeof(rpc_pollingTime)); + rpc_pollingTime.option = + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + rpc_pollingTime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; + const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = {&rpc_pollingTime, nullptr}; + if (_qnn_htp_perfinfra) { + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); + } + } + return 0; + } + + + int set_high_performance_mode() { + if (nullptr == _qnn_htp_perfinfra) { + QNN_LOG_DEBUG("perf intra is 
null\n"); + return 1; + } + + QnnHtpPerfInfrastructure_PowerConfig_t powerConfig; + memset(&powerConfig, 0, sizeof(powerConfig)); + powerConfig.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; + powerConfig.dcvsV3Config.dcvsEnable = 0; + powerConfig.dcvsV3Config.setDcvsEnable = 1; + powerConfig.dcvsV3Config.contextId = _qnn_power_configid; + powerConfig.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + powerConfig.dcvsV3Config.setSleepLatency = 1; // True to consider Latency parameter otherwise False + powerConfig.dcvsV3Config.setBusParams = 1; // True to consider Bus parameter otherwise False + powerConfig.dcvsV3Config.setCoreParams = 1; // True to consider Core parameter otherwise False + powerConfig.dcvsV3Config.sleepDisable = 0; // True to consider sleep/LPM modes, False to enable + powerConfig.dcvsV3Config.setSleepDisable = 0; // True to consider sleep disable/enable parameter otherwise False + // set Sleep latency parameter + uint32_t latencyValue = 40; + powerConfig.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 micro sec + // set Bus Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + powerConfig.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set Core Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + powerConfig.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set power config with different performance parameters + const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = {&powerConfig, nullptr}; + + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); + + return 0; + } + + std::string &get_qnn_graph_name() { return _graph_name; } + + bool is_rpcmem_initialized() { + return _rpcmem_initialized; + } + + void set_rpcmem_initialized(bool initialized) { + _rpcmem_initialized = initialized; + } + + int32_t rpcmem_to_fd(void * buf); + + int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); + + void unregister_rpcmem(); + + void *alloc_rpcmem(size_t bytes, size_t alignment); + + void free_rpcmem(void * buf); + + bool is_rpcmem_allocated(void * buf); + + bool is_rpcmem_registered(Qnn_MemHandle_t handle) { + return _qnn_mem_set.count(handle) != 0U; + } + +public: + std::map> _qnn_graph_map; + +private: + int load_system(); + + int unload_system(); + + int load_backend(std::string &lib_path, const QnnSaver_Config_t ** saver_config); + + int unload_backend(); + + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_interface = raw_interface; + } + + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE &raw_interface) { + _qnn_raw_system_interface = raw_interface; + } + +private: + static constexpr const int _required_num_providers = 1; + +private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // prebuilt QNN model name, not used in currently + BackendIdType _backend_id; + + bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode + bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be 
validated + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + + ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; + + qnn_interface _qnn_interface; + + void *_system_lib_handle = nullptr; + void *_model_lib_handle = nullptr; + + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + + Qnn_LogHandle_t _qnn_log_handle = nullptr; + + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + + QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; + uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + + std::unordered_set _qnn_mem_set; + + static std::mutex _init_mutex; + static std::unordered_map _loaded_lib_handle; + static std::unordered_map _lib_path_to_backend_id; + static std::unordered_map _loaded_backend; + + void *_rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{false}; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + + + std::string _graph_name; +}; + + + +// ================================================================================================= +// +// implementation of wrapper class +// +// ================================================================================================= +std::mutex qnn_instance::_init_mutex; + +std::unordered_map qnn_instance::_loaded_lib_handle; + +std::unordered_map qnn_instance::_lib_path_to_backend_id; + +std::unordered_map qnn_instance::_loaded_backend; + + +void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return nullptr; + } + + auto allocate_bytes = static_cast(bytes + alignment); + void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + if (buf == nullptr) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + return nullptr; + } + + auto aligned_buf = reinterpret_cast(align_to(alignment, + reinterpret_cast(buf))); + bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + _pfn_rpc_mem_free(buf); + } + + return aligned_buf; +} + + +void qnn_instance::free_rpcmem(void * buf) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else if (0 == _rpcmem_store_map.count(buf)) { + QNN_LOG_WARN("no allocated tensor\n"); + } else { + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); + } +} + + +int32_t qnn_instance::rpcmem_to_fd(void *buf) { + int32_t mem_fd = -1; + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else { + mem_fd = _pfn_rpc_mem_to_fd(buf); + } + + return mem_fd; +} + + +int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { + if (nullptr == p_data || (nullptr == p_tensor)) { + QNN_LOG_WARN("invalid param\n"); + return 1; + } + + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return 2; + } + + if (is_rpcmem_allocated(p_data)) { + 
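+        // intended zero-copy flow for the HTP backend, sketched from the
+        // methods declared in this class (see also todo #3 in the file header;
+        // the 4096 alignment is an assumption):
+        //   void * p = instance->alloc_rpcmem(ggml_nbytes(tensor), 4096);
+        //   instance->register_rpcmem(p, &qnn_tensor); // ion fd -> Qnn_MemHandle_t
+        //   ... execute the graph, the HTP reads/writes p directly ...
+        //   instance->unregister_rpcmem();
+        //   instance->free_rpcmem(p);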
QNN_LOG_WARN("rpc memory already allocated\n"); + //return 3; + } + if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { + QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + return 4; + } + + int32_t mem_fd = rpcmem_to_fd(p_data); + if (-1 == mem_fd) { + QNN_LOG_WARN("failed to get file descriptor\n"); + return 5; + } + QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = { + {QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr}, + QNN_VER_PTR(*p_tensor)->dataType, + QNN_MEM_TYPE_ION, + {{mem_fd}}}; + Qnn_MemHandle_t handle = nullptr; + int error = QNN_SUCCESS; + error = _qnn_interface.qnn_mem_register( + _qnn_context_handle, + &descriptor, + /*numDescriptors=*/1, + &handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), + strerror(error)); + return 6; + } else { + QNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + } + QNN_VER_PTR(*p_tensor)->memHandle = handle; + _qnn_mem_set.insert(handle); + + return 0; +} + + +void qnn_instance::unregister_rpcmem() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (_qnn_mem_set.empty()) { + QNN_LOG_WARN("no rpcmem registered\n"); + } + + for (auto &mem_handle : _qnn_mem_set) { + error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); + } + } + _qnn_mem_set.clear(); +} + + +bool qnn_instance::is_rpcmem_allocated(void * buf) { + return _rpcmem_store_map.count(buf) != 0U; +} + + +int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + + void *lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (nullptr == lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + return 1; + } + + // load get_provider function + auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>(lib_handle, + "QnnInterface_getProviders"); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + return 2; + } + + // get QnnInterface Providers + std::uint32_t num_providers = 0; + const QnnInterface_t **provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + QNN_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + QNN_LOG_WARN("failed to get qnn interface providers\n"); + return 5; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_interface) { + QNN_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } else { + 
QNN_LOG_INFO("find a valid qnn interface\n"); + } + set_qnn_raw_interface(qnn_interface); + + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", + lib_path.c_str(), backend_id); + } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); + if (dlclose_error != 0) { + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); + } + } + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; + + auto saver_initialize = load_qnn_functionpointers<_pfn_QnnSaver_initialize *>( + _loaded_lib_handle[backend_id], "QnnSaver_initialize"); + if (nullptr != saver_initialize) { + error = saver_initialize(saver_config); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to saver_initialize,error %d", QNN_GET_ERROR_CODE(error)); + return 7; + } + } else { + QNN_LOG_WARN("saver_initialize is null\n"); + } + + return 0; +} + + +int qnn_instance::unload_backend() { + int dlclose_error = 0; + for (auto &it : _loaded_lib_handle) { + dlclose_error = dlclose(it.second); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); + } + } + + _loaded_lib_handle.clear(); + _lib_path_to_backend_id.clear(); + _loaded_backend.clear(); + + return 0; +} + + +int qnn_instance::load_system() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + std::string system_lib_path = _lib_path + "libQnnSystem.so"; + QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + QNN_LOG_WARN("can not pen QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + return 1; + } + + auto *get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym( + _system_lib_handle, "QnnSystemInterface_getProviders")); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); + return 2; + } + + uint32_t num_providers = 0; + const QnnSystemInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + return 3; + } + + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + QNN_LOG_WARN("can not get providers\n"); + return 5; + } + + QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == + provider_list[idx]->systemApiVersion.major && + QNN_SYSTEM_API_VERSION_MINOR <= + provider_list[idx]->systemApiVersion.minor) { + found_valid_system_interface = true; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } + } + if (!found_valid_system_interface) { + QNN_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn system interface\n"); + } + set_qnn_raw_system_interface(qnn_system_interface); 
+
+    _qnn_interface.set_qnn_system_interface(provider_list[0]);
+
+    _qnn_interface.qnn_system_context_create(&_qnn_system_handle);
+    if (nullptr == _qnn_system_handle) {
+        QNN_LOG_WARN("can not create QNN system context\n");
+    } else {
+        QNN_LOG_DEBUG("initialize qnn system successfully\n");
+    }
+
+    return 0;
+}
+
+
+int qnn_instance::unload_system() {
+    int result = 0;
+
+    if (nullptr == _system_lib_handle) {
+        QNN_LOG_DEBUG("system lib handle is null\n");
+        return 1;
+    }
+
+    if (nullptr != _qnn_system_handle) {
+        result = _qnn_interface.qnn_system_context_free(_qnn_system_handle);
+        if (result != QNN_SUCCESS) {
+            QNN_LOG_WARN("failed to free QNN system context\n");
+        }
+        _qnn_system_handle = nullptr;
+    }
+
+    int dlclose_error = dlclose(_system_lib_handle);
+    if (dlclose_error != 0) {
+        QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror());
+        return 2;
+    }
+
+    _system_lib_handle = nullptr;
+
+    return 0;
+}
+
+
+static void ggml_qnn_logcallback(const char * fmt,
+                                 QnnLog_Level_t level,
+                                 uint64_t timestamp,
+                                 va_list argp) {
+
+    static std::mutex log_mutex;
+    static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN];
+
+    const char * levelStr = "";
+    switch (level) {
+        case QNN_LOG_LEVEL_ERROR:
+            levelStr = " ERROR ";
+            break;
+        case QNN_LOG_LEVEL_WARN:
+            levelStr = "WARNING";
+            break;
+        case QNN_LOG_LEVEL_INFO:
+            levelStr = " INFO  ";
+            break;
+        case QNN_LOG_LEVEL_DEBUG:
+            levelStr = " DEBUG ";
+            break;
+        case QNN_LOG_LEVEL_VERBOSE:
+            levelStr = "VERBOSE";
+            break;
+        case QNN_LOG_LEVEL_MAX:
+            levelStr = "UNKNOWN";
+            break;
+    }
+
+    double ms = (double) timestamp / 1000000.0;
+
+    {
+        std::lock_guard<std::mutex> lock(log_mutex);
+
+        int len_content = 0;
+        memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN);
+        len_content = vsnprintf(reinterpret_cast<char *>(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp);
+        //QNN_LOG_DEBUG("%8.1fms [%-7s] %s ", ms, levelStr, s_ggml_qnn_logbuf);
+    }
+}
+
+
+int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
+    BackendIdType backend_id = QNN_BACKEND_ID_NULL;
+    QNN_LOG_DEBUG("enter qnn_init\n");
+
+    const std::lock_guard<std::mutex> lock(_init_mutex);
+
+    if (0 != load_system()) {
+        QNN_LOG_WARN("can not load QNN system lib\n");
+        return 1;
+    } else {
+        QNN_LOG_DEBUG("load QNN system lib successfully\n");
+    }
+
+    std::string backend_lib_path = _lib_path + _backend_name;
+    if (0 == _lib_path_to_backend_id.count(backend_lib_path)) {
+        int is_load_ok = load_backend(backend_lib_path, saver_config);
+        if (0 != is_load_ok) {
+            QNN_LOG_WARN("failed to load QNN backend\n");
+            return 2;
+        }
+    }
+
+    backend_id = _lib_path_to_backend_id[backend_lib_path];
+    if (0 == _loaded_backend.count(backend_id) ||
+        0 == _loaded_lib_handle.count(backend_id)) {
+        QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, loaded lib_handle count=%zu\n",
+                     backend_lib_path.c_str(),
+                     _loaded_backend.count(backend_id),
+                     _loaded_lib_handle.count(backend_id));
+        return 3;
+    }
+
+    _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]);
+
+#if 1
+    _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle);
+#else
+    _qnn_raw_interface.logCreate(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle);
+#endif
+    if (nullptr == _qnn_log_handle) {
+        QNN_LOG_WARN("failed to initialize qnn log\n"); // the DSP backend does not work on Qualcomm SoC based low-end phones
+        return 4;
+    } else {
+        QNN_LOG_DEBUG("initialize qnn log successfully\n");
+    }
+
+
+    std::vector<const QnnBackend_Config_t *> temp_backend_config;
+    _qnn_interface.qnn_backend_create(_qnn_log_handle,
temp_backend_config.empty() ? nullptr + : temp_backend_config.data(), + &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + QNN_LOG_WARN("why failed to initialize qnn backend\n"); + return 5; + } else { + QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + } + + if (nullptr != _qnn_raw_interface.propertyHasCapability) { + auto qnnStatus = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) { + QNN_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnStatus) { + QNN_LOG_WARN("device property is not known to backend\n"); + } + } + + auto qnnStatus = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); + if (QNN_SUCCESS != qnnStatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnStatus) { + QNN_LOG_WARN("failed to create QNN device\n"); + } else { + QNN_LOG_INFO("create device successfully\n"); + } + + /* + std::vector temp_device_config; + _qnn_interface.qnn_device_create(_qnn_log_handle, temp_device_config.empty() ? nullptr : temp_device_config.data(), &_qnn_device_handle); + if (nullptr == _qnn_device_handle) { + QNN_LOG_WARN("why failed to initialize qnn device\n"); + //return 6; + } + */ + + if (ggml_qnn_profile_level::profile_off != _profile_level) { + QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (ggml_qnn_profile_level::profile_basic == _profile_level) { + QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } else if (ggml_qnn_profile_level::profile_detail == _profile_level) { + QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } + } + + + _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (nullptr == _rpc_lib_handle) { + QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); + return 9; + } else { + QNN_LOG_DEBUG("load rpcmem lib successfully\n"); + set_rpcmem_initialized(true); + } + _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_to_fd")); + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free + || nullptr == _pfn_rpc_mem_to_fd) { + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); + dlclose(_rpc_lib_handle); + return 10; + } + + if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy + _pfn_rpc_mem_init(); + + std::vector temp_context_config; + _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle, + temp_context_config.empty() ? 
nullptr
+                                                             : temp_context_config.data(),
+                                      &_qnn_context_handle);
+    if (nullptr == _qnn_context_handle) {
+        QNN_LOG_WARN("failed to initialize qnn context\n");
+        return 8;
+    } else {
+        QNN_LOG_DEBUG("initialize qnn context successfully\n");
+    }
+
+    QNN_LOG_DEBUG("leave qnn_init\n");
+
+    return 0;
+}
+
+
+// the QNN SDK should release all of its internally allocated resources here
+int qnn_instance::qnn_finalize() {
+    int ret_status = 0;
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+
+    if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC based low-end phone happy
+        _pfn_rpc_mem_deinit();
+
+    if (dlclose(_rpc_lib_handle) != 0) {
+        QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror());
+    } else {
+        QNN_LOG_DEBUG("closed rpcmem lib successfully\n");
+    }
+
+    if (nullptr != _qnn_context_handle) {
+        error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle);
+        if (error != QNN_SUCCESS) {
+            QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n",
+                         _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_context_handle = nullptr;
+    }
+
+    if (nullptr != _qnn_profile_handle) {
+        error = _qnn_interface.qnn_profile_free(_qnn_profile_handle);
+        if (error != QNN_SUCCESS) {
+            QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n",
+                         _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_profile_handle = nullptr;
+    }
+
+    if (nullptr != _qnn_device_handle) {
+        error = _qnn_interface.qnn_device_free(_qnn_device_handle);
+        if (error != QNN_SUCCESS) {
+            QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n",
+                         _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_device_handle = nullptr;
+    }
+
+    if (nullptr != _qnn_backend_handle) {
+        error = _qnn_interface.qnn_backend_free(_qnn_backend_handle);
+        if (error != QNN_SUCCESS) {
+            QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n",
+                         _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_backend_handle = nullptr;
+    }
+
+    if (nullptr != _qnn_log_handle) {
+        error = _qnn_interface.qnn_log_free(_qnn_log_handle);
+        if (error != QNN_SUCCESS) {
+            QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n",
+                         _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error));
+        }
+        _qnn_log_handle = nullptr;
+    }
+
+    unload_backend();
+
+    unload_system();
+
+    return ret_status;
+}
+
+
+int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation,
+                                 const QnnGraph_Config_t ** graph_configs) {
+    int result = 0;
+
+    if (nullptr == graph_name) {
+        QNN_LOG_WARN("graph name is null\n");
+        return 1;
+    }
+
+    if (!_graph_name.empty()) {
+        QNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name);
+        return 2;
+    }
+
+    if (!do_node_validation) {
+        QNN_LOG_WARN("node validation disabled, backend will not perform op validation prior to adding node\n");
+    }
+
+    _graph_name = graph_name;
+    _debug_tensor = debug;
+    _do_node_validations = do_node_validation;
+
+    result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, graph_configs,
+                                            &_qnn_graph_handle);
+    if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) {
+        QNN_LOG_WARN("failed to create graph in qnn context\n");
+        return 3;
+    } else {
+        QNN_LOG_INFO("successfully created graph %s, %p\n", graph_name, _qnn_graph_handle);
+    }
+
+    return 0;
+}
+
+
+int qnn_instance::finalize_qnn_graph() {
+    if (nullptr != _qnn_graph_handle) {
+        if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle,
+                                             _qnn_profile_handle, nullptr) !=
+            QNN_GRAPH_NO_ERROR) {
+            QNN_LOG_WARN("finalizing graph failed\n");
+            //return 1;
+        }
+    } else {
+        QNN_LOG_DEBUG("qnn graph handle is null\n");
+    }
+
+    return 0;
+}
+
+
+
+// =================================================================================================
+//
+// implementation of GGML's QNN backend
+//
+// =================================================================================================
+static bool ggml_qnn_can_handle_op(const struct ggml_tensor * src0, const struct ggml_tensor * src1,
+                                   struct ggml_tensor * dst) {
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+
+    //double check
+    bool supported_op = ((dst->op == GGML_OP_ADD) || (dst->op == GGML_OP_MUL) || (dst->op == GGML_OP_MUL_MAT));
+    if (!supported_op) {
+        QNN_LOG_DEBUG("op %d(%s) is not supported", dst->op, ggml_op_name(dst->op));
+        return false;
+    }
+
+
+    //make QNN SDK happy
+    if (dst->op == GGML_OP_ADD) {
+        return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
+               (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) &&
+               (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16) && ((ne00 > 1 && ne01 > 1 && ne10 > 1 && ne11 > 1)) &&
+               (src0->rank == src1->rank);
+
+    }
+
+    if (dst->op == GGML_OP_MUL_MAT) {
+#if 1 // this log output has a significant performance impact, but it is useful during development
+        QNN_LOG_DEBUG("GGML_OP_MUL_MAT");
+        QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                     src0->name, src0->rank,
+                     src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2],
+                     src0->nb[0], src0->nb[1], src0->nb[2]);
+        QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                     src1->name, src1->rank,
+                     src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2],
+                     src1->nb[0], src1->nb[1], src1->nb[2]);
+        QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                     dst->name, dst->rank,
+                     dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0],
+                     dst->nb[1], dst->nb[2]);
+#endif
+    }
+
+    //make QNN SDK happy
+    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
+           (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) &&
+           (src0->type == src1->type) && (src0->type == dst->type) && ((ne00 > 1 && ne01 > 1 && ne10 > 1 && ne11 > 1));
+
+
+}
+
+
+static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+    bool graph_initialized = false;
+    int64_t n_begin_time = 0LL;
+    int64_t n_end_time = 0LL;
+    int64_t n_durtion = 0LL;
+
+    qnn_instance * instance = nullptr;
+    struct ggml_backend_qnn_context * ctx = nullptr;
+
+    std::string graph_name = "ggml_op_qnn_add";
+    Qnn_GraphHandle_t graph_handle = nullptr;
+    Qnn_Tensor_t * tensor_0 = nullptr;
+    Qnn_Tensor_t * tensor_1 = nullptr;
+    Qnn_Tensor_t * tensor_2 = nullptr;
+
+    Qnn_QuantizeParams_t quantize_param = QNN_QUANTIZE_PARAMS_INIT;
+    Qnn_OpConfig_t qnn_opconfig = QNN_OPCONFIG_INIT;
+    Qnn_Param_t qnn_params[] = {};
+
+    enum ggml_op ggmlop = GGML_OP_ADD;
+    Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32;
+    Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
+    Qnn_DataType_t
dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + + if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("pls check why GGML tensor is null"); + return; + } + tensor_0 = (Qnn_Tensor_t *)src0->extra; + tensor_1 = (Qnn_Tensor_t *)src1->extra; + tensor_2 = (Qnn_Tensor_t *)dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("pls check why QNN tensor is null"); + return; + } + ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; + if (nullptr == ctx) { + QNN_LOG_WARN("pls check why backend ctx is null"); + return; + } + instance = ctx->instance; + if (nullptr == instance) { + QNN_LOG_WARN("pls check why qnn instance is null"); + return; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + n_begin_time = ggml_time_us(); +#if 0 //it works fine with whisper.cpp and llama.cpp. comment them because focus on mulmat in llama.cpp inference since 04-23-2024 + QNN_LOG_DEBUG("call %s\n", __func__); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->rank, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->rank, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->rank, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); +#endif + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + if (!graph_initialized) { + graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + //QnnGraph_Config_t graph_config; + //graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + //graph_config.customConfig = strdup(graph_name.c_str()); + //const QnnGraph_Config_t * p_graph_config = 
&graph_config; + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t opconfig = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + "ggml_op_add", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_ELEMENT_WISE_ADD, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, opconfig); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + //comment them because focus on mulmat in llama.cpp inference since 04-23-2024 + //QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + 
+                *tensor_1
+        };
+        Qnn_Tensor_t tensor_outputs[] = {
+                *tensor_2
+        };
+        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr);
+        if (QNN_SUCCESS != error) {
+            QNN_LOG_INFO("error = %d\n", error);
+        }
+    }
+    n_end_time = ggml_time_us();
+    n_durtion = (n_end_time - n_begin_time) / 1000;
+    //comment them because focus on mulmat in llama.cpp inference since 04-23-2024
+    //QNN_LOG_DEBUG("duration of ggml_qnn_add : %lld milliseconds\n", n_durtion);
+    //QNN_LOG_DEBUG("call %s done\n", __func__);
+}
+
+
+
+/*
+ * ggml_qnn_mul_mat was re-added as a standalone function because of the
+ * following comments from https://github.com/ggerganov/llama.cpp/pull/1632:
+ *   MUL_MAT takes most of the compute time (about 95%). So to speed up llama, we have to focus on MUL_MAT.
+ *   We have three kinds of MUL_MAT to compute:
+ *   mul_mat_f32: both src0 and src1 are F32.
+ *   mul_mat_f16_f32: src0 is F16 and src1 is F32.
+ *   mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32.
+*/
+
+static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+    bool graph_initialized = false;
+    int64_t n_begin_time = 0LL;
+    int64_t n_end_time = 0LL;
+    int64_t n_durtion = 0LL;
+
+    qnn_instance * instance = nullptr;
+    struct ggml_backend_qnn_context * ctx = nullptr;
+
+    std::string graph_name = "ggml_op_qnn_mul_mat";
+    Qnn_GraphHandle_t graph_handle = nullptr;
+    Qnn_Tensor_t * tensor_0 = nullptr;
+    Qnn_Tensor_t * tensor_1 = nullptr;
+    Qnn_Tensor_t * tensor_2 = nullptr;
+
+    Qnn_QuantizeParams_t quantize_param = QNN_QUANTIZE_PARAMS_INIT;
+    Qnn_OpConfig_t qnn_opconfig = QNN_OPCONFIG_INIT;
+    Qnn_Param_t qnn_params[] = {};
+
+    enum ggml_op ggmlop = GGML_OP_MUL_MAT;
+    Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32;
+    Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
+    Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32;
+
+
+    if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) {
+        QNN_LOG_WARN("pls check why GGML tensor is null");
+        return;
+    }
+    tensor_0 = (Qnn_Tensor_t *)src0->extra;
+    tensor_1 = (Qnn_Tensor_t *)src1->extra;
+    tensor_2 = (Qnn_Tensor_t *)dst->extra;
+    if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) {
+        QNN_LOG_WARN("pls check why QNN tensor is null");
+        return;
+    }
+    ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context;
+    if (nullptr == ctx) {
+        QNN_LOG_WARN("pls check why backend ctx is null");
+        return;
+    }
+    instance = ctx->instance;
+    if (nullptr == instance) {
+        QNN_LOG_WARN("pls check why qnn instance is null");
+        return;
+    }
+    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
+
+    n_begin_time = ggml_time_us();
+    QNN_LOG_DEBUG("call %s\n", __func__);
+    QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                 src0->name, src0->rank,
+                 src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2],
+                 src0->nb[0], src0->nb[1], src0->nb[2]);
+    QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                 src1->name, src1->rank,
+                 src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2],
+                 src1->nb[0], src1->nb[1], src1->nb[2]);
+    QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                 dst->name, dst->rank,
+                 dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1],
dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + if (!graph_initialized) { + graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t opconfig = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + "ggml_op_mul_mat", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, opconfig); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + auto & graph_item = instance->_qnn_graph_map[map_entry]; + 
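+        // cache-hit path: the graph was already finalized, so only the per-call state is
+        // rebound below -- dimensions, rank, data type and clientBuf of the three wrapper
+        // tensors are refreshed so the cached graph reads/writes the current ggml buffers.
+        // note: the cache key is just ggml_op_name(ggmlop), so every MUL_MAT node shares
+        // one cached graph here regardless of tensor shape.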
graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + } + n_end_time = ggml_time_us(); + n_durtion = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ggml_qnn_mul_mat : %lld milliseconds\n", n_durtion); + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +//common function for GGML OPs using QNN API +static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_durtion = 0LL; + + qnn_instance * instance = nullptr; + struct ggml_backend_qnn_context * ctx = nullptr; + + std::string qnn_graph_name = "ggml_qnn_graph"; + std::string qnn_opconfig_name = "ggml_qnn_opconfig"; + const char * qnn_op_name = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + + Qnn_QuantizeParams_t quantize_param = QNN_QUANTIZE_PARAMS_INIT; + Qnn_OpConfig_t qnn_opconfig = QNN_OPCONFIG_INIT; + Qnn_Param_t qnn_params[] = {}; + + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + + if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("pls check why GGML tensor is null"); + return; + } + tensor_0 = (Qnn_Tensor_t *)src0->extra; + tensor_1 = (Qnn_Tensor_t *)src1->extra; + tensor_2 = (Qnn_Tensor_t *)dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("pls check why QNN tensor is null"); + return; + } + ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; + if (nullptr == ctx) { + QNN_LOG_WARN("pls check why backend ctx is null"); + return; + } + 
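+    // this generic path is the same as ggml_qnn_add/ggml_qnn_mul_mat above except that the
+    // QNN op type comes from qnn_opname_from_ggmlop() (defined earlier in this file); an
+    // illustrative sketch of the mapping it is expected to perform, assuming the standard
+    // QNN op-package names:
+    //
+    //   case GGML_OP_ADD:     return QNN_OP_ELEMENT_WISE_ADD;
+    //   case GGML_OP_MUL:     return QNN_OP_ELEMENT_WISE_MULTIPLY;
+    //   case GGML_OP_MUL_MAT: return QNN_OP_MAT_MUL;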
instance = ctx->instance; + if (nullptr == instance) { + QNN_LOG_WARN("pls check why qnn instance is null"); + return; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + qnn_op_name = qnn_opname_from_ggmlop(ggmlop); + if (nullptr == qnn_op_name) { + QNN_LOG_WARN("pls check why can not get QNN OP name with ggml op %d(%s)", ggmlop, ggml_op_name(ggmlop)); + return; + } + + n_begin_time = ggml_time_us(); + QNN_LOG_DEBUG("call %s\n", __func__); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->rank, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->rank, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->rank, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + if (!graph_initialized) { + qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + qnn_opconfig_name = qnn_opconfig_name + "_" + ggml_op_name(ggmlop) + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + QNN_LOG_DEBUG("qnn graph name %s", qnn_graph_name.c_str()); + QNN_LOG_DEBUG("qnn opconfig name %s", qnn_opconfig_name.c_str()); + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph name %s, error = %d\n", ggml_op_name(ggmlop), qnn_graph_name.c_str(), error); + return; + } + + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = 
qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t opconfig = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + qnn_opconfig_name.c_str(), + QNN_OP_PACKAGE_NAME_QTI_AISW, + qnn_op_name, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, opconfig); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + } + n_end_time = ggml_time_us(); + n_durtion = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ggml_qnn_%s : %lld milliseconds\n", ggml_op_name(ggmlop), n_durtion); + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_repeat(const ggml_tensor * 
src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + + + + +static void ggml_qnn_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_hardswish(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_rms_norm(const ggml_tensor * src0, const 
ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_qnn_cpy(src0, dst, nullptr); + (void) src1; +} + + +static void ggml_qnn_mul_mat_id(const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); + +} + + +static void ggml_qnn_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + (void) src0; + (void) src1; + (void) dst; + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { + ggml_qnn_func_t func = nullptr; + ggml_qnn_func_common_t func_common = nullptr; + + bool supported_op = false; + + bool use_hwaccel = false; + + //begin sanity check + if (nullptr == g_qnn_backend) { + QNN_LOG_ERROR("pls check why qnn subsystem not initialized"); + return false; + } + + //this is special scenario for UT function qnn_ggml_op + //borrow some advantages from PyTorch:the user or the upper layer codes could specify 
whether a GGML OP (such as add/mul/mulmat) is accelerated by a specific backend;
+    //otherwise ggml-qnn.cpp doesn't know whether the current caller is whisper.cpp or another scenario (for example, a JNI function...)
+
+    //all in all, use_hwaccel is different from supported_op
+    //this feature depends heavily on the PR in upstream whisper.cpp: https://github.com/ggerganov/whisper.cpp/pull/2073
+    use_hwaccel = (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU);
+
+    supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT));
+    //supported_op = (tensor->op == GGML_OP_ADD); //works very well with whisper.cpp (asr result is correct)
+
+    if ((!use_hwaccel) && (!supported_op)) {
+        //TODO: should be removed because this is a workaround method during development stage
+        ggml_compute_forward(params, tensor);
+        return false;
+    }
+
+    if ((!use_hwaccel) && (!ggml_qnn_can_handle_op(tensor->src[0], tensor->src[1], tensor))) {
+        //TODO: should be removed because this is a workaround method during development stage
+        ggml_compute_forward(params, tensor);
+        return false;
+    }
+    //end sanity check
+
+    switch (tensor->op) {
+        case GGML_OP_ADD:
+            func = ggml_qnn_add;
+            //func_common = ggml_qnn_hanlde_op;
+            break;
+
+        case GGML_OP_MUL:
+            func_common = ggml_qnn_hanlde_op;
+            break;
+
+        case GGML_OP_MUL_MAT:
+            func = ggml_qnn_mul_mat;
+            //func_common = ggml_qnn_hanlde_op;
+            break;
+
+        case GGML_OP_REPEAT:
+            func = ggml_qnn_repeat;
+            break;
+        case GGML_OP_GET_ROWS:
+            func = ggml_qnn_get_rows;
+            break;
+        case GGML_OP_DUP:
+            func = ggml_qnn_dup;
+            break;
+
+        case GGML_OP_ACC:
+            func = ggml_qnn_acc;
+            break;
+
+        case GGML_OP_DIV:
+            func = ggml_qnn_div;
+            break;
+
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(tensor)) {
+                case GGML_UNARY_OP_GELU:
+                    func = ggml_qnn_gelu;
+                    break;
+                case GGML_UNARY_OP_SILU:
+                    func = ggml_qnn_silu;
+                    break;
+                case GGML_UNARY_OP_GELU_QUICK:
+                    func = ggml_qnn_gelu_quick;
+                    break;
+                case GGML_UNARY_OP_TANH:
+                    func = ggml_qnn_tanh;
+                    break;
+                case GGML_UNARY_OP_RELU:
+                    func = ggml_qnn_relu;
+                    break;
+                case GGML_UNARY_OP_HARDSIGMOID:
+                    func = ggml_qnn_hardsigmoid;
+                    break;
+                case GGML_UNARY_OP_HARDSWISH:
+                    func = ggml_qnn_hardswish;
+                    break;
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_NORM:
+            func = ggml_qnn_norm;
+            break;
+        case GGML_OP_GROUP_NORM:
+            func = ggml_qnn_group_norm;
+            break;
+        case GGML_OP_CONCAT:
+            func = ggml_qnn_concat;
+            break;
+        case GGML_OP_UPSCALE:
+            func = ggml_qnn_upscale;
+            break;
+        case GGML_OP_PAD:
+            func = ggml_qnn_pad;
+            break;
+        case GGML_OP_LEAKY_RELU:
+            func = ggml_qnn_leaky_relu;
+            break;
+        case GGML_OP_RMS_NORM:
+            func = ggml_qnn_rms_norm;
+            break;
+
+        case GGML_OP_MUL_MAT_ID:
+            func = ggml_qnn_mul_mat_id;
+            break;
+        case GGML_OP_SCALE:
+            func = ggml_qnn_scale;
+            break;
+        case GGML_OP_SQR:
+            func = ggml_qnn_sqr;
+            break;
+        case GGML_OP_CLAMP:
+            func = ggml_qnn_clamp;
+            break;
+        case GGML_OP_CPY:
+            func = ggml_qnn_cpy;
+            break;
+        case GGML_OP_CONT:
+            func = ggml_qnn_dup;
+            break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            func = ggml_qnn_nop;
+            break;
+        case GGML_OP_DIAG_MASK_INF:
+            func = ggml_qnn_diag_mask_inf;
+            break;
+        case GGML_OP_SOFT_MAX:
+            func = ggml_qnn_soft_max;
+            break;
+        case GGML_OP_ROPE:
+            func = ggml_qnn_rope;
+            break;
+        case GGML_OP_ALIBI:
+            func = ggml_qnn_alibi;
+            break;
+        case GGML_OP_IM2COL:
+            func = ggml_qnn_im2col;
+            break;
+        case GGML_OP_POOL_2D:
+            func = ggml_qnn_pool2d;
+            break;
+        case GGML_OP_SUM_ROWS:
+            func =
ggml_qnn_sum_rows; + break; + case GGML_OP_ARGSORT: + func = ggml_qnn_argsort; + break; + default: + return false; + } + + + //ok, real show time in Qualcomm's QNN internal + if (nullptr != func) + func(tensor->src[0], tensor->src[1], tensor); + if (nullptr != func_common) + func_common(tensor->op, tensor->src[0], tensor->src[1], tensor); + + return true; +} + + +struct ggml_backend_qnn_buffer_context { + ~ggml_backend_qnn_buffer_context() { + if (buffer) { + free(buffer); + } + for (auto * sub_buffer : sub_buffers) { + free(sub_buffer); + } + + for (auto * qnn_tensor : qnn_tensors) { + free_qnn_tensor(*qnn_tensor); + free(qnn_tensor); + } + + std::map>::iterator graph_it; + struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *) g_qnn_backend->context; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->instance->get_qnn_raw_interface(); + for (graph_it = backend_ctx->instance->_qnn_graph_map.begin(); graph_it != backend_ctx->instance->_qnn_graph_map.end(); graph_it++) { + auto & graph_item = graph_it->second; + Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); + QNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str()); + } + backend_ctx->instance->_qnn_graph_map.clear(); + + sub_buffers.clear(); + qnn_tensors.clear(); + } + void * buffer = nullptr; + + struct ggml_backend_qnn_context * backend_ctx = nullptr; + + size_t buffer_size = 0; + std::vector sub_buffers; + std::vector qnn_tensors; +}; + +static const char * ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { + GGML_UNUSED(buffer); + return "QNN"; +} + + +GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_qnn_buffer_get_name; +} + + +static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + delete ctx; +} + + +//TODO:not used +static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + + return ctx->buffer; +} + + +static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + + /* + if (tensor->view_src != nullptr && tensor->view_offs == 0) { + assert(tensor->view_src->buffer->buft == buffer->buft); + tensor->backend = tensor->view_src->backend; + tensor->extra = tensor->view_src->extra; + return; + } + */ + + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + //TODO:only support FP32 & FP16 + Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + + + if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; + } + Qnn_Tensor_t qnn_tensor = { + .version= QNN_TENSOR_VERSION_1, + {.v1= { + .id=0, + .name= tensor->name, + .type= qnn_tensor_type, + .dataFormat= QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType= qnn_data_type, + .quantizeParams= {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding= {.scale= 0.0000000000000000f, .offset= 0}}}, + .rank= ggml_get_tensor_rank(tensor), + 
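+            // shape mirrors the ggml tensor: `dimensions` is the uint32_t copy of
+            // tensor->ne[] built above and rank comes from ggml_get_tensor_rank();
+            // clientBuf is left empty at init time -- it is bound to the actual ggml
+            // data pointer later, right before graph execution.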
.dimensions=dimensions, + .memType= QNN_TENSORMEMTYPE_RAW, + {.clientBuf= {.data=nullptr, + .dataSize=0}}}} + }; + Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)malloc(sizeof(Qnn_Tensor_t)); + if (nullptr == p_qnn_tensor) { + QNN_LOG_WARN("init tensor failed"); + return; + } + Qnn_Tensor_t tensor_copy; + error = deep_copy_qnn_tensors(qnn_tensor, *p_qnn_tensor); + if (error != QNN_SUCCESS) { + free(p_qnn_tensor); + QNN_LOG_DEBUG("init tensor failed"); + return; + } + tensor->extra = p_qnn_tensor; + ctx->qnn_tensors.push_back(p_qnn_tensor); + + if (ggml_is_quantized(tensor->type)) { + //TODO + QNN_LOG_DEBUG("is quantized"); + } +} + + +static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); +} + + +static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memcpy(data, (const char *)tensor->data + offset, size); +} + + +static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + GGML_UNUSED(buffer); + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + + return false; +} + + +static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + + memset(ctx->buffer, value, ctx->buffer_size); +} + + + +static void ggml_backend_qnn_buffer_reset(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + for (auto * sub_buffer : ctx->sub_buffers) { + free(sub_buffer); + } + ctx->sub_buffers.clear(); +} + + +static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { + /* .get_name = */ ggml_backend_qnn_buffer_get_name, + /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, + /* .get_base = */ ggml_backend_qnn_buffer_get_base, + /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, + /* .clear = */ ggml_backend_qnn_buffer_clear, + /* .reset = */ nullptr, +}; + + +static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { + return "QNN"; +} + + +static void * ggml_qnn_host_malloc(size_t n) { + void * data = nullptr; + const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); + if (result != 0) { + QNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); + return nullptr; + } + + return data; +} + + +static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context; + + const size_t size_page = sysconf(_SC_PAGESIZE); + + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + + //TODO:use pre-allocated buffer in internal memory pool + ctx->buffer = ggml_qnn_host_malloc(size_aligned); + ctx->buffer_size = size_aligned; + + ctx->backend_ctx = &g_qnn_mgr[g_current_device]; + + if (nullptr == ctx->buffer) { + QNN_LOG_WARN("%s: failed to allocate %.2f 
MiB\n", __func__, size / (1 << 20)); + return nullptr; + } + + return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); +} + + +static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return 32; +} + + +//TODO: this value is an experimental value +static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + + return (38 * 1024 * 1024); +} + + +static bool ggml_backend_qnn_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, + ggml_backend_t backend) { + GGML_UNUSED(buft); + + return ggml_backend_is_qnn(backend) || ggml_backend_is_cpu(backend); +} + + +// attention here because Qualcomm's QNN SDK is a highly well-designed SDK +// +// refer to https://developer.qualcomm.com/sites/default/files/attachments/qnn_software_stack.png +// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html +static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return true; +} + +static ggml_backend_buffer_type_i ggml_backend_qnn_buffer_type_interface = { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr, + /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_qnn_buffer_is_host +}; + + +static const char * ggml_backend_qnn_name(ggml_backend_t backend) { + return "QNN"; +} + + +static void ggml_backend_qnn_free(ggml_backend_t backend) { + QNN_LOG_INFO("enter %s", __func__ ); + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + QNN_LOG_DEBUG("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); + + qnn_instance * instance = (qnn_instance*)g_qnn_mgr[ctx->device].instance; + if (instance != nullptr) { + instance->qnn_finalize(); + delete instance; + g_qnn_mgr[ctx->device].instance = nullptr; + } + + qnn_buf_t * buffer_pool = (qnn_buf_t*)g_qnn_mgr[ctx->device].buffer_pool; + if (buffer_pool != nullptr) { + buffer_pool->destroy(buffer_pool); + g_qnn_mgr[ctx->device].buffer_pool = nullptr; + } + + if (g_qnn_mgr[ctx->device].backend != nullptr) { + delete backend; + g_qnn_backend = nullptr; + g_qnn_mgr[ctx->device].backend = nullptr; + } + QNN_LOG_INFO("leave %s", __func__ ); +} + + +static ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_backend_t backend) { + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + + return ggml_backend_qnn_buffer_type(ctx->device); +} + + +#if 0 +static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) { + GGML_UNUSED(backend); + + switch (op->op) { + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_SILU: + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_HARDSIGMOID: + case GGML_UNARY_OP_HARDSWISH: + case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_TANH: + return true; + default: + return false; + } + break; + case GGML_OP_MUL_MAT: + case GGML_OP_MUL_MAT_ID: { + struct ggml_tensor *a; + struct ggml_tensor *b; + if (op->op == GGML_OP_MUL_MAT) { + a = op->src[0]; + b = op->src[1]; + } else { + a = op->src[2]; + b = op->src[1]; + } + if (a->ne[3] != b->ne[3]) { + return false; + } + ggml_type a_type = 
a->type; + if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ2_S || + a_type == GGML_TYPE_IQ4_XS) { + return false; + } + return true; + } + break; + case GGML_OP_GET_ROWS: { + switch (op->src[0]->type) { + case GGML_TYPE_F16: + case GGML_TYPE_F32: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } + } + break; + case GGML_OP_CPY: { + ggml_type src0_type = op->src[0]->type; + ggml_type src1_type = op->src[1]->type; + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) { + return true; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return true; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return true; + } + return false; + } + break; + case GGML_OP_CONCAT: { + ggml_type src0_type = op->src[0]->type; + return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; + } + break; + case GGML_OP_DUP: + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_REPEAT: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_NORM: + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_RMS_NORM: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_CLAMP: + case GGML_OP_CONT: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_ROPE: + case GGML_OP_ALIBI: + case GGML_OP_IM2COL: + case GGML_OP_POOL_2D: + case GGML_OP_SUM_ROWS: + case GGML_OP_ARGSORT: + case GGML_OP_ACC: + case GGML_OP_GROUP_NORM: + case GGML_OP_UPSCALE: + case GGML_OP_PAD: + case GGML_OP_LEAKY_RELU: + return true; + default: + return false; + } +} +# else +static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) { + GGML_UNUSED(backend); + + switch (op->op) { + case GGML_OP_MUL_MAT: + return true; + default: + return false; + } +} +#endif + + +static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + enum ggml_status result = GGML_STATUS_SUCCESS; + int node_n = -1; + int task_phase = GGML_TASK_TYPE_FINALIZE; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + + struct ggml_cplan plan = ggml_graph_plan(cgraph, 1); + + buf_element_t * qnn_buf = nullptr; + + if (plan.work_size > 0) { + //plan.work_data = static_cast<uint8_t *>(malloc(plan.work_size)); + plan.work_data = static_cast<uint8_t *>(ctx->buffer_pool->buffer_pool_base); + if (plan.work_data == nullptr) { + QNN_LOG_ERROR("malloc failed"); + return GGML_STATUS_FAILED; + } + } + struct ggml_cplan * cplan = &plan; + GGML_ASSERT(cplan->n_threads > 0); + if (cplan->work_size > 0) { + GGML_ASSERT(cplan->work_data); + } + + while (true) { + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + result = GGML_STATUS_ABORTED; + break; + } + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_TYPE_FINALIZE, + /*.ith =*/ 0, + /*.nth =*/ 0, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; + + if (node_n != -1) { + /* FINALIZE */ + struct ggml_tensor * node = cgraph->nodes[node_n]; + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.nth = 1; + 
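//params.type is still GGML_TASK_TYPE_FINALIZE at this point, so this call runs the + //pending finalize pass of the previous node before the next node is dispatched + 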
ggml_qnn_compute_forward(&params, node); + } + } + + while (++node_n < cgraph->n_nodes) { + struct ggml_tensor * node = cgraph->nodes[node_n]; + params.nth = 1; + if (GGML_OP_HAS_INIT[node->op]) { + params.type = GGML_TASK_TYPE_INIT; + ggml_qnn_compute_forward(&params, node); + } + params.type = GGML_TASK_TYPE_COMPUTE; + ggml_qnn_compute_forward(&params, node); + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.type = GGML_TASK_TYPE_FINALIZE; + ggml_qnn_compute_forward(&params, node); + } + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + result = GGML_STATUS_ABORTED; + break; + } + } + task_phase = GGML_TASK_TYPE_INIT; + if (node_n >= cgraph->n_nodes) { + //QNN_LOG_INFO("node_n %d", node_n); + //QNN_LOG_INFO("cgraph->n_nodes %d", cgraph->n_nodes); + break; + } + } + + //free(plan.work_data); + + return result; +} + + +struct ggml_compute_state_shared { + const struct ggml_cgraph * cgraph; + const struct ggml_cplan * cplan; + + int64_t perf_node_start_cycles; + int64_t perf_node_start_time_us; + + const int n_threads; + + // synchronization primitives + atomic_int n_active; // num active threads + atomic_int node_n; // active graph node + atomic_int node_task; // active graph node task phase + + ggml_abort_callback abort_callback; // abort ggml_graph_compute when true + void * abort_callback_data; +}; + +struct ggml_compute_state { + pthread_t thrd; + int ith; + struct ggml_compute_state_shared * shared; + enum ggml_status ec; +}; + + +#ifdef GGML_PERF +#define ggml_perf_time_ms() ggml_time_ms() +#define ggml_perf_time_us() ggml_time_us() +#define ggml_perf_cycles() ggml_cycles() +#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms() +#else +#define ggml_perf_time_ms() 0 +#define ggml_perf_time_us() 0 +#define ggml_perf_cycles() 0 +#define ggml_perf_cycles_per_ms() 0 +#endif +#undef MIN +#undef MAX + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) + + +static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) { + int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles; + int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us; + + node->perf_runs++; + node->perf_cycles += cycles_cur; + node->perf_time_us += time_us_cur; +} + + +static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) { + // wait for other threads to finish + const int last_node_n = * node_n; + + while (true) { + if (do_yield) { + sched_yield(); + } + + * node_n = atomic_load(&state->shared->node_n); + if (* node_n != last_node_n) break; + } +} + + +static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) { + // wait for other threads to finish + const int last_task_phase = * task_phase; + + while (true) { + if (do_yield) { + sched_yield(); + } + + * task_phase = atomic_load(&state->shared->node_task); + if (* task_phase != last_task_phase) break; + } +} + + +static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) { + int n_tasks = 0; + + if (ggml_is_empty(node)) { + // no need to multi-thread a no-op + n_tasks = 1; + return n_tasks; + } + + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: + case GGML_OP_ADD: + case GGML_OP_ADD1: + case GGML_OP_ACC: { + n_tasks = n_threads; + } + break; + case GGML_OP_SUB: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + case GGML_OP_LEAKY_RELU: { + n_tasks = 1; + } + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(node)) { + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SGN: + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_STEP: + case GGML_UNARY_OP_TANH: + case GGML_UNARY_OP_ELU: + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_HARDSWISH: + case GGML_UNARY_OP_HARDSIGMOID: { + n_tasks = 1; + } + break; + + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_SILU: { + n_tasks = n_threads; + } + break; + default: + GGML_ASSERT(false); + } + break; + case GGML_OP_SILU_BACK: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: + case GGML_OP_GROUP_NORM: + case GGML_OP_CONCAT: { + n_tasks = n_threads; + } + break; + case GGML_OP_MUL_MAT: { + n_tasks = n_threads; + } + break; + case GGML_OP_MUL_MAT_ID: { + n_tasks = n_threads; + } + break; + case GGML_OP_OUT_PROD: { + n_tasks = n_threads; + } + break; + case GGML_OP_GET_ROWS: { + n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1])); + } + break; + case GGML_OP_SCALE: + case GGML_OP_SET: + case GGML_OP_CONT: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: { + n_tasks = 1; + } + break; + case GGML_OP_DIAG_MASK_ZERO: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + case GGML_OP_ADD_REL_POS: { + n_tasks = n_threads; + } + break; + case GGML_OP_ALIBI: { + n_tasks = 1; + } + break; + case GGML_OP_CLAMP: { + n_tasks = 1; + } + break; + case GGML_OP_SOFT_MAX: { + n_tasks = MIN(n_threads, ggml_nrows(node->src[0])); + } + break; + case GGML_OP_CONV_TRANSPOSE_1D: { + n_tasks = n_threads; + } + break; + case GGML_OP_IM2COL: { + 
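//im2col is data-parallel in ggml, so it gets the full thread count like the other heavy ops + 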
n_tasks = n_threads; + } + break; + case GGML_OP_CONV_TRANSPOSE_2D: { + n_tasks = n_threads; + } + break; + case GGML_OP_POOL_1D: + case GGML_OP_POOL_2D: { + n_tasks = 1; + } + break; + case GGML_OP_UPSCALE: { + n_tasks = n_threads; + } + break; + case GGML_OP_PAD: { + n_tasks = n_threads; + } + break; + case GGML_OP_ARANGE: { + n_tasks = n_threads; + } + break; + case GGML_OP_TIMESTEP_EMBEDDING: { + n_tasks = n_threads; + } + break; + case GGML_OP_ARGSORT: { + n_tasks = n_threads; + } + break; + case GGML_OP_FLASH_ATTN: { + n_tasks = n_threads; + } + break; + case GGML_OP_FLASH_FF: { + n_tasks = n_threads; + } + break; + case GGML_OP_FLASH_ATTN_BACK: { + n_tasks = n_threads; + } + break; + case GGML_OP_SSM_CONV: + case GGML_OP_SSM_SCAN: { + n_tasks = n_threads; + } + break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: + case GGML_OP_GET_REL_POS: + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1_F32: + case GGML_OP_MAP_CUSTOM2_F32: + case GGML_OP_MAP_CUSTOM3_F32: { + n_tasks = 1; + } + break; + case GGML_OP_MAP_CUSTOM1: { + QNN_LOG_ERROR("not supported"); + } + break; + case GGML_OP_MAP_CUSTOM2: { + QNN_LOG_ERROR("not supported"); + } + break; + case GGML_OP_MAP_CUSTOM3: { + QNN_LOG_ERROR("not supported"); + } + break; + case GGML_OP_CROSS_ENTROPY_LOSS: { + n_tasks = n_threads; + } + break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { + n_tasks = n_threads; + } + break; + case GGML_OP_NONE: { + n_tasks = 1; + } + break; + case GGML_OP_COUNT: { + GGML_ASSERT(false); + } + break; + default: { + QNN_LOG_WARN("%s: op not implemented: ", __func__); + if (node->op < GGML_OP_COUNT) { + QNN_LOG_DEBUG("%s\n", ggml_op_name(node->op)); + } else { + QNN_LOG_DEBUG("%d\n", node->op); + } + GGML_ASSERT(false); + } + break; + } + + assert(n_tasks > 0); + + return n_tasks; +} + + +static void * ggml_graph_compute_thread(void * data) { + struct ggml_compute_state * state = (struct ggml_compute_state *) data; + + const struct ggml_cgraph * cgraph = state->shared->cgraph; + const struct ggml_cplan * cplan = state->shared->cplan; + + const int n_threads = state->shared->n_threads; + + int node_n = -1; + int task_phase = GGML_TASK_TYPE_FINALIZE; + + while (true) { + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + state->shared->node_n += 1; + state->ec = GGML_STATUS_ABORTED; + return 0; + } + + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + // all other threads are finished and spinning + // do finalize and init here so we don't have to synchronize again + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_TYPE_FINALIZE, + /*.ith =*/ 0, + /*.nth =*/ 0, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; + + if (node_n != -1) { + /* FINALIZE */ + struct ggml_tensor * node = cgraph->nodes[node_n]; + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); + ggml_qnn_compute_forward(&params, node); + } + ggml_graph_compute_perf_stats_node(node, state->shared); + } + + // distribute new work or execute it directly if 1T + while (++node_n < cgraph->n_nodes) { + //QNN_LOG_INFO("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); + struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); + + state->shared->perf_node_start_cycles = ggml_perf_cycles(); + state->shared->perf_node_start_time_us = ggml_perf_time_us(); + + params.nth = n_tasks; + + if (n_tasks == 1) { + /* INIT */ + if 
(GGML_OP_HAS_INIT[node->op]) { + params.type = GGML_TASK_TYPE_INIT; + ggml_qnn_compute_forward(&params, node); + } + + // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, + // they do something more efficient than spinning (?) + params.type = GGML_TASK_TYPE_COMPUTE; + ggml_qnn_compute_forward(&params, node); + + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.type = GGML_TASK_TYPE_FINALIZE; + ggml_qnn_compute_forward(&params, node); + } + + ggml_graph_compute_perf_stats_node(node, state->shared); + } else { + break; + } + + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + break; + } + } + + task_phase = GGML_TASK_TYPE_INIT; + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_n, node_n); + atomic_store(&state->shared->node_task, task_phase); + } else { + ggml_graph_compute_thread_sync_node(&node_n, state, false); + ggml_graph_compute_thread_sync_task(&task_phase, state, false); + } + + // check if we should stop + if (node_n >= cgraph->n_nodes) break; + + /* INIT & COMPUTE */ + struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); + + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_TYPE_INIT, + /*.ith =*/ state->ith, + /*.nth =*/ n_tasks, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; + + if (state->ith < n_tasks) { + if (GGML_OP_HAS_INIT[node->op]) { + ggml_qnn_compute_forward(&params, node); + } + } + + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + task_phase = GGML_TASK_TYPE_COMPUTE; + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_task, task_phase); + } + else { + const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT; + ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield); + } + + if (state->ith < n_tasks) { + params.type = GGML_TASK_TYPE_COMPUTE; + ggml_qnn_compute_forward(&params, node); + } + + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + task_phase = GGML_TASK_TYPE_FINALIZE; + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_task, task_phase); + } + else { + ggml_graph_compute_thread_sync_task(&task_phase, state, false); + } + } + + return 0; +} + + +static ggml_status ggml_backend_qnn_graph_compute_multithread(ggml_backend_t backend, ggml_cgraph * cgraph) { + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + + int num_threads = ctx->threads; + + if (QNN_GPU == ctx->device || QNN_HTP == ctx->device) { + //TODO: multithreading is not supported with the QNN GPU/HTP(aka DSP) backend + num_threads = 1; + } + struct ggml_cplan plan = ggml_graph_plan(cgraph, num_threads); + + + if (plan.work_size > 0) { + //QNN_LOG_INFO("work size %d(%d MB)", plan.work_size, plan.work_size / (1 << 20)); + plan.work_data = static_cast<uint8_t *>(malloc(plan.work_size)); + if (plan.work_data == nullptr) { + QNN_LOG_ERROR("malloc failed"); + return GGML_STATUS_FAILED; + } + } + + struct ggml_cplan * cplan = &plan; + GGML_ASSERT(cplan->n_threads > 0); + if (cplan->work_size > 0) { + GGML_ASSERT(cplan->work_data); + } + + //QNN_LOG_DEBUG("cgraph %p, cplan %p, work size %d, work data %p", cgraph, cplan, cplan->work_size, cplan->work_data); + const int n_threads = cplan->n_threads; + + struct ggml_compute_state_shared state_shared = { + /*.cgraph =*/ cgraph, + /*.cplan =*/ cplan, + /*.perf_node_start_cycles =*/ 0, + /*.perf_node_start_time_us =*/ 0, + 
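/* the remaining fields mirror ggml's internal ggml_compute_state_shared layout */ + 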
/*.n_threads =*/ n_threads, + /*.n_active =*/ n_threads, + /*.node_n =*/ -1, + /*.node_task =*/ GGML_TASK_TYPE_FINALIZE, + /*.abort_callback =*/ nullptr, + /*.abort_callback_data =*/ nullptr, + }; + struct ggml_compute_state * workers = (struct ggml_compute_state*)alloca(sizeof(struct ggml_compute_state) * n_threads); + if (nullptr == workers) { + QNN_LOG_ERROR("malloc failed"); + if (plan.work_data != nullptr) { + free(plan.work_data); + } + return GGML_STATUS_FAILED; + } + + // create thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; ++j) { + workers[j] = (struct ggml_compute_state) { + .thrd = 0, + .ith = j, + .shared = &state_shared, + .ec = GGML_STATUS_SUCCESS, + }; + + const int rc = pthread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); + GGML_ASSERT(rc == 0); + } + } + + workers[0].ith = 0; + workers[0].shared = &state_shared; + workers[0].ec = GGML_STATUS_SUCCESS; + + // this is a work thread too + ggml_graph_compute_thread(&workers[0]); + enum ggml_status compute_status = workers[0].ec; + + // join or kill thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; j++) { + const int rc = pthread_join(workers[j].thrd, NULL); + GGML_ASSERT(rc == 0); + if (workers[j].ec != GGML_STATUS_SUCCESS) + compute_status = workers[j].ec; + } + } + + if (plan.work_data != nullptr) { + free(plan.work_data); + } + + return compute_status; +} + + +static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * op) { + GGML_UNUSED(backend); + + const int min_batch_size = 32; + + return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS; +} + + +static ggml_backend_i ggml_backend_qnn_interface = { + /* .get_name = */ ggml_backend_qnn_name, + /* .free = */ ggml_backend_qnn_free, + /* .get_default_buffer_type = */ ggml_backend_qnn_get_default_buffer_type, + /* .set_tensor_async = */ nullptr, + /* .get_tensor_async = */ nullptr, + /* .cpy_tensor_async = */ nullptr, + /* .synchronize = */ nullptr, + /* .graph_plan_create = */ nullptr, + /* .graph_plan_free = */ nullptr, + /* .graph_plan_compute = */ nullptr, + /* .graph_compute = */ ggml_backend_qnn_graph_compute_multithread, + /* .supports_op = */ ggml_backend_qnn_supports_op, + /* .offload_op = */ nullptr, + /* .event_new = */ nullptr, + /* .event_free = */ nullptr, + /* .event_record = */ nullptr, + /* .event_wait = */ nullptr, + /* .event_synchronize = */ nullptr, +}; + + +static ggml_guid_t ggml_backend_qnn_guid() { + static ggml_guid guid = {0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, 0x92, 0xa3, 0xb4, 0xc5, + 0xd6, 0xe7, 0xf8, 0x09}; + return &guid; +} + + +static ggml_backend_t ggml_backend_qnn_reg_init(const char * params, void * user_data) { + ggml_backend_t qnn_backend = ggml_backend_qnn_init((int) (intptr_t) user_data, params); + + return qnn_backend; +} + + +bool ggml_backend_is_qnn(ggml_backend_t backend) { + return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); +} + + +void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { + GGML_ASSERT(ggml_backend_is_qnn(backend)); + + struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *)backend->context; + ctx->threads = n_threads; +} + +const char * ggml_backend_qnn_get_name(ggml_backend_t backend) { + return backend->iface.get_name(backend); +} + +int ggml_backend_qnn_get_device_count() { + return GGML_QNN_MAX_DEVICES; +} + + +void ggml_backend_qnn_get_device_description(int device, char * description, size_t description_size) { + if (nullptr 
== description || 0 == description_size) { + QNN_LOG_WARN("invalid param"); + return; + } + + if (device >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_WARN("invalid param"); + return; + } + + snprintf(description, description_size, "%s", g_qnn_mgr[device].name); + QNN_LOG_DEBUG("description:%s", description); +} + + +ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { + if (device_index >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is out of range [0, %d]\n", + device_index, GGML_QNN_MAX_DEVICES - 1); + return nullptr; + } + + static struct ggml_backend_buffer_type ggml_backend_buffer_type_qnn = { + /* .iface = */ { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes + /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_qnn_buffer_is_host + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_buffer_type_qnn; +} + + +/** + * + * @param device 0: QNN_CPU 1: QNN_GPU 2: QNN_HTP(aka DSP) + * @param qnn_lib_path qnn library path, such as "/data/data/com.ggml.llamacpp/" on Android, which can be obtained via JNI from the Java layer + * @return + */ +ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { + int result = 0; + + if (nullptr == qnn_lib_path) + return nullptr; + + QNN_LOG_DEBUG("device %d", device); + QNN_LOG_DEBUG("qnn_lib_path %s", qnn_lib_path); + if (device >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_ERROR("invalid device %d", device); + return nullptr; + } + + if (nullptr != g_qnn_mgr[device].backend) { + QNN_LOG_ERROR("qnn backend %d(%s) already loaded, this should not happen, please check why", device, get_qnn_backend_name(device)); + if (device == g_current_device) { + g_qnn_backend = g_qnn_mgr[device].backend; + QNN_LOG_INFO("re-use cached backend %d(%s)", device, get_qnn_backend_name(device)); + return g_qnn_mgr[device].backend; + } else { + QNN_LOG_INFO("delete previous backend %d(%s)", device, get_qnn_backend_name(device)); + ggml_backend_qnn_free(g_qnn_backend); + } + } + + static bool is_first_call = true; + if (is_first_call) { + ggml_setup_op_has_task_pass(); + is_first_call = false; + } + + if (QNN_HTP == device) { + std::string path = qnn_lib_path; + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + QNN_LOG_INFO("QNN DSP backend setenv succeeded"); + } else { + QNN_LOG_ERROR("QNN DSP backend setenv failed"); + } + if (0 == setenv("ADSP_LIBRARY_PATH", + (path + + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), + 1)) { + QNN_LOG_INFO("QNN DSP backend setenv succeeded"); + } else { + QNN_LOG_ERROR("QNN DSP backend setenv failed"); + } + } + + qnn_instance * instance = nullptr; + instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); + result = instance->qnn_init(nullptr); + if (0 != result) { + QNN_LOG_WARN("failed to init qnn subsystem with qnn backend %s, please check why\n", get_qnn_backend_name(device)); + delete instance; + return nullptr; + } + qnn_interface qnn_interface = instance->get_qnn_interface(); + if (!qnn_interface.is_loaded()) { + QNN_LOG_WARN("qnn subsystem failure\n"); + delete instance; + 
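//the QNN library was opened but its interface providers could not be resolved, so give up on this backend + 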
return nullptr; + } + + std::string device_name = GGML_QNN_NAME + std::string("_") + std::to_string(device) + std::string("_") + get_qnn_backend_name(device); + QNN_LOG_INFO("qnn device name %s", device_name.c_str()); + instance->init_qnn_graph(device_name.c_str(), false); + g_qnn_mgr[device].instance = instance; + g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); + g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); + //TODO:refine internal buffer management + g_qnn_mgr[device].buffer_pool = qnn_buf_new(get_qnn_backend_name(device), GGML_QNN_MAX_BUFFERS, (1 << 20)); + GGML_ASSERT(g_qnn_mgr[device].buffer_pool != nullptr); + + ggml_backend_t qnn_backend = new ggml_backend{ + /* .guid = */ ggml_backend_qnn_guid(), + /* .iface = */ ggml_backend_qnn_interface, + /* .context = */ &g_qnn_mgr[device] + }; + g_qnn_mgr[device].backend = qnn_backend; + g_qnn_backend = g_qnn_mgr[device].backend; + g_current_device = device; + + return qnn_backend; +} + + +extern "C" int ggml_backend_qnn_reg_devices(); + + +int ggml_backend_qnn_reg_devices() { + for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { + int id = g_qnn_mgr[idx].device; + char name[GGML_MAX_NAME]; + ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); + ggml_backend_register(name, ggml_backend_qnn_reg_init, ggml_backend_qnn_buffer_type(idx), + (void *) (intptr_t)idx); + } + + return GGML_QNN_MAX_DEVICES; +} diff --git a/ggml-qnn.h b/ggml-qnn.h new file mode 100644 index 0000000000000..51f02d4ba3078 --- /dev/null +++ b/ggml-qnn.h @@ -0,0 +1,55 @@ +/* + * MIT license + * Copyright (C) 2024 GGML Authors + * SPDX-License-Identifier: MIT + * + * this is the implementation of the ggml QNN(Qualcomm Neural Network, aka AI Engine Direct) backend + */ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus extern "C" { +#endif + + +#define GGML_QNN_NAME "QNN" +#define GGML_QNN_MAX_DEVICES 3 + +//the QNN cDSP and HTA backends are not used currently; focus is on the QNN CPU/GPU/HTP(aka DSP) backends +enum QNNBackend { + QNN_CPU, + QNN_GPU, + QNN_HTP, +}; + +GGML_API int ggml_backend_qnn_reg_devices(); + +/** + * + * @param device 0: QNN_CPU 1: QNN_GPU 2: QNN_HTP(aka DSP) + * @param qnn_lib_path qnn library path, such as "/data/data/com.ggml.llamacpp/" on Android, which can be obtained via JNI from the Java layer + * @return + */ +GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char * qnn_lib_path); + +GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); + +GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads); + +GGML_API int ggml_backend_qnn_get_device_count(void); +GGML_API void ggml_backend_qnn_get_device_description(int device, char * description, size_t description_size); + + +GGML_API ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t dev_num); + + +//temporary API, should be removed in the future +GGML_API bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); + + +#ifdef __cplusplus +} +#endif diff --git a/ggml.c b/ggml.c index 086db96af7fcd..919eb0b7b1ff1 100644 --- a/ggml.c +++ b/ggml.c @@ -16153,7 +16153,8 @@ static void ggml_compute_forward_cross_entropy_loss_back( ///////////////////////////////// -static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +//workaround for Qualcomm QNN backend +void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); 
if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) { diff --git a/llama.cpp b/llama.cpp index 30fe190373b43..a10c3e1fc8488 100644 --- a/llama.cpp +++ b/llama.cpp @@ -17,6 +17,8 @@ # include "ggml-sycl.h" #elif defined(GGML_USE_KOMPUTE) # include "ggml-kompute.h" +#elif defined(GGML_USE_QNN) +# include "ggml-qnn.h" #endif #ifdef GGML_USE_METAL @@ -1680,6 +1682,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { if (buft == nullptr) { LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu); } +#elif defined(GGML_USE_QNN) + buft = ggml_backend_qnn_buffer_type(gpu); #endif if (buft == nullptr) { @@ -1720,6 +1724,8 @@ static size_t llama_get_device_count() { return ggml_backend_sycl_get_device_count(); #elif defined(GGML_USE_VULKAN) return ggml_backend_vk_get_device_count(); +#elif defined(GGML_USE_QNN) + return ggml_backend_qnn_get_device_count(); #else return 1; #endif @@ -15090,6 +15096,8 @@ size_t llama_max_devices(void) { return GGML_SYCL_MAX_DEVICES; #elif defined(GGML_USE_VULKAN) return GGML_VK_MAX_DEVICES; +#elif defined(GGML_USE_QNN) + return GGML_QNN_MAX_DEVICES; #else return 1; #endif @@ -15105,7 +15113,7 @@ bool llama_supports_mlock(void) { bool llama_supports_gpu_offload(void) { #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ - defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) + defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_QNN) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. return true; #else @@ -15392,6 +15400,17 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } +#elif defined(GGML_USE_QNN) + if (model->n_gpu_layers > 0) { + //the second param is the package name of the Android app, which can be obtained via JNI from the Java layer + ggml_backend_t backend = ggml_backend_qnn_init(QNN_CPU, "/data/data/com.ggml.llamacpp/"); + if (nullptr == backend) { + LLAMA_LOG_ERROR("%s: failed to initialize QNN backend\n", __func__); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } #endif ctx->backend_cpu = ggml_backend_cpu_init(); if (ctx->backend_cpu == nullptr) { @@ -17558,6 +17577,14 @@ void llama_reset_timings(struct llama_context * ctx) { ctx->t_p_eval_us = ctx->n_p_eval = 0; } +static int llama_has_qnn(void) { +#ifdef GGML_USE_QNN + return 1; +#else + return 0; +#endif +} + const char * llama_print_system_info(void) { static std::string s; @@ -17579,6 +17606,7 @@ const char * llama_print_system_info(void) { s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | "; + s += "QNN = " + std::to_string(llama_has_qnn()) + " | "; return s.c_str(); } From d325088dbf8e86722a41b37ef44549b86211742d Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Wed, 24 Apr 2024 16:28:18 +0800 Subject: [PATCH 02/16] ggml: add Qualcomm QNN(Qualcomm Neural Network,aka Qualcomm AI Engine Direct) backend --- ggml-qnn.cpp | 3590 ++++++++++++++++++++++++++++++ ggml-qnn.h | 43 + llama.cpp | 23 +- tests/ggml-qnn/CMakeLists.txt | 60 + tests/ggml-qnn/build-ggml-qnn.sh | 95 + tests/ggml-qnn/run-ggml-qnn.sh | 108 + tests/ggml-qnn/test-qnn-ops.cpp | 450 ++++ 7 files changed, 4368 insertions(+), 1 deletion(-) create mode 100644 ggml-qnn.cpp create mode 100644 ggml-qnn.h create mode 100644 tests/ggml-qnn/CMakeLists.txt create mode 
100755 tests/ggml-qnn/build-ggml-qnn.sh create mode 100755 tests/ggml-qnn/run-ggml-qnn.sh create mode 100644 tests/ggml-qnn/test-qnn-ops.cpp diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp new file mode 100644 index 0000000000000..9319db227795d --- /dev/null +++ b/ggml-qnn.cpp @@ -0,0 +1,3590 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "QnnTypes.h" +#include "QnnCommon.h" +#include "QnnContext.h" +#include "QnnBackend.h" +#include "QnnGraph.h" +#include "QnnProperty.h" +#include "QnnTensor.h" +#include "QnnInterface.h" +#include "Saver/QnnSaver.h" +#include "System/QnnSystemInterface.h" +#include "HTP/QnnHtpDevice.h" + +#include "ggml-qnn.h" + +#include "ggml-backend-impl.h" + + +// ================================================================================================= +// +// forward/external/helper declaration +// +// ================================================================================================= +class qnn_instance; + + +#if (defined __ANDROID__) || (defined ANDROID) +extern "C" int __android_log_print(int prio, const char * tag, const char * fmt, ...) +__attribute__((__format__(printf, 3, 4))); +#endif +static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); + + +// ================================================================================================= +// +// self-defined macro / data structure +// +// ================================================================================================= +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 + +#define GGML_DUMP_TENSOR(tensor) ggml_tensor_dump(tensor, #tensor) + +#define GGML_QNN_LOGBUF_LEN 4096 + +#define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend + +#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if GGML_QNN_DEBUG +#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define QNN_LOG_DEBUG(...) 
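+//when GGML_QNN_DEBUG is 0, QNN_LOG_DEBUG expands to nothing, so debug logging costs nothing in hot paths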
+#endif + + +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ + } while (0) + +#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) + +#define VALIDATE_OP_CONFIG_VERSION(op, err) VALIDATE(validate_op_config_version(op), err) + +#define QNN_VER_PTR(x) (&((x).v1)) +#define QNN_OP_CFG_VALID(op_config) ((op_config).version == QNN_OPCONFIG_VERSION_1) + +#define QNN_OP_CFG_GET_NAME(op_config) get_qnn_oponfig_name(op_config) +#define QNN_OP_CFG_GET_PACKAGE_NAME(op_config) get_qnn_op_config_packagename(op_config) +#define QNN_OP_CFG_GET_TYPE_NAME(op_config) get_qnn_op_config_typename(op_config) +#define QNN_OP_CFG_GET_NUM_PARAMS(op_config) get_qnn_op_config_numparams(op_config) +#define QNN_OP_CFG_GET_PARAMS(op_config) get_qnn_op_config_params(op_config) +#define QNN_OP_CFG_GET_NUM_INPUTS(op_config) get_qnn_op_config_numinputs(op_config) +#define QNN_OP_CFG_GET_INPUTS(op_config) get_qnn_op_config_inputs(op_config) +#define QNN_OP_CFG_GET_NUM_OUTPUTS(op_config) get_qnn_op_config_numoutputs(op_config) +#define QNN_OP_CFG_GET_OUTPUTS(op_config) get_qnn_op_config_outputs(op_config) + +#define QNN_OP_CFG_SET_NAME(op_config, value) set_qnn_op_config_name(op_config, value) +#define QNN_OP_CFG_SET_PACKAGE_NAME(op_config, value) set_qnn_op_config_packagename(op_config, value) +#define QNN_OP_CFG_SET_TYPE_NAME(op_config, value) set_qnn_op_config_typename(op_config, value) + +#define QNN_OP_CFG_SET_PARAMS(op_config, num_of_params, params) \ + set_qnn_op_config_params(op_config, num_of_params, params) + +#define QNN_OP_CFG_SET_INPUTS(op_config, num_of_inputs, inputTensors) \ + set_qnn_op_config_inputs(op_config, num_of_inputs, inputTensors) + +#define QNN_OP_CFG_SET_OUTPUTS(op_config, num_of_outputs, output_tensors) \ + set_qnn_op_config_outputs(op_config, num_of_outputs, output_tensors) + +#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) +#define QNN_TENSOR_GET_CLIENT_BUF(tensor) get_qnn_tensor_clientbuf(tensor) +#define QNN_TENSOR_GET_MEM_HANDLE(tensor) get_qnn_tensor_memhandle(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) set_qnn_tensor_memtype(tensor, value) +#define 
QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) + + +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); + +using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); +using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); +using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); + + +typedef void (* ggml_qnn_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +typedef void (* ggml_qnn_func_common_t)(const ggml_op ggml_op, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +enum class ggml_qnn_profile_level { + profile_off = 0, + profile_basic = 1, + profile_detail = 2 +}; + + +struct ggml_backend_qnn_context { + int device; + int threads; + char name[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; + struct ggml_backend * backend; + QNN_INTERFACE_VER_TYPE raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; +} ; + + +// ================================================================================================= +// +// static global variables +// +// ================================================================================================= +static ggml_backend_t g_qnn_backend = nullptr; + +static int g_current_device = QNN_BACKEND_GGML; + + +//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently +static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { + [QNN_BACKEND_CPU] = {.device = 0, .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, + [QNN_BACKEND_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, + [QNN_BACKEND_NPU] = {.device = 2, .threads = 1, .name = "qnn-npu", .lib = "libQnnHtp.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, +}; + + +// ================================================================================================= +// +// QNN helper functions and other internal helper functions +// +// ================================================================================================= +static inline int validate_tensor_version(Qnn_Tensor_t tensor) { + if (tensor.version != QNN_TENSOR_VERSION_1) { + QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", + tensor.v1.name, + tensor.version); + return 1; + } + return 0; +} + + +[[maybe_unused]] static inline int validate_op_config_version(Qnn_OpConfig_t op_config) { + if (op_config.version != QNN_OPCONFIG_VERSION_1) { + QNN_LOG_WARN("validate_op_config_version() op %s, got unsupported version %d\n", + op_config.v1.name, + op_config.version); + return 1; + } + return 0; +} + + +static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.name; + } + return nullptr; +} + + +[[maybe_unused]] static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t * op_config) { + return 
get_qnn_oponfig_name(*op_config); +} + + +static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.packageName; + } + return nullptr; +} + + +[[maybe_unused]] static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_packagename(*op_config); +} + + +static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.typeName; + } + return nullptr; +} + + +[[maybe_unused]] static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_typename(*op_config); +} + + +static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.numOfParams; + } + return 0u; +} + + +[[maybe_unused]] static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_numparams(*op_config); +} + + +static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.params; + } + return nullptr; +} + + +[[maybe_unused]] static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_params(*op_config); +} + + +static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.numOfInputs; + } + return 0u; +} + + +[[maybe_unused]] static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_numinputs(*op_config); +} + + +static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.inputTensors; + } + return nullptr; +} + + +[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_inputs(*op_config); +} + + +static inline uint32_t get_qnn_op_config_numoutputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.numOfOutputs; + } + return 0u; +} + + +[[maybe_unused]] static inline uint32_t get_qnn_op_config_numoutputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_numoutputs(*op_config); +} + + +static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.outputTensors; + } + return nullptr; +} + + +[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_outputs(*op_config); +} + + +static inline void set_qnn_op_config_name(Qnn_OpConfig_t & op_config, const char * name) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.name = name; + } +} + + +[[maybe_unused]] static inline void set_qnn_op_config_name(Qnn_OpConfig_t * op_config, const char * name) { + set_qnn_op_config_name(*op_config, name); +} + + +static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t & op_config, const char * package_name) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + 
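//only the v1 layout is written; other versions are silently ignored, matching the getters above + 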
op_config.v1.packageName = package_name; + } +} + + +[[maybe_unused]] static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t * op_config, const char * package_name) { + set_qnn_op_config_packagename(*op_config, package_name); +} + + +static inline void set_qnn_op_config_typename(Qnn_OpConfig_t & op_config, const char * type_name) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.typeName = type_name; + } +} + + +[[maybe_unused]] static inline void set_qnn_op_config_typename(Qnn_OpConfig_t * op_config, const char * type_name) { + set_qnn_op_config_typename(*op_config, type_name); +} + + +static inline void set_qnn_op_config_params(Qnn_OpConfig_t & op_config, + uint32_t num_of_params, + Qnn_Param_t * params) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.numOfParams = num_of_params; + op_config.v1.params = params; + } +} + + +[[maybe_unused]] static inline void set_qnn_op_config_params(Qnn_OpConfig_t * op_config, + uint32_t num_of_params, + Qnn_Param_t * params) { + set_qnn_op_config_params(*op_config, num_of_params, params); +} + + +static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t & op_config, + uint32_t num_of_inputs, + Qnn_Tensor_t * input_tensors) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.numOfInputs = num_of_inputs; + op_config.v1.inputTensors = input_tensors; + } +} + + +[[maybe_unused]] static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t * op_config, + uint32_t num_of_inputs, + Qnn_Tensor_t * input_tensors) { + set_qnn_op_config_inputs(*op_config, num_of_inputs, input_tensors); +} + + +static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t & op_config, + uint32_t num_of_outputs, + Qnn_Tensor_t * output_tensors) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.numOfOutputs = num_of_outputs; + op_config.v1.outputTensors = output_tensors; + } +} + + +[[maybe_unused]] static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t * op_config, + uint32_t num_of_outputs, + Qnn_Tensor_t * output_tensors) { + set_qnn_op_config_outputs(*op_config, num_of_outputs, output_tensors); +} + + +static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; + } + + return 0u; +} + + +[[maybe_unused]] static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t * tensor) { + return get_qnn_tensorid(*tensor); +} + + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; + } + return nullptr; +} + + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t * tensor) { + return get_qnn_tensorname(*tensor); +} + + +static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; + } + return QNN_TENSOR_TYPE_UNDEFINED; +} + + +[[maybe_unused]] static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensortype(*tensor); +} + + +static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} + + +[[maybe_unused]] static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_dataformat(*tensor); +} + + +static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t 
& tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; + } + return QNN_DATATYPE_UNDEFINED; +} + + +[[maybe_unused]] static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_datatype(*tensor); +} + + +static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; +} + + +[[maybe_unused]] static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_quantparams(*tensor); +} + + +static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; + } + return 0u; +} + + +[[maybe_unused]] static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_rank(*tensor); +} + + +static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; + } + return nullptr; +} + + +[[maybe_unused]] static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_dimensions(*tensor); +} + + +static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; +} + + +[[maybe_unused]] static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_memtype(*tensor); +} + + +static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.clientBuf; + } + return QNN_CLIENT_BUFFER_INIT; +} + + +[[maybe_unused]] static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_clientbuf(*tensor); +} + + +static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memHandle; + } + return nullptr; +} + + +[[maybe_unused]] static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_memhandle(*tensor); +} + + +static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_id(Qnn_Tensor_t * tensor, uint32_t id) { + set_qnn_tensor_id(*tensor, id); +} + + +static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_name(Qnn_Tensor_t * tensor, const char * name) { + set_qnn_tensor_name(*tensor, name); +} + + +static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.type = type; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_type(Qnn_Tensor_t * tensor, Qnn_TensorType_t type) { + set_qnn_tensor_type(*tensor, type); +} + + +static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataFormat = format; + } +} + + +[[maybe_unused]] 
static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t * tensor, Qnn_TensorDataFormat_t format) { + set_qnn_tensor_dataformat(*tensor, format); +} + + +static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataType = dataType; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_datatype(Qnn_Tensor_t * tensor, Qnn_DataType_t dataType) { + set_qnn_tensor_datatype(*tensor, dataType); +} + + +static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.quantizeParams = params; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t * tensor, Qnn_QuantizeParams_t params) { + set_qnn_tensor_quantparams(*tensor, params); +} + + +static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.rank = rank; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_rank(Qnn_Tensor_t * tensor, uint32_t rank) { + set_qnn_tensor_rank(*tensor, rank); +} + + +static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dimensions = dims; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t * tensor, uint32_t * dims) { + set_qnn_tensor_dimensions(*tensor, dims); +} + + +static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memType = memType; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_memtype(Qnn_Tensor_t * tensor, Qnn_TensorMemType_t memType) { + set_qnn_tensor_memtype(*tensor, memType); +} + + +static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.clientBuf = clientBuf; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t * tensor, Qnn_ClientBuffer_t clientBuf) { + set_qnn_tensor_clientbuf(*tensor, clientBuf); +} + + +static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memHandle = handle; + } +} + + +[[maybe_unused]] static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t * tensor, Qnn_MemHandle_t handle) { + set_qnn_tensor_memhandle(*tensor, handle); +} + + +static size_t memscpy(void * dst, size_t dstSize, const void * src, size_t copySize) { + if (!dst || !src || !dstSize || !copySize) + return 0; + + size_t minSize = dstSize < copySize ? 
dstSize : copySize; + + memcpy(dst, src, minSize); + + return minSize; +} + + +static char * ggml_qnn_strndup(const char * source, size_t maxlen) { + return ::strndup(source, maxlen); +} + + +static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { + int err = 0; + VALIDATE_TENSOR_VERSION(src, err); + + dst.version = src.version; + QNN_TENSOR_SET_NAME( + dst, ggml_qnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); + if (QNN_TENSOR_GET_NAME(dst) == nullptr) { + return 1; + } + QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); + QNN_TENSOR_SET_TYPE(dst, QNN_TENSOR_GET_TYPE(src)); + QNN_TENSOR_SET_DATA_FORMAT(dst, QNN_TENSOR_GET_DATA_FORMAT(src)); + QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); + QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); + + // Only metadata (i.e. non-static data) is copied from source to destination. The union still + // must be initialized so that the clientBuf/memHandle do not contain garbage data + if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { + Qnn_ClientBuffer_t client_buf = {nullptr, 0}; + QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); + } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { + QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); + } else { + return 1; + } + + Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + // need to allocate and copy memory for scaleOffset as it is a pointer array + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_AxisScaleOffset_t &axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t **scaleOffset = &axis_scale_offset.scaleOffset; + size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scaleOffset = (Qnn_ScaleOffset_t *)malloc(scaleOffsetSize); + memscpy(*scaleOffset, + scaleOffsetSize, + src_qparam.axisScaleOffsetEncoding.scaleOffset, + scaleOffsetSize); + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + // need to allocate and copy memory for scaleOffset as it is a pointer array + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_BwAxisScaleOffset_t &bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; + size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); + float **scales = &bwaxis_scale_offset.scales; + int32_t **offsets = &bwaxis_scale_offset.offsets; + *scales = (float *)malloc(scaleSize); + memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, scaleSize); + + // only copy offsets if present, nullptr implies all offsets are 0 + if (bwaxis_scale_offset.offsets != nullptr) { + size_t offsetSize = bwaxis_scale_offset.numElements * sizeof(int32_t); + *offsets = (int32_t *)malloc(offsetSize); + memscpy(*offsets, offsetSize, src_qparam.bwAxisScaleOffsetEncoding.offsets, offsetSize); + } + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); + } else { + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam); + } + + // allocate and copy memory for all the pointer members + uint32_t rank = QNN_TENSOR_GET_RANK(src); + QNN_TENSOR_SET_RANK(dst, rank); + size_t dim_size = rank * sizeof(uint32_t); + uint32_t * dimensions = (uint32_t *)malloc(dim_size); + if (dimensions == nullptr) { + QNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", QNN_TENSOR_GET_NAME(src)); + 
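+        // NOTE: the name buffer strndup()'ed at the top of this function is leaked on this
+        // error path; a fuller implementation would free it before returning. Likewise,
+        // free_qnn_tensor() below releases only the name and dimensions, not the per-axis
+        // quantization buffers malloc()'ed above.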
return 1; + } + memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); + QNN_TENSOR_SET_DIMENSIONS(dst, dimensions); + + return err; +} + + +static int free_qnn_tensor(Qnn_Tensor_t & tensor) { + int err = 0; + VALIDATE_TENSOR_VERSION(tensor, err); + + free((void *) QNN_TENSOR_GET_NAME(tensor)); + free(QNN_TENSOR_GET_DIMENSIONS(tensor)); + + return err; +} + + +[[maybe_unused]] static int free_qnn_tensors(Qnn_Tensor_t *& tensors, uint32_t num_tensors) { + int err = 0; + + // free all pointer allocations in struct + for (size_t i = 0; i < num_tensors; i++) { + free_qnn_tensor(tensors[i]); + } + free(tensors); + + return err; +} + + +static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; + } + } + return rank; +} + + +//TODO: mapping more ggml data type to QNN data type +//ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { + switch (ggmltype) { + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + default: + break; + + } + return QNN_DATATYPE_UNDEFINED; +} + + +//TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT +static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL: + return QNN_OP_ELEMENT_WISE_MULTIPLY; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; + } + + return nullptr; +} + + +static uint32_t ggml_get_tensor_data_size(const ggml_tensor * tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = ggml_get_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + return data_size; + */ + return ggml_nbytes(tensor); +} + + +template +Fn load_qnn_functionpointers(void * handle, const char * function_name) { + return reinterpret_cast(dlsym(handle, function_name)); +} + + +static const char * get_qnn_backend_name(int n_backend_type) { + switch (n_backend_type) { + case 0: + return "QNN-CPU"; + case 1: + return "QNN-GPU"; + case 2: + return "QNN-NPU"; + case 3: + return "ggml"; //the default GGML backend, used to compare performance between QNN backend and the default GGML backend + +#if 0 //QNN cDSP and HTA backend would not be used currently, focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently + case 3: + return "QNN-cDSP"; + case 4: + return "QNN-HTA"; +#endif + default: + return "unknown"; + } +} + + +static intptr_t align_to(size_t alignment, intptr_t offset) { + return offset % alignment == 0 ? offset + : offset + + (static_cast(alignment) - + offset % static_cast(alignment)); +} + + +static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) 
{ + static std::mutex ggml_qnn_log_internal_mutex; + static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; + + { + std::lock_guard lock(ggml_qnn_log_internal_mutex); + va_list args; + va_start(args, format); + int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); + int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); + if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { +#if (defined __ANDROID__) || (defined ANDROID) + //for Android APK + __android_log_print(level, "ggml-qnn", "%s\n", s_ggml_qnn_log_internal_buf); +#endif + //for Android command line application or WoA + printf("%s\n", s_ggml_qnn_log_internal_buf); + } + va_end(args); + } +} + + +// ================================================================================================= +// +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// +// ================================================================================================= +class qnn_interface { + +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + friend class qnn_instance; + +public: + qnn_interface() = default; + + // QnnBackend + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); + + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); + + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); + + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); + + // QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); + + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); + + 
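+    // The two DEFINE_SHIM_* macros above generate thin variadic forwarders: for example
+    // DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute) expands to roughly
+    //
+    //   template <typename... Args>
+    //   inline auto qnn_graph_execute(Args... args) const {
+    //       return (_qnn_interface->QNN_INTERFACE_VER_NAME.graphExecute)(std::forward<Args>(args)...);
+    //   }
+    //
+    // so callers go through the version-selected function-pointer table instead of linking
+    // against the QNN libraries directly.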
DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); + + // QnnMem + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); + + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); + + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); + + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); + + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); + + void set_qnn_interface(const QnnInterface_t * qnn_interface) { + _qnn_interface = qnn_interface; + } + + void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { + _qnn_sys_interface = qnn_sys_interface; + } + + uint32_t get_backend_id() const { + return _qnn_interface->backendId; + } + + bool is_loaded() const { + return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); + } + +private: + const QnnInterface_t *_qnn_interface = nullptr; + + const QnnSystemInterface_t *_qnn_sys_interface = nullptr; +}; + + + +// ================================================================================================= +// +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// +// and +// +// resource management of QNN resources for GGML's QNN backend +// ================================================================================================= +class qnn_instance { +public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, + const std::string & model_name) : + _lib_path(std::move(lib_path)), + _backend_name(std::move(backend_name)), + _model_name(std::move(model_name)) {}; + + ~qnn_instance() { + } + + int qnn_init(const QnnSaver_Config_t ** saver_config); + + int qnn_finalize(); + + const qnn_interface &get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + + const QNN_INTERFACE_VER_TYPE &get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE &get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + + const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + + const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + + const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + + const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + + const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + + + int init_qnn_graph(const char * 
graph_name, + bool debug, + uint8_t do_node_validation = 1, + const QnnGraph_Config_t ** graph_configs = nullptr + ); + + int finalize_qnn_graph(); + + int init_htp_perfinfra() { + QnnDevice_Infrastructure_t device_infra = nullptr; + int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get qnn device infra\n"); + return 1; + } + + QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t *htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; + htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + _qnn_htp_perfinfra = htp_perfinfra; + _qnn_power_configid = power_configid; + + return 0; + } + + + int set_rpc_polling() { + if (_qnn_rpc_pollingtime > 0) { + QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingTime; + memset(&rpc_pollingTime, 0, sizeof(rpc_pollingTime)); + rpc_pollingTime.option = + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + rpc_pollingTime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; + const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = {&rpc_pollingTime, nullptr}; + if (_qnn_htp_perfinfra) { + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); + } + } + return 0; + } + + + int set_high_performance_mode() { + if (nullptr == _qnn_htp_perfinfra) { + QNN_LOG_DEBUG("perf intra is null\n"); + return 1; + } + + QnnHtpPerfInfrastructure_PowerConfig_t powerConfig; + memset(&powerConfig, 0, sizeof(powerConfig)); + powerConfig.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; + powerConfig.dcvsV3Config.dcvsEnable = 0; + powerConfig.dcvsV3Config.setDcvsEnable = 1; + powerConfig.dcvsV3Config.contextId = _qnn_power_configid; + powerConfig.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + powerConfig.dcvsV3Config.setSleepLatency = 1; // True to consider Latency parameter otherwise False + powerConfig.dcvsV3Config.setBusParams = 1; // True to consider Bus parameter otherwise False + powerConfig.dcvsV3Config.setCoreParams = 1; // True to consider Core parameter otherwise False + powerConfig.dcvsV3Config.sleepDisable = 0; // True to consider sleep/LPM modes, False to enable + powerConfig.dcvsV3Config.setSleepDisable = 0; // True to consider sleep disable/enable parameter otherwise False + // set Sleep latency parameter + uint32_t latencyValue = 40; + powerConfig.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 micro sec + // set Bus Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + powerConfig.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set Core Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + powerConfig.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set power config with different performance parameters + const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = {&powerConfig, nullptr}; + + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); + + return 0; + } + + std::string 
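+    // set_high_performance_mode() above pins all DCVS voltage corners to the maximum and
+    // disables DCVS scaling: maximum sustained clocks at the cost of power. Illustrative
+    // sketch only (reusing the local names from the two functions above): setPowerConfig()
+    // takes a null-terminated pointer array, so the RPC-polling and DCVS configs could
+    // also be applied in a single call:
+    //
+    //   const QnnHtpPerfInfrastructure_PowerConfig_t * combined[] = {&rpc_pollingTime, &powerConfig, nullptr};
+    //   _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, combined);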
&get_qnn_graph_name() { return _graph_name; } + + bool is_rpcmem_initialized() { + return _rpcmem_initialized; + } + + void set_rpcmem_initialized(bool initialized) { + _rpcmem_initialized = initialized; + } + + int32_t rpcmem_to_fd(void * buf); + + int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); + + void unregister_rpcmem(); + + void *alloc_rpcmem(size_t bytes, size_t alignment); + + void free_rpcmem(void * buf); + + bool is_rpcmem_allocated(void * buf); + + bool is_rpcmem_registered(Qnn_MemHandle_t handle) { + return _qnn_mem_set.count(handle) != 0U; + } + +public: + std::map> _qnn_graph_map; + +private: + int load_system(); + + int unload_system(); + + int load_backend(std::string &lib_path, const QnnSaver_Config_t ** saver_config); + + int unload_backend(); + + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_interface = raw_interface; + } + + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE &raw_interface) { + _qnn_raw_system_interface = raw_interface; + } + +private: + static constexpr const int _required_num_providers = 1; + +private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // prebuilt QNN model name, not used in currently + BackendIdType _backend_id; + + bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode + bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + + ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; + + qnn_interface _qnn_interface; + + void *_system_lib_handle = nullptr; + + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + + Qnn_LogHandle_t _qnn_log_handle = nullptr; + + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + + QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; + uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + + std::unordered_set _qnn_mem_set; + + static std::mutex _init_mutex; + static std::unordered_map _loaded_lib_handle; + static std::unordered_map _lib_path_to_backend_id; + static std::unordered_map _loaded_backend; + + void *_rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{false}; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + + + std::string _graph_name; +}; + + + +// ================================================================================================= +// +// implementation of wrapper class +// +// ================================================================================================= +std::mutex qnn_instance::_init_mutex; + +std::unordered_map qnn_instance::_loaded_lib_handle; + +std::unordered_map qnn_instance::_lib_path_to_backend_id; + +std::unordered_map qnn_instance::_loaded_backend; + + +void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return 
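+        // alloc_rpcmem() below over-allocates by `alignment` bytes and rounds the raw
+        // pointer up with align_to(); e.g. with alignment = 4096, a raw pointer of 0x1001
+        // becomes 0x2000. The raw pointer is remembered in _rpcmem_store_map so that
+        // free_rpcmem() can hand the original allocation back to rpcmem_free().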
nullptr; + } + + auto allocate_bytes = static_cast(bytes + alignment); + void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + if (buf == nullptr) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + return nullptr; + } + + auto aligned_buf = reinterpret_cast(align_to(alignment, + reinterpret_cast(buf))); + bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + _pfn_rpc_mem_free(buf); + } + + return aligned_buf; +} + + +void qnn_instance::free_rpcmem(void * buf) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else if (0 == _rpcmem_store_map.count(buf)) { + QNN_LOG_WARN("no allocated tensor\n"); + } else { + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); + } +} + + +int32_t qnn_instance::rpcmem_to_fd(void *buf) { + int32_t mem_fd = -1; + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else { + mem_fd = _pfn_rpc_mem_to_fd(buf); + } + + return mem_fd; +} + + +int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { + if (nullptr == p_data || (nullptr == p_tensor)) { + QNN_LOG_WARN("invalid param\n"); + return 1; + } + + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return 2; + } + + if (is_rpcmem_allocated(p_data)) { + QNN_LOG_WARN("rpc memory already allocated\n"); + //return 3; + } + if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { + QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + return 4; + } + + int32_t mem_fd = rpcmem_to_fd(p_data); + if (-1 == mem_fd) { + QNN_LOG_WARN("failed to get file descriptor\n"); + return 5; + } + QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = { + {QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr}, + QNN_VER_PTR(*p_tensor)->dataType, + QNN_MEM_TYPE_ION, + {{mem_fd}}}; + Qnn_MemHandle_t handle = nullptr; + int error = QNN_SUCCESS; + error = _qnn_interface.qnn_mem_register( + _qnn_context_handle, + &descriptor, + /*numDescriptors=*/1, + &handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), + strerror(error)); + return 6; + } else { + QNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + } + QNN_VER_PTR(*p_tensor)->memHandle = handle; + _qnn_mem_set.insert(handle); + + return 0; +} + + +void qnn_instance::unregister_rpcmem() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (_qnn_mem_set.empty()) { + QNN_LOG_WARN("no rpcmem registered\n"); + } + + for (auto &mem_handle : _qnn_mem_set) { + error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); + } + } + _qnn_mem_set.clear(); +} + + +bool qnn_instance::is_rpcmem_allocated(void * buf) { + return _rpcmem_store_map.count(buf) != 0U; +} + + +int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + + void *lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (nullptr == lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + return 1; + } + + // load get_provider function + 
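+    // QnnInterface_getProviders is the only symbol resolved by name from a backend library;
+    // everything else is reached through the function-pointer table it returns. The
+    // _pfn_QnnInterface_getProviders typedef (declared earlier in this patch) is assumed to
+    // match the SDK prototype:
+    //
+    //   Qnn_ErrorHandle_t QnnInterface_getProviders(const QnnInterface_t *** providerList,
+    //                                               uint32_t * numProviders);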
auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>(lib_handle, + "QnnInterface_getProviders"); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + return 2; + } + + // get QnnInterface Providers + std::uint32_t num_providers = 0; + const QnnInterface_t **provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + QNN_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + QNN_LOG_WARN("failed to get qnn interface providers\n"); + return 5; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_interface) { + QNN_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn interface\n"); + } + set_qnn_raw_interface(qnn_interface); + + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", + lib_path.c_str(), backend_id); + } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); + if (dlclose_error != 0) { + QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); + } + } + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; + +#if 0 //comment it for purpose of reduce size of APK + QnnSaver_Config_t outputdir_cfg; + outputdir_cfg.option = QNN_SAVER_CONFIG_OPTION_OUTPUT_DIRECTORY; + outputdir_cfg.outputDirectory = "/data/local/tmp/"; + + QnnSaver_Config_t backendid_cfg; + backendid_cfg.option = QNN_SAVER_CONFIG_OPTION_BACKEND_ID; + backendid_cfg.backendId = _backend_id; + const QnnSaver_Config_t *saverCfg[] = {&outputdir_cfg, &backendid_cfg, nullptr}; + if (0 == QnnSaver_initialize(saverCfg)) { + QNN_LOG_INFO("QnnSaver_initialize successfully"); + } else { + QNN_LOG_WARN("QnnSaver_initialize failure"); + } +#endif + auto saver_initialize = load_qnn_functionpointers<_pfn_QnnSaver_initialize *>( + _loaded_lib_handle[backend_id], "QnnSaver_initialize"); + if (nullptr != saver_initialize) { + error = saver_initialize(saver_config); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to saver_initialize,error %d", QNN_GET_ERROR_CODE(error)); + return 7; + } + } else { + QNN_LOG_WARN("saver_initialize is null\n"); + } + + return 0; +} + + +int qnn_instance::unload_backend() { + int dlclose_error = 0; + for (auto &it : _loaded_lib_handle) { + dlclose_error = dlclose(it.second); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); + } + } + + _loaded_lib_handle.clear(); + 
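+    // the three bookkeeping maps (backend id -> lib handle, lib path -> backend id,
+    // backend id -> provider) must be cleared together: a stale entry in any one of them
+    // would make a later qnn_init() skip load_backend() yet fail its consistency checks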
_lib_path_to_backend_id.clear(); + _loaded_backend.clear(); + + return 0; +} + + +int qnn_instance::load_system() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + std::string system_lib_path = _lib_path + "libQnnSystem.so"; + QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + return 1; + } + + auto * get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym( + _system_lib_handle, "QnnSystemInterface_getProviders")); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); + return 2; + } + + uint32_t num_providers = 0; + const QnnSystemInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + return 3; + } + + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + QNN_LOG_WARN("can not get providers\n"); + return 5; + } + + QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == + provider_list[idx]->systemApiVersion.major && + QNN_SYSTEM_API_VERSION_MINOR <= + provider_list[idx]->systemApiVersion.minor) { + found_valid_system_interface = true; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } + } + if (!found_valid_system_interface) { + QNN_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn system interface\n"); + } + set_qnn_raw_system_interface(qnn_system_interface); + + _qnn_interface.set_qnn_system_interface(provider_list[0]); + + _qnn_interface.qnn_system_context_create(&_qnn_system_handle); + if (nullptr == _qnn_system_handle) { + QNN_LOG_WARN("can not create QNN system contenxt\n"); + } else { + QNN_LOG_INFO("initialize qnn system successfully\n"); + } + + return 0; +} + + +int qnn_instance::unload_system() { + int result = 0; + + if (nullptr == _system_lib_handle) { + QNN_LOG_DEBUG("system lib handle is null\n"); + return 1; + } + + if (nullptr != _qnn_system_handle) { + result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); + if (result != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN system context\n"); + } + _qnn_system_handle = nullptr; + } + + int dlclose_error = dlclose(_system_lib_handle); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); + return 2; + } + + _system_lib_handle = nullptr; + + return result; +} + + +static void ggml_qnn_logcallback(const char * fmt, + QnnLog_Level_t level, + uint64_t timestamp, + va_list argp) { + + static std::mutex log_mutex; + static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN]; + + const char * log_level_desc = ""; + switch (level) { + case QNN_LOG_LEVEL_ERROR: + log_level_desc = " ERROR "; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = " INFO "; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = " DEBUG "; + break; + case QNN_LOG_LEVEL_VERBOSE: 
+            log_level_desc = "VERBOSE";
+            break;
+        case QNN_LOG_LEVEL_MAX:
+            log_level_desc = "UNKNOWN";
+            break;
+    }
+
+    double ms = (double) timestamp / 1000000.0;
+
+    {
+        std::lock_guard<std::mutex> lock(log_mutex);
+
+        memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN);
+        vsnprintf(reinterpret_cast<char *>(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp);
+        QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf);
+    }
+}
+
+
+int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
+    BackendIdType backend_id = QNN_BACKEND_ID_NULL;
+    QNN_LOG_DEBUG("enter qnn_init\n");
+
+    const std::lock_guard<std::mutex> lock(_init_mutex);
+
+    if (0 != load_system()) {
+        QNN_LOG_WARN("failed to load QNN system lib\n");
+        return 1;
+    } else {
+        QNN_LOG_DEBUG("load QNN system lib successfully\n");
+    }
+
+    std::string backend_lib_path = _lib_path + _backend_name;
+    if (0 == _lib_path_to_backend_id.count(backend_lib_path)) {
+        int is_load_ok = load_backend(backend_lib_path, saver_config);
+        if (0 != is_load_ok) {
+            QNN_LOG_WARN("failed to load QNN backend\n");
+            return 2;
+        }
+    }
+
+    backend_id = _lib_path_to_backend_id[backend_lib_path];
+    if (0 == _loaded_backend.count(backend_id) ||
+        0 == _loaded_lib_handle.count(backend_id)) {
+        QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, loaded lib_handle count=%zu\n",
+                     backend_lib_path.c_str(),
+                     _loaded_backend.count(backend_id),
+                     _loaded_lib_handle.count(backend_id));
+        return 3;
+    }
+
+    _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]);
+
+#if 1
+    _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle);
+#else
+    _qnn_raw_interface.logCreate(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle);
+#endif
+    if (nullptr == _qnn_log_handle) {
+        QNN_LOG_WARN("failed to initialize qnn log\n"); // the QNN HTP(aka DSP) backend does not work on Qualcomm SoC based low-end phones
+        return 4;
+    } else {
+        QNN_LOG_DEBUG("initialize qnn log successfully\n");
+    }
+
+    std::vector<const QnnBackend_Config_t *> temp_backend_config;
+    _qnn_interface.qnn_backend_create(_qnn_log_handle, temp_backend_config.empty() ? nullptr
+                                                       : temp_backend_config.data(),
+                                      &_qnn_backend_handle);
+    if (nullptr == _qnn_backend_handle) {
+        QNN_LOG_WARN("failed to initialize qnn backend\n");
+        return 5;
+    } else {
+        QNN_LOG_DEBUG("initialize qnn backend successfully\n");
+    }
+
+    if (nullptr != _qnn_raw_interface.propertyHasCapability) {
+        auto qnnStatus = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE);
+        if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) {
+            QNN_LOG_WARN("device property is not supported\n");
+        }
+        if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnStatus) {
+            QNN_LOG_WARN("device property is not known to backend\n");
+        }
+    }
+
+    auto qnnStatus = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle);
+    if (QNN_SUCCESS != qnnStatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnStatus) {
+        QNN_LOG_WARN("failed to create QNN device\n");
+    } else {
+        QNN_LOG_INFO("create QNN device successfully\n");
+    }
+
+    if (ggml_qnn_profile_level::profile_off != _profile_level) {
+        QNN_LOG_INFO("profiling turned on; level = %d", (int) _profile_level);
+        if (ggml_qnn_profile_level::profile_basic == _profile_level) {
+            QNN_LOG_INFO("basic profiling requested. 
creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } else if (ggml_qnn_profile_level::profile_detail == _profile_level) { + QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } + } + + _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (nullptr == _rpc_lib_handle) { + QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); + return 9; + } else { + QNN_LOG_DEBUG("load rpcmem lib successfully\n"); + set_rpcmem_initialized(true); + } + _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_to_fd")); + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free + || nullptr == _pfn_rpc_mem_to_fd) { + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); + dlclose(_rpc_lib_handle); + return 10; + } + + if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy + _pfn_rpc_mem_init(); + + std::vector temp_context_config; + _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle, + temp_context_config.empty() ? 
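+    // the five rpcmem symbols resolved above come from libcdsprpc.so (Qualcomm FastRPC).
+    // their assumed C prototypes are:
+    //
+    //   void   rpcmem_init(void);
+    //   void   rpcmem_deinit(void);
+    //   void * rpcmem_alloc(int heapid, uint32_t flags, int size);
+    //   void   rpcmem_free(void * po);
+    //   int    rpcmem_to_fd(void * po);
+    //
+    // rpcmem_init/rpcmem_deinit may legitimately be missing, hence the null checks around them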
nullptr + : temp_context_config.data(), + &_qnn_context_handle); + if (nullptr == _qnn_context_handle) { + QNN_LOG_WARN("why failed to initialize qnn context\n"); + return 8; + } else { + QNN_LOG_DEBUG("initialize qnn context successfully\n"); + } + + QNN_LOG_DEBUG("leave qni_init\n"); + + return 0; +} + + +int qnn_instance::qnn_finalize() { + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC based low-end phone happy + _pfn_rpc_mem_deinit(); + + if (dlclose(_rpc_lib_handle) != 0) { + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); + } else { + QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); + } + + if (nullptr != _qnn_context_handle) { + error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_context_handle = nullptr; + } + + if (nullptr != _qnn_profile_handle) { + error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_profile_handle = nullptr; + } + + if (nullptr != _qnn_device_handle) { + error = _qnn_interface.qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_device_handle = nullptr; + } + + if (nullptr != _qnn_backend_handle) { + error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + } + _qnn_backend_handle = nullptr; + + } + + if (nullptr != _qnn_log_handle) { + error = _qnn_interface.qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + } + _qnn_log_handle = nullptr; + } + + unload_backend(); + + unload_system(); + + return ret_status; +} + + +int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation, + const QnnGraph_Config_t ** graph_configs) { + int result = 0; + + if (nullptr == graph_name) { + QNN_LOG_WARN("graph name is null\n"); + return 1; + } + + if (!_graph_name.empty()) { + QNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); + return 2; + } + + if (!do_node_validation) { + QNN_LOG_WARN("node validation disabled, backend will not perform op validation prior to adding node\n"); + } + + _graph_name = graph_name; + _debug_tensor = debug; + _do_node_validations = do_node_validation; + + result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, graph_configs, + &_qnn_graph_handle); + if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { + QNN_LOG_WARN("failed to create graph in qnn context\n"); + return 3; + } else { + QNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); + } + + return 0; +} + + +int qnn_instance::finalize_qnn_graph() { + if (nullptr != _qnn_graph_handle) { + if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, _qnn_profile_handle, nullptr) != + QNN_GRAPH_NO_ERROR) { + QNN_LOG_WARN("finalizing 
graph failure\n");
+            //return 1;
+        }
+    } else {
+        QNN_LOG_DEBUG("qnn graph handle is null\n");
+    }
+
+    return 0;
+}
+
+
+
+// =================================================================================================
+//
+//  implementation of GGML's QNN backend
+//
+// =================================================================================================
+static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dump_tensor_info) {
+    if (nullptr == tensor)
+        return false;
+    if (b_dump_tensor_info) {
+        QNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op),
+                      ggml_type_name(tensor->type));
+    }
+    //only the following 3 OPs are supported currently; all of them take non-null tensor->src[0] and tensor->src[1]
+    bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT));
+    if (!supported_op) {
+        return false;
+    }
+
+    const struct ggml_tensor * src0 = tensor->src[0];
+    const struct ggml_tensor * src1 = tensor->src[1];
+
+    const int64_t ne00 = tensor->src[0]->ne[0];
+    const int64_t ne01 = tensor->src[0]->ne[1];
+
+    const int64_t ne10 = tensor->src[1]->ne[0];
+    const int64_t ne11 = tensor->src[1]->ne[1];
+
+    const int64_t ne0 = tensor->ne[0];
+    const int64_t ne1 = tensor->ne[1];
+
+    GGML_UNUSED(ne0);
+    GGML_UNUSED(ne1);
+
+    if (b_dump_tensor_info) {
+        QNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type));
+        QNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type));
+
+        if (tensor->op == GGML_OP_MUL_MAT) {
+            QNN_LOG_DEBUG("GGML_OP_MUL_MAT");
+            QNN_LOG_DEBUG(
+                    "src0 %15s: type = %i (%5s)  ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                    src0->name,
+                    src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2],
+                    src0->nb[0], src0->nb[1], src0->nb[2]);
+            QNN_LOG_DEBUG(
+                    "src1 %15s: type = %i (%5s)  ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                    src1->name,
+                    src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2],
+                    src1->nb[0], src1->nb[1], src1->nb[2]);
+            QNN_LOG_DEBUG(
+                    "     %15s: type = %i (%5s)  ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                    tensor->name,
+                    tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2],
+                    tensor->nb[0],
+                    tensor->nb[1], tensor->nb[2]);
+        }
+    }
+
+    if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) {
+        return false;
+    }
+
+    //make ggml_get_tensor_rank and QNN SDK happy
+    if ((ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1)) {
+        return false;
+    }
+
+    if (tensor->op == GGML_OP_ADD) {
+        //TODO: this is a limitation
+        return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)
+               && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16)
+               && (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16);
+    }
+
+    if (tensor->op == GGML_OP_MUL_MAT) {
+        // GPU/NPU inference will be slower than CPU inference when tensor->ne[1] is below the minimal batch size,
+        // so reject small batches before checking the data types
+        if (tensor->ne[1] < 32) {
+            return false;
+        }
+        //TODO: this is a limitation
+        return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)
+               && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16)
+               && (src0->type == src1->type) && (src0->type == tensor->type);
+    }
+
+    //TODO: this is a limitation
+    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)
+           && (src1->type ==
GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) + && (src0->type == src1->type) && (src0->type == tensor->type); +} + + +static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; + + qnn_instance * instance = nullptr; + struct ggml_backend_qnn_context * ctx = nullptr; + + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("pls check why GGML tensor is null"); + return; + } + tensor_0 = (Qnn_Tensor_t *)src0->extra; + tensor_1 = (Qnn_Tensor_t *)src1->extra; + tensor_2 = (Qnn_Tensor_t *)dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("pls check why QNN tensor is null"); + return; + } + ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; + if (nullptr == ctx) { + QNN_LOG_WARN("pls check why backend ctx is null"); + return; + } + instance = ctx->instance; + if (nullptr == instance) { + QNN_LOG_WARN("pls check why qnn instance is null"); + return; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + n_begin_time = ggml_time_us(); +#if 1 + QNN_LOG_DEBUG("call %s\n", __func__); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); +#endif + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; + uint32_t * tensor_1_dimensions = 
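+    // the original dimension pointers of the three wrapped tensors are saved here and
+    // restored after graph execution, because the else-branch below temporarily points
+    // them at stack-local uint32_t arrays derived from the ggml ne[] element counts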
QNN_VER_PTR(*tensor_1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + + if (!graph_initialized) { + graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + //QnnGraph_Config_t graph_config; + //graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + //graph_config.customConfig = strdup(graph_name.c_str()); + //const QnnGraph_Config_t * p_graph_config = &graph_config; + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + "ggml_op_add", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_ELEMENT_WISE_ADD, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, op_config); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + //QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + 
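+        // the clientBuf bindings below point QNN directly at the ggml tensor buffers, so
+        // graph execution reads src0/src1 and writes dst in place without a staging copy
+        // (the rpcmem/ION path via memHandle is not used here, per the TODOs at the top of this file)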
QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst);
+        QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type;
+
+        QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
+        QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
+        QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};
+
+        Qnn_Tensor_t tensor_inputs[] = {
+                *tensor_0,
+                *tensor_1
+        };
+        Qnn_Tensor_t tensor_outputs[] = {
+                *tensor_2
+        };
+        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr);
+        if (QNN_SUCCESS != error) {
+            QNN_LOG_INFO("error = %d\n", error);
+        }
+    }
+    QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions;
+    QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions;
+    QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions;
+    n_end_time = ggml_time_us();
+    n_duration = (n_end_time - n_begin_time) / 1000;
+    QNN_LOG_DEBUG("duration of ggml_qnn_add : %lld milliseconds\n", n_duration);
+}
+
+
+
+/*
+ * ggml_qnn_mul_mat was re-added as a standalone function because of the following comments from
+ * https://github.com/ggerganov/llama.cpp/pull/1632:
+ * MUL_MAT takes most of the compute time (about 95%). So to speed up llama, we have to focus on MUL_MAT.
+ * We have three kinds of MUL_MAT to compute:
+ * mul_mat_f32:     both src0 and src1 are F32.
+ * mul_mat_f16_f32: src0 is F16 and src1 is F32.
+ * mul_mat_q_f32:   src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32.
+*/
+static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    Qnn_ErrorHandle_t error = QNN_SUCCESS;
+    bool graph_initialized = false;
+    int64_t n_begin_time = 0LL;
+    int64_t n_end_time = 0LL;
+    int64_t n_duration = 0LL;
+
+    qnn_instance * instance = nullptr;
+    struct ggml_backend_qnn_context * ctx = nullptr;
+
+    std::string graph_name = "ggml_op_qnn_mul_mat";
+    Qnn_GraphHandle_t graph_handle = nullptr;
+    Qnn_Tensor_t * tensor_0 = nullptr;
+    Qnn_Tensor_t * tensor_1 = nullptr;
+    Qnn_Tensor_t * tensor_2 = nullptr;
+
+    Qnn_Param_t qnn_params[] = {};
+
+    enum ggml_op ggmlop = GGML_OP_MUL_MAT; // keyed into _qnn_graph_map below; must not collide with GGML_OP_ADD
+    Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32;
+    Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
+    Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32;
+
+    if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) {
+        QNN_LOG_WARN("pls check why GGML tensor is null");
+        return;
+    }
+    tensor_0 = (Qnn_Tensor_t *)src0->extra;
+    tensor_1 = (Qnn_Tensor_t *)src1->extra;
+    tensor_2 = (Qnn_Tensor_t *)dst->extra;
+    if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) {
+        QNN_LOG_WARN("pls check why QNN tensor is null");
+        return;
+    }
+    ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context;
+    if (nullptr == ctx) {
+        QNN_LOG_WARN("pls check why backend ctx is null");
+        return;
+    }
+    instance = ctx->instance;
+    if (nullptr == instance) {
+        QNN_LOG_WARN("pls check why qnn instance is null");
+        return;
+    }
+    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
+
+    n_begin_time = ggml_time_us();
+#if 1
+    QNN_LOG_DEBUG("call %s\n", __func__);
+    QNN_LOG_DEBUG("%15s: type = %i (%5s)  ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                  src0->name,
+                  src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2],
+                  src0->nb[0], src0->nb[1], src0->nb[2]);
+    QNN_LOG_DEBUG("%15s: type = %i (%5s)  ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
+                  src1->name,
+                  src1->type, 
ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); +#endif + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; + uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + + if (!graph_initialized) { + graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + "ggml_op_mul_mat", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, op_config); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, 
tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + } + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; + QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ggml_qnn_mul_mat : %lld milliseconds\n", n_duration); + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +//common function for GGML OPs using QNN API +static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; + + qnn_instance * instance = nullptr; + struct ggml_backend_qnn_context * ctx = nullptr; + + std::string qnn_graph_name = "ggml_qnn_graph"; + std::string qnn_op_config_name = "ggml_qnn_op_config"; + const char * qnn_op_name = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + + Qnn_Param_t qnn_params[] = {}; + + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("pls check why GGML tensor is null"); + return; + } + tensor_0 = (Qnn_Tensor_t *)src0->extra; + tensor_1 = (Qnn_Tensor_t *)src1->extra; + tensor_2 = (Qnn_Tensor_t *)dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("pls check why 
QNN tensor is null"); + return; + } + ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; + if (nullptr == ctx) { + QNN_LOG_WARN("pls check why backend ctx is null"); + return; + } + instance = ctx->instance; + if (nullptr == instance) { + QNN_LOG_WARN("pls check why qnn instance is null"); + return; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + qnn_op_name = qnn_opname_from_ggmlop(ggmlop); + if (nullptr == qnn_op_name) { + QNN_LOG_WARN("pls check why can not get QNN OP name with ggml op %d(%s)", ggmlop, ggml_op_name(ggmlop)); + return; + } + + n_begin_time = ggml_time_us(); +#if 1 + QNN_LOG_DEBUG("call %s\n", __func__); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); +#endif + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + } + + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; + uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + + if (!graph_initialized) { + qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + qnn_op_config_name = qnn_op_config_name + "_" + ggml_op_name(ggmlop) + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + QNN_LOG_DEBUG("qnn graph name %s", qnn_graph_name.c_str()); + QNN_LOG_DEBUG("qnn op_config name %s", qnn_op_config_name.c_str()); + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph name %s, error = %d\n", ggml_op_name(ggmlop), qnn_graph_name.c_str(), error); + return; + } + + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if 
(QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + qnn_op_config_name.c_str(), + QNN_OP_PACKAGE_NAME_QTI_AISW, + qnn_op_name, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, op_config); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (QNN_SUCCESS != error) { + QNN_LOG_INFO("error = %d\n", error); + } + } + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; + QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ggml_qnn_%s : %lld milliseconds\n", ggml_op_name(ggmlop), 
n_duration); + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_hardswish(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + 
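+    // placeholder: like the other stubs in this block it only traces entry/exit;
+    // the actual QNN op mapping is still on the todo list in the file header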
QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_qnn_cpy(src0, dst, nullptr); + (void) src1; +} + + +static void ggml_qnn_mul_mat_id(const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); + +} + + +static void ggml_qnn_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +static void ggml_qnn_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + (void) src0; + (void) src1; + (void) dst; + QNN_LOG_DEBUG("call %s\n", __func__); + + QNN_LOG_DEBUG("call %s done\n", __func__); +} + + +bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { + ggml_qnn_func_t func = nullptr; + ggml_qnn_func_common_t func_common = nullptr; + + switch (tensor->op) { + case GGML_OP_ADD: + func = ggml_qnn_add; + break; + + case GGML_OP_MUL: + func_common = ggml_qnn_hanlde_op; + break; + + case GGML_OP_MUL_MAT: + func = ggml_qnn_mul_mat; + break; + + case GGML_OP_REPEAT: + func = ggml_qnn_repeat; + break; + case GGML_OP_GET_ROWS: + func = ggml_qnn_get_rows; + break; + case GGML_OP_DUP: + func = ggml_qnn_dup; + break; + + case GGML_OP_ACC: + func = ggml_qnn_acc; + break; + 
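+        // GGML_OP_MUL above is routed through the generic ggml_qnn_hanlde_op helper
+        // (func_common), which resolves the QNN op name at runtime; the remaining
+        // ops are dispatched through dedicated per-op handlers via func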
+
+        case GGML_OP_DIV:
+            func = ggml_qnn_div;
+            break;
+
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(tensor)) {
+                case GGML_UNARY_OP_GELU:
+                    func = ggml_qnn_gelu;
+                    break;
+                case GGML_UNARY_OP_SILU:
+                    func = ggml_qnn_silu;
+                    break;
+                case GGML_UNARY_OP_GELU_QUICK:
+                    func = ggml_qnn_gelu_quick;
+                    break;
+                case GGML_UNARY_OP_TANH:
+                    func = ggml_qnn_tanh;
+                    break;
+                case GGML_UNARY_OP_RELU:
+                    func = ggml_qnn_relu;
+                    break;
+                case GGML_UNARY_OP_HARDSIGMOID:
+                    func = ggml_qnn_hardsigmoid;
+                    break;
+                case GGML_UNARY_OP_HARDSWISH:
+                    func = ggml_qnn_hardswish;
+                    break;
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_NORM:
+            func = ggml_qnn_norm;
+            break;
+        case GGML_OP_GROUP_NORM:
+            func = ggml_qnn_group_norm;
+            break;
+        case GGML_OP_CONCAT:
+            func = ggml_qnn_concat;
+            break;
+        case GGML_OP_UPSCALE:
+            func = ggml_qnn_upscale;
+            break;
+        case GGML_OP_PAD:
+            func = ggml_qnn_pad;
+            break;
+        case GGML_OP_LEAKY_RELU:
+            func = ggml_qnn_leaky_relu;
+            break;
+        case GGML_OP_RMS_NORM:
+            func = ggml_qnn_rms_norm;
+            break;
+        case GGML_OP_MUL_MAT_ID:
+            func = ggml_qnn_mul_mat_id;
+            break;
+        case GGML_OP_SCALE:
+            func = ggml_qnn_scale;
+            break;
+        case GGML_OP_SQR:
+            func = ggml_qnn_sqr;
+            break;
+        case GGML_OP_CLAMP:
+            func = ggml_qnn_clamp;
+            break;
+        case GGML_OP_CPY:
+            func = ggml_qnn_cpy;
+            break;
+        case GGML_OP_CONT:
+            func = ggml_qnn_dup;
+            break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            func = ggml_qnn_nop;
+            break;
+        case GGML_OP_DIAG_MASK_INF:
+            func = ggml_qnn_diag_mask_inf;
+            break;
+        case GGML_OP_SOFT_MAX:
+            func = ggml_qnn_soft_max;
+            break;
+        case GGML_OP_ROPE:
+            func = ggml_qnn_rope;
+            break;
+        case GGML_OP_IM2COL:
+            func = ggml_qnn_im2col;
+            break;
+        case GGML_OP_POOL_2D:
+            func = ggml_qnn_pool2d;
+            break;
+        case GGML_OP_SUM_ROWS:
+            func = ggml_qnn_sum_rows;
+            break;
+        case GGML_OP_ARGSORT:
+            func = ggml_qnn_argsort;
+            break;
+        default:
+            return false;
+    }
+
+    if (nullptr != func)
+        func(tensor->src[0], tensor->src[1], tensor);
+
+    if (nullptr != func_common)
+        func_common(tensor->op, tensor->src[0], tensor->src[1], tensor);
+
+    return true;
+}
+
+
+struct ggml_backend_qnn_buffer_context {
+    ~ggml_backend_qnn_buffer_context() {
+        if (buffer) {
+            free(buffer);
+        }
+
+        for (auto * sub_buffer : sub_buffers) {
+            free(sub_buffer);
+        }
+
+        for (auto * qnn_tensor : qnn_tensors) {
+            free_qnn_tensor(*qnn_tensor);
+            free(qnn_tensor);
+        }
+
+        sub_buffers.clear();
+        qnn_tensors.clear();
+    }
+    void * buffer = nullptr;
+
+    struct ggml_backend_qnn_context * backend_ctx = nullptr;
+
+    size_t buffer_size = 0;
+    std::vector<void *> sub_buffers;
+    std::vector<Qnn_Tensor_t *> qnn_tensors;
+};
+
+
+static const char * ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) {
+    GGML_UNUSED(buffer);
+    return "QNN";
+}
+
+
+[[maybe_unused]] GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name == ggml_backend_qnn_buffer_get_name;
+}
+
+
+GGML_CALL static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context;
+    delete ctx;
+}
+
+
+GGML_CALL static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) {
+    ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context;
+
+    return ctx->buffer;
+}
+
+
+GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    Qnn_ErrorHandle_t error =
QNN_SUCCESS; + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + + static int idx = 0; + char tensor_name[GGML_MAX_NAME] = { 0 }; + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%2d", idx++); + + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + //TODO:only support FP32 & FP16 + Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + + if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; + } + Qnn_Tensor_t qnn_tensor = { + .version= QNN_TENSOR_VERSION_1, + {.v1= { + .id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, + .rank = ggml_get_tensor_rank(tensor), + .dimensions = dimensions, + .memType = QNN_TENSORMEMTYPE_RAW, + {.clientBuf = {.data = nullptr, + .dataSize = 0}}}} + }; + Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + if (nullptr == p_qnn_tensor) { + QNN_LOG_WARN("calloc failed"); + return; + } + error = deep_copy_qnn_tensors(qnn_tensor, *p_qnn_tensor); + if (error != QNN_SUCCESS) { + free(p_qnn_tensor); + QNN_LOG_DEBUG("init tensor failed"); + return; + } + tensor->extra = p_qnn_tensor; + ctx->qnn_tensors.push_back(p_qnn_tensor); +} + + +GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); +} + + +GGML_CALL static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memcpy(data, (const char *)tensor->data + offset, size); +} + + +GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + GGML_UNUSED(buffer); + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + + return false; +} + + +GGML_CALL static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + + memset(ctx->buffer, value, ctx->buffer_size); +} + + +[[maybe_unused]] GGML_CALL static void ggml_backend_qnn_buffer_reset(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + for (auto * sub_buffer : ctx->sub_buffers) { + free(sub_buffer); + } + ctx->sub_buffers.clear(); +} + + +static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { + /* .get_name = */ ggml_backend_qnn_buffer_get_name, + /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, + /* .get_base = */ ggml_backend_qnn_buffer_get_base, + /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, + /* .clear = */ ggml_backend_qnn_buffer_clear, + 
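+    // note: ggml_backend_qnn_buffer_reset is implemented above but not hooked up
+    // yet (hence the [[maybe_unused]] attribute); the slot below stays nullptr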
/* .reset           = */ nullptr,
+};
+
+
+GGML_CALL static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+    return "QNN";
+}
+
+
+static void * ggml_qnn_host_malloc(size_t n) {
+    void * data = nullptr;
+    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
+    if (result != 0) {
+        QNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__);
+        return nullptr;
+    }
+
+    return data;
+}
+
+
+GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context;
+
+    const size_t size_page = sysconf(_SC_PAGESIZE);
+
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    //TODO: use pre-allocated buffer in internal memory pool
+    ctx->buffer = ggml_qnn_host_malloc(size_aligned);
+    ctx->buffer_size = size_aligned;
+
+    ctx->backend_ctx = &g_qnn_mgr[g_current_device];
+
+    if (nullptr == ctx->buffer) {
+        QNN_LOG_WARN("%s: failed to allocate %.2f MiB\n", __func__, size / (1024.0 * 1024.0));
+        return nullptr;
+    }
+
+    return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size);
+}
+
+
+GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+    return 32;
+}
+
+
+//TODO: this is an experimental value; it works fine with whisper/llm/minicpm-v inference on Android
+GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+
+    return (96 * 1024 * 1024);
+}
+
+
+GGML_CALL static bool ggml_backend_qnn_buffer_type_supports_backend(ggml_backend_buffer_type_t buft,
+                                                                    ggml_backend_t backend) {
+    GGML_UNUSED(buft);
+
+    return ggml_backend_is_qnn(backend) || ggml_backend_is_cpu(backend);
+}
+
+
+GGML_CALL static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(buft);
+    return true;
+}
+
+
+GGML_CALL static const char * ggml_backend_qnn_name(ggml_backend_t backend) {
+    return "QNN";
+}
+
+
+GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) {
+    QNN_LOG_INFO("enter %s", __func__);
+    ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context;
+    QNN_LOG_DEBUG("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name);
+
+    qnn_instance * instance = (qnn_instance*)g_qnn_mgr[ctx->device].instance;
+    if (instance != nullptr) {
+        std::map<std::string, std::tuple<Qnn_GraphHandle_t, Qnn_Tensor_t *, Qnn_Tensor_t *, Qnn_Tensor_t *>>::iterator graph_it;
+        for (graph_it = instance->_qnn_graph_map.begin(); graph_it != instance->_qnn_graph_map.end(); graph_it++) {
+            auto & graph_item = graph_it->second;
+            Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item);
+            GGML_UNUSED(graph_handle);
+            QNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str());
+        }
+        instance->_qnn_graph_map.clear();
+
+        instance->qnn_finalize();
+        delete instance;
+        g_qnn_mgr[ctx->device].instance = nullptr;
+    }
+
+    if (g_qnn_mgr[ctx->device].backend != nullptr) {
+        delete backend;
+        g_qnn_backend = nullptr;
+        g_qnn_mgr[ctx->device].backend = nullptr;
+    }
+    QNN_LOG_INFO("leave %s", __func__);
+}
+
+
+GGML_CALL static ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_backend_t backend) {
+    ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context;
+
+    return ggml_backend_qnn_buffer_type(ctx->device);
+}
+
+
+GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
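+    // iterate the ggml compute graph; empty tensors and pure view ops
+    // (RESHAPE/TRANSPOSE/VIEW/PERMUTE/NONE) carry no computation and are skipped,
+    // everything else is handed to ggml_qnn_compute_forward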
enum ggml_status result = GGML_STATUS_SUCCESS;
+    ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context;
+    GGML_UNUSED(ctx);
+
+    ggml_compute_params params = {};
+    params.type = GGML_TASK_TYPE_COMPUTE;
+    params.ith = 0;
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+            continue;
+        }
+        bool ok = ggml_qnn_compute_forward(&params, node);
+        if (!ok) {
+            QNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+        }
+    }
+
+    return result;
+}
+
+
+GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+    GGML_UNUSED(backend);
+
+    return (ggml_qnn_can_handle_op(op, true));
+}
+
+
+//note: this function is meant to be used with the proposed/refined ggml backend subsystem in this PR:
+//      https://github.com/ggerganov/llama.cpp/pull/7641
+//      a new ggml backend that only uses system memory (its ggml_backend_xxx_buffer_is_host
+//      returns true) can follow this style to get mixed inference between CPU&GPU / CPU&NPU
+//      very easily
+GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) {
+    GGML_UNUSED(backend);
+
+    return ggml_qnn_compute_forward(nullptr, (ggml_tensor*)tensor);
+}
+
+
+static ggml_backend_i ggml_backend_qnn_interface = {
+    /* .get_name                = */ ggml_backend_qnn_name,
+    /* .free                    = */ ggml_backend_qnn_free,
+    /* .get_default_buffer_type = */ ggml_backend_qnn_get_default_buffer_type,
+    /* .set_tensor_async        = */ nullptr,
+    /* .get_tensor_async        = */ nullptr,
+    /* .cpy_tensor_async        = */ nullptr,
+    /* .synchronize             = */ nullptr,
+    /* .graph_plan_create       = */ nullptr,
+    /* .graph_plan_free         = */ nullptr,
+    /* .graph_plan_compute      = */ nullptr,
+    /* .graph_compute           = */ ggml_backend_qnn_graph_compute,
+    /* .supports_op             = */ ggml_backend_qnn_supports_op,
+    /* .offload_op              = */ ggml_backend_qnn_offload_op,
+    /* .event_new               = */ nullptr,
+    /* .event_free              = */ nullptr,
+    /* .event_record            = */ nullptr,
+    /* .event_wait              = */ nullptr,
+    /* .event_synchronize       = */ nullptr,
+};
+
+
+static ggml_guid_t ggml_backend_qnn_guid() {
+    static ggml_guid guid = {0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, 0x92, 0xa3, 0xb4, 0xc5,
+                             0xd6, 0xe7, 0xf8, 0x09};
+    return &guid;
+}
+
+
+static ggml_backend_t ggml_backend_qnn_reg_init(const char * params, void * user_data) {
+    if (nullptr == params) {
+        //QNN library path
+        //can be hardcoded to "/data/local/tmp/" for an Android command line application
+        //or specified in the JNI layer for an Android APK
+        params = "/data/local/tmp/";
+    }
+    ggml_backend_t qnn_backend = ggml_backend_qnn_init((int) (intptr_t) user_data, params);
+
+    return qnn_backend;
+}
+
+
+bool ggml_backend_is_qnn(ggml_backend_t backend) {
+    return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_qnn_guid());
+}
+
+
+void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) {
+    GGML_ASSERT(ggml_backend_is_qnn(backend));
+
+    struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *)backend->context;
+    ctx->threads = n_threads;
+}
+
+
+const char * ggml_backend_qnn_get_name(ggml_backend_t backend) {
+    return backend->iface.get_name(backend);
+}
+
+
+int ggml_backend_qnn_get_device_count() {
+    return GGML_QNN_MAX_DEVICES;
+}
+
+
+void ggml_backend_qnn_get_device_description(size_t dev_num, char * description, size_t description_size)
{ + if (nullptr == description || 0 == description_size) { + QNN_LOG_WARN("invalid param"); + return; + } + + if (dev_num >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_WARN("invalid param"); + return; + } + + snprintf(description, description_size, "%s", g_qnn_mgr[dev_num].name); + QNN_LOG_DEBUG("description:%s", description); +} + + +ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { + if (device_index >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is out of range [0, %d]\n", + device_index, GGML_QNN_MAX_DEVICES - 1); + return nullptr; + } + + static struct ggml_backend_buffer_type ggml_backend_buffer_type_qnn = { + /* .iface = */ { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes + /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_qnn_buffer_is_host + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_buffer_type_qnn; +} + + +/** + * + * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU(aka HTP/DSP) + * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer + * @return + */ +ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { + int result = 0; + + if (nullptr == qnn_lib_path) + return nullptr; + + QNN_LOG_DEBUG("device %d", device); + QNN_LOG_DEBUG("qnn_lib_path %s", qnn_lib_path); + if (device >= GGML_QNN_MAX_DEVICES) { + QNN_LOG_ERROR("invalid device %d", device); + return nullptr; + } + + if (nullptr != g_qnn_mgr[device].backend) { + QNN_LOG_ERROR("qnn backend %d(%s) already loaded", device, get_qnn_backend_name(device)); + if (device == g_current_device) { + g_qnn_backend = g_qnn_mgr[device].backend; + QNN_LOG_INFO("re-use cached backend %d(%s)", device, get_qnn_backend_name(device)); + return g_qnn_mgr[device].backend; + } else { + QNN_LOG_INFO("delete previous backend %d(%s)", device, get_qnn_backend_name(device)); + ggml_backend_qnn_free(g_qnn_backend); + } + } + + std::string path = qnn_lib_path; + if (QNN_BACKEND_NPU == device) { + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + QNN_LOG_INFO("QNN DSP backend setenv successfully"); + } else { + QNN_LOG_ERROR("QNN DSP backend setenv failure"); + } + if (0 == setenv("ADSP_LIBRARY_PATH", + (path + + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), + 1)) { + QNN_LOG_INFO("QNN DSP backend setenv successfully"); + } else { + QNN_LOG_ERROR("QNN DSP backend setenv failure"); + } + } else { + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + QNN_LOG_INFO("%s backend setenv successfully\n", get_qnn_backend_name(device)); + } else { + QNN_LOG_ERROR("%s backend setenv failure\n", get_qnn_backend_name(device)); + } + } + + qnn_instance * instance = nullptr; + instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); + result = instance->qnn_init(nullptr); + if (0 != result) { + QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", get_qnn_backend_name(device)); + delete instance; + 
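+        // returning nullptr is treated as fatal by the caller (llama.cpp logs an
+        // error and frees the llama context), so make sure the half-initialized
+        // instance is cleaned up before bailing out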
return nullptr; + } + qnn_interface qnn_interface = instance->get_qnn_interface(); + if (!qnn_interface.is_loaded()) { + QNN_LOG_WARN("qnn subsystem failure\n"); + delete instance; + return nullptr; + } + + std::string device_name = get_qnn_backend_name(device); + QNN_LOG_INFO("qnn device name %s", device_name.c_str()); + instance->init_qnn_graph(device_name.c_str(), false); + g_qnn_mgr[device].instance = instance; + g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); + g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); + + ggml_backend_t qnn_backend = new ggml_backend{ + /* .guid = */ ggml_backend_qnn_guid(), + /* .iface = */ ggml_backend_qnn_interface, + /* .context = */ &g_qnn_mgr[device] + }; + g_qnn_mgr[device].backend = qnn_backend; + g_qnn_backend = g_qnn_mgr[device].backend; + g_current_device = device; + + return qnn_backend; +} + + +extern "C" GGML_CALL int ggml_backend_qnn_reg_devices(void); + +GGML_CALL int ggml_backend_qnn_reg_devices() { + for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { + char name[GGML_MAX_NAME]; + ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); + ggml_backend_register(name, ggml_backend_qnn_reg_init, ggml_backend_qnn_buffer_type(idx), + (void *) (intptr_t)idx); + } + + return GGML_QNN_MAX_DEVICES; +} diff --git a/ggml-qnn.h b/ggml-qnn.h new file mode 100644 index 0000000000000..c61ebd25d9ba6 --- /dev/null +++ b/ggml-qnn.h @@ -0,0 +1,43 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +#define GGML_QNN_MAX_DEVICES 3 + +//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently +enum QNNBackend { + QNN_BACKEND_CPU, + QNN_BACKEND_GPU, + QNN_BACKEND_NPU, + QNN_BACKEND_GGML, //"fake" QNN backend just for compare performance between QNN and original GGML +}; + +GGML_API int ggml_backend_qnn_reg_devices(void); + +/** + * + * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU(aka HTP/DSP) + * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer + * @return + */ +GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char * qnn_lib_path); + +GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); + +GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int thread_counts); + +GGML_API int ggml_backend_qnn_get_device_count(void); + +GGML_API void ggml_backend_qnn_get_device_description(size_t dev_num, char * description, size_t description_size); + +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t dev_num); + +#ifdef __cplusplus +} +#endif diff --git a/llama.cpp b/llama.cpp index 06889126ecdc4..42a9cb2a44981 100644 --- a/llama.cpp +++ b/llama.cpp @@ -19,6 +19,8 @@ # include "ggml-sycl.h" #elif defined(GGML_USE_KOMPUTE) # include "ggml-kompute.h" +#elif defined(GGML_USE_QNN) +# include "ggml-qnn.h" #endif #ifdef GGML_USE_METAL @@ -2377,6 +2379,8 @@ static size_t llama_get_device_count(const llama_model & model) { count = ggml_backend_sycl_get_device_count(); #elif defined(GGML_USE_VULKAN) count = ggml_backend_vk_get_device_count(); +#elif defined(GGML_USE_QNN) + count = ggml_backend_qnn_get_device_count(); #endif #if defined(GGML_USE_RPC) count += model.rpc_servers.size(); @@ -2409,6 +2413,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_ if (buft == nullptr) { LLAMA_LOG_WARN("%s: cannot use GPU %d, check 
`vulkaninfo --summary`\n", __func__, gpu); } +#elif defined(GGML_USE_QNN) + buft = ggml_backend_qnn_buffer_type(gpu); #endif if (buft == nullptr) { @@ -15899,6 +15905,8 @@ size_t llama_max_devices(void) { return GGML_SYCL_MAX_DEVICES; #elif defined(GGML_USE_VULKAN) return GGML_VK_MAX_DEVICES; +#elif defined(GGML_USE_QNN) + return GGML_QNN_MAX_DEVICES; #else return 1; #endif @@ -15914,7 +15922,7 @@ bool llama_supports_mlock(void) { bool llama_supports_gpu_offload(void) { #if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ - defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC) + defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC) || defined(GGML_USE_QNN) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. return true; #else @@ -16225,6 +16233,19 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } +#elif defined(GGML_USE_QNN) + if (model->n_gpu_layers > 0) { + //the second param is data path of prebuit QNN libs provided by Qualcomm + //can be hardcoded to "/data/local/tmp/" for Android command line application + //or specified in JNI layer for Android APK application + ggml_backend_t backend = ggml_backend_qnn_init(model->main_gpu, "/data/local/tmp/"); + if (nullptr == backend) { + LLAMA_LOG_ERROR("%s: failed to initialize QNN backend\n", __func__); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } #endif #if defined(GGML_USE_RPC) if (model->n_gpu_layers > 0) { diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt new file mode 100644 index 0000000000000..15ad7be6f6c88 --- /dev/null +++ b/tests/ggml-qnn/CMakeLists.txt @@ -0,0 +1,60 @@ +cmake_minimum_required(VERSION 3.22.1) +project(ggml-qnn-test) + +set(CMAKE_VERBOSE_MAKEFILE on) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +#set to ON if target Android phone is based on Qualcomm Snapdragon 8 Gen 3 +set(TARGET_SNAPDRAGON_8_GEN3 OFF) + +set(QNN_INC_PATH ${QNN_SDK_PATH}/include/QNN) +set(QNN_LIB_PATH ${QNN_SDK_PATH}/lib/aarch64-android) + +include_directories(${QNN_INC_PATH}) +include_directories(../../) # ggml.h + +set(SOURCE_FILES + ../../ggml.c + ../../ggml-alloc.c + ../../ggml-backend.c + ../../ggml-quants.c + ../../ggml-qnn.cpp + test-qnn-ops.cpp +) + + +message("QNN_SDK_PATH : ${QNN_SDK_PATH}") +message("QNN_INC_PATH : ${QNN_INC_PATH}") +message("QNN_LIB_PATH : ${QNN_LIB_PATH}") + +add_definitions(-D__ARM_NEON) +add_definitions(-DGGML_USE_QNN) + +if(CMAKE_BUILD_TYPE STREQUAL "Release") +add_definitions(-DNDEBUG) +add_definitions(-O3) +endif() + +if (TARGET_SNAPDRAGON_8_GEN3) +# the below build optimization only verified and works well on Qualcomm SM8650-AB Snapdragon 8 Gen 3 +add_definitions(-march=armv8.7-a) +add_definitions(-mcpu=cortex-x1) +add_definitions(-mtune=cortex-x1) + +else() +# the below build optimization might be works well on ALL mainstream Android phone based on Qualcomm mobile SoC +add_definitions(-mcpu=cortex-a72) + +endif() + +add_compile_options("-Wall" "-Wno-sign-compare") + +find_library(LOG_LIB log) + +link_libraries(${LOG_LIB} android) + +add_executable(${TARGET_NAME} + ${SOURCE_FILES} +) diff --git a/tests/ggml-qnn/build-ggml-qnn.sh b/tests/ggml-qnn/build-ggml-qnn.sh new file mode 100755 index 0000000000000..baca02f91347d --- /dev/null +++ b/tests/ggml-qnn/build-ggml-qnn.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +set -e + 
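+
+# script flow: download android-ndk-r26c if it is not already present, verify the
+# local QNN SDK install, then cross-compile the arm64-v8a test binary with the
+# NDK's cmake toolchain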
+#https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct +#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools +QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/ + +ANDROID_NDK=`pwd`/android-ndk-r26c +ANDROID_PLATFORM=android-34 +TARGET=ggml-qnn-test + + +function dump_vars() +{ + echo -e "ANDROID_NDK: ${ANDROID_NDK}" + echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" +} + + +function show_pwd() +{ + echo -e "current working path:$(pwd)\n" +} + + +function check_qnn_sdk() +{ + if [ ! -d ${QNN_SDK_PATH} ]; then + echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check...\n" + exit 1 + fi +} + + +function check_and_download_ndk() +{ + is_android_ndk_exist=1 + + if [ ! -d ${ANDROID_NDK} ]; then + is_android_ndk_exist=0 + fi + + if [ ! -f ${ANDROID_NDK}/build/cmake/android.toolchain.cmake ]; then + is_android_ndk_exist=0 + fi + + if [ ${is_android_ndk_exist} -eq 0 ]; then + + if [ ! -f android-ndk-r26c-linux.zip ]; then + wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip + fi + + unzip android-ndk-r26c-linux.zip + + if [ $? -ne 0 ]; then + printf "failed to download android ndk to %s \n" "${ANDROID_NDK}" + exit 1 + fi + + printf "android ndk saved to ${ANDROID_NDK} \n\n" + else + printf "android ndk already exist:${ANDROID_NDK} \n\n" + fi +} + + +function build_arm64 +{ + cmake -H. -B./out/arm64-v8a -DTARGET_NAME=${TARGET} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH} + + cd ./out/arm64-v8a + make + + ls -lah ${TARGET} + /bin/cp ${TARGET} ../../ + cd - +} + + +function remove_temp_dir() +{ + if [ -d out ]; then + echo "remove out directory in `pwd`" + rm -rf out + fi +} + + +show_pwd +check_and_download_ndk +check_qnn_sdk +dump_vars +remove_temp_dir +build_arm64 diff --git a/tests/ggml-qnn/run-ggml-qnn.sh b/tests/ggml-qnn/run-ggml-qnn.sh new file mode 100755 index 0000000000000..a4c1f22ad70cd --- /dev/null +++ b/tests/ggml-qnn/run-ggml-qnn.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +#https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct +#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools +QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/ +GGML_QNN_TEST=ggml-qnn-test +REMOTE_PATH=/data/local/tmp/ + + +function check_qnn_sdk() +{ + if [ ! -d ${QNN_SDK_PATH} ]; then + echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct...\n" + exit 1 + fi +} + + +function check_qnn_libs() +{ + #reuse the cached qnn libs in Android phone + adb shell ls ${REMOTE_PATH}/libQnnCpu.so + if [ $? 
-eq 0 ]; then + printf "QNN libs already exist on Android phone\n" + else + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ + + #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ + fi +} + + +function show_usage() +{ + echo "Usage:" + echo " $0 GGML_OP_ADD 0/1/2" + echo " $0 GGML_OP_MUL 0/1/2" + echo " $0 GGML_OP_MUL_MAT 0/1/2" + echo -e "\n\n\n" +} + + +function main() +{ + check_qnn_libs + + #upload the latest ggml_qnn_test + adb push ${GGML_QNN_TEST} ${REMOTE_PATH} + adb shell chmod +x ${REMOTE_PATH}/${GGML_QNN_TEST} + + case "$ggmlop" in + GGML_OP_ADD) + echo "adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_ADD -b $qnnbackend" + adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_ADD -b $qnnbackend + ;; + + GGML_OP_MUL) + adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_MUL -b $qnnbackend + ;; + + GGML_OP_MUL_MAT) + adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_MUL_MAT -b $qnnbackend + ;; + + *) + printf " \n$arg not supported currently\n" + show_usage + exit 1 + ;; + esac +} + + +check_qnn_sdk + +unset ggmlop +unset qnnbackend +if [ $# == 0 ]; then + show_usage + exit 1 +elif [ $# == 1 ]; then + if [ "$1" == "-h" ]; then + #avoid upload command line program to Android phone in this scenario + show_usage + exit 1 + elif [ "$1" == "help" ]; then + #avoid upload command line program to Android phone in this scenario + show_usage + exit 1 + else + ggmlop=$1 + qnnbackend=0 + fi +elif [ $# == 2 ]; then + ggmlop=$1 + qnnbackend=$2 +else + show_usage + exit 1 +fi +main $arg diff --git a/tests/ggml-qnn/test-qnn-ops.cpp b/tests/ggml-qnn/test-qnn-ops.cpp new file mode 100644 index 0000000000000..27967270bdcd4 --- /dev/null +++ b/tests/ggml-qnn/test-qnn-ops.cpp @@ -0,0 +1,450 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "ggml-qnn.h" + +#define GGML_QNN_DEBUG 1 +#define GGML_QNN_LOGBUF_LEN 4096 + +#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if GGML_QNN_DEBUG +#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define QNN_LOG_DEBUG(...) 
+#endif
+
+static void tensor_dump(const ggml_tensor * tensor, const char * name);
+
+#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
+
+static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) {
+    static std::mutex ggml_qnn_log_internal_mutex;
+    static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN];
+
+    {
+        std::lock_guard<std::mutex> lock(ggml_qnn_log_internal_mutex);
+        va_list args;
+        va_start(args, format);
+        int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line);
+        int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args);
+        if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) {
+            //for Android command line application or WoA
+            printf("%s\n", s_ggml_qnn_log_internal_buf);
+        }
+        va_end(args);
+    }
+}
+
+
+static const char * get_qnn_backend_name(int n_backend_type) {
+    switch (n_backend_type) {
+        case 0:
+            return "QNN-CPU";
+        case 1:
+            return "QNN-GPU";
+        case 2:
+            return "QNN-NPU(HTP/DSP)";
+        case 3:
+            return "ggml";
+        default:
+            return "unknown";
+    }
+}
+
+
+static bool ggml_graph_compute_helper(
+        struct ggml_backend * backend,
+        struct ggml_cgraph * graph,
+        std::vector<uint8_t> & buf,
+        int n_threads,
+        ggml_abort_callback abort_callback,
+        void * abort_callback_data) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    plan.abort_callback = abort_callback;
+    plan.abort_callback_data = abort_callback_data;
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    if (ggml_backend_is_cpu(backend)) {
+        ggml_backend_cpu_set_n_threads(backend, n_threads);
+    }
+
+#ifdef GGML_USE_QNN
+    if (ggml_backend_is_qnn(backend)) {
+        ggml_backend_qnn_set_n_threads(backend, n_threads);
+    }
+#endif
+
+    //a new approach to mixed inference
+    if (nullptr != backend)
+        return ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS;
+    else
+        return ggml_graph_compute(graph, &plan);
+}
+
+
+static void tensor_dump_elements(const ggml_tensor * tensor) {
+    float value = 0;
+    std::ostringstream tmposs;
+    if (tensor->type == GGML_TYPE_F32) {
+        for (int h = 0; h < tensor->ne[3]; h++) {
+            for (int i = 0; i < tensor->ne[2]; i++) {
+                for (int j = 0; j < tensor->ne[1]; j++) {
+                    for (int k = 0; k < tensor->ne[0]; k++) {
+                        // flat index into a contiguous F32 tensor
+                        value = ((float *) tensor->data)[((h * tensor->ne[2] + i) * tensor->ne[1] + j) * tensor->ne[0] + k];
+                        tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value
+                               << " ";
+                    }
+                    if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) {
+                        QNN_LOG_DEBUG("%s", tmposs.str().c_str());
+                    }
+                    tmposs.clear();
+                    tmposs.str("");
+                    //QNN_LOG_DEBUG("\n");
+                }
+            }
+        }
+    }
+
+    //QNN_LOG_DEBUG("\n");
+}
+
+
+static void tensor_dump(const ggml_tensor * tensor, const char * name) {
+    QNN_LOG_DEBUG("dump ggml tensor %s(%s)", name, tensor->name);
+    QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)",
+                  name,
+                  tensor->type, ggml_type_name(tensor->type),
+                  tensor->ne[0], tensor->ne[1], tensor->ne[2],
+                  tensor->nb[0], tensor->nb[1], tensor->nb[2]);
+    tensor_dump_elements(tensor);
+
+    QNN_LOG_DEBUG("\n");
+}
+
+
+static uint32_t get_tensor_rank(const ggml_tensor * tensor) {
+    uint32_t rank = 0;
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) {
+            rank++;
+        }
+    }
+    return rank;
+}
+
+
+static uint32_t get_tensor_data_size(const ggml_tensor * tensor) {
+    size_t data_size =
ggml_row_size(tensor->type, tensor->ne[0]);
+    size_t n_dims = get_tensor_rank(tensor);
+    for (size_t i = 1; i < n_dims; i++) {
+        data_size *= tensor->ne[i];
+    }
+
+    QNN_LOG_DEBUG("get_tensor_data_size %zu", data_size);
+    QNN_LOG_DEBUG("ggml_nbytes(tensor) %zu", ggml_nbytes(tensor));
+
+    return ggml_nbytes(tensor);
+}
+
+
+//ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20
+static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
+    // static RNG initialization (revisit if n_threads stops being constant)
+    static const size_t n_threads = std::thread::hardware_concurrency();
+    static std::vector<std::default_random_engine> generators = []() {
+        std::random_device rd;
+        std::vector<std::default_random_engine> vec;
+        vec.reserve(n_threads);
+        //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
+        for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
+        return vec;
+    }();
+
+    size_t size = ggml_nelements(tensor);
+    std::vector<float> data(size);
+
+    auto init_thread = [&](size_t ith, size_t start, size_t end) {
+        std::uniform_real_distribution<float> distribution(min, max);
+        for (size_t i = start; i < end; i++) {
+            data[i] = distribution(generators[ith]);
+        }
+    };
+
+    std::vector<std::thread> threads;
+    threads.reserve(n_threads);
+    for (size_t i = 0; i < n_threads; i++) {
+        size_t start = i*size/n_threads;
+        size_t end   = (i+1)*size/n_threads;
+        threads.emplace_back(init_thread, i, start, end);
+    }
+    for (auto & t : threads) {
+        t.join();
+    }
+    if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
+        ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
+    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
+        GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
+        std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
+        std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
+        const float * im = imatrix.data();
+        if (!ggml_quantize_requires_imatrix(tensor->type)) {
+            // when the imatrix is optional, we want to test both quantization with and without imatrix
+            // use one of the random numbers to decide
+            if (data[0] > 0.5f*(min + max)) {
+                im = nullptr;
+            }
+        }
+        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
+        GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
+        ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
+    } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
+        // This is going to create some weird integers though.
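+        // data[] holds floats; copying ggml_nbytes() of raw float bytes into an
+        // integer tensor just reinterprets the bit patterns, which is good enough
+        // for these smoke tests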
+        ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
+    } else {
+        GGML_ASSERT(false);
+    }
+}
+
+
+//ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L310
+static void initialize_tensors(ggml_context * ctx) {
+    for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+        init_tensor_uniform(t);
+    }
+}
+
+
+static void show_usage() {
+    printf(" " \
+           "\nUsage: test_qnn_ops [options]\n" \
+           "\n" \
+           "Options:\n" \
+           " -t GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MULMAT\n" \
+           " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU)\n" \
+           " ?/h print usage information\n\n"
+    );
+}
+
+
+int main(int argc, char * argv[]) {
+    int64_t n_begin_time   = 0LL;
+    int64_t n_end_time     = 0LL;
+    int64_t n_duration     = 0LL;
+    size_t  ctx_size       = 0;
+    int     sizey          = 4;
+    int     sizex          = 4;
+    int     num_threads    = 4;
+    int     n_backend_type = QNN_BACKEND_CPU;
+    int     n_ggml_op_type = GGML_OP_ADD;
+
+    struct ggml_context * ctx    = nullptr;
+    struct ggml_cgraph  * gf     = nullptr;
+    struct ggml_tensor  * src0   = nullptr;
+    struct ggml_tensor  * src1   = nullptr;
+    struct ggml_tensor  * dst    = nullptr;
+    ggml_backend_t backend       = nullptr;
+    ggml_backend_buffer_t buffer = nullptr;
+    ggml_type qtype              = GGML_TYPE_F32;
+    std::vector<uint8_t> work_buffer;
+
+    for (int i = 1; i < argc; i++) {
+        if (0 == strcmp(argv[i], "-t")) {
+            if (i + 1 < argc) {
+                if (0 == memcmp(argv[i + 1], "GGML_OP_ADD", 11)) {
+                    n_ggml_op_type = GGML_OP_ADD;
+                } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) {
+                    n_ggml_op_type = GGML_OP_MUL_MAT;
+                } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL", 11)) {
+                    n_ggml_op_type = GGML_OP_MUL;
+                } else {
+                    show_usage();
+                    return 1;
+                }
+                i++;
+            }
+        } else if (0 == strcmp(argv[i], "-b")) {
+            if (i + 1 < argc) {
+                int backend = atoi(argv[i + 1]);
+                if (backend <= QNN_BACKEND_NPU)
+                    n_backend_type = backend;
+                else {
+                    show_usage();
+                    return 1;
+                }
+                i++;
+            }
+        } else {
+            show_usage();
+            return 1;
+        }
+    }
+
+    QNN_LOG_DEBUG("enter qnn_ggml_op\n");
+    QNN_LOG_DEBUG("ggml op:%d(%s)", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type));
+
+    n_begin_time = ggml_time_us();
+    srand(time(NULL));
+
+    ctx_size += 1024 * 1024 * 32;
+    QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size,
+                  (ctx_size / 1024 / 1024));
+
+    struct ggml_init_params params = {
+            /*.mem_size   =*/ ctx_size,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ 0
+    };
+
+    if (n_backend_type != QNN_BACKEND_GGML) {
+        params.no_alloc = true;
+        backend = ggml_backend_qnn_init(n_backend_type, "/data/local/tmp/");
+        if (nullptr == backend) {
+            QNN_LOG_ERROR("create qnn backend %d(%s) failed", n_backend_type, get_qnn_backend_name(n_backend_type));
+            return 1;
+        }
+    }
+
+    ctx = ggml_init(params);
+    if (!ctx) {
+        QNN_LOG_ERROR("%s: ggml_init() failed\n", __func__);
+        return 2;
+    }
+
+    QNN_LOG_DEBUG("creating new tensors\n");
+    QNN_LOG_DEBUG("ggml_blck_size(%s) %d", ggml_type_name(qtype), ggml_blck_size(qtype));
+    QNN_LOG_DEBUG("ggml_type_size(%s) %zu", ggml_type_name(qtype), ggml_type_size(qtype));
+    if (qtype != GGML_TYPE_F32) {
+        sizex = ggml_blck_size(qtype);
+    }
+
+    src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_set_input(src0);
+    src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
+    ggml_set_input(src1);
+
+    switch (n_ggml_op_type) {
+        case GGML_OP_ADD:
+            dst = ggml_add(ctx, src0, src1);
+            break;
+        case GGML_OP_MUL:
+            dst = ggml_mul(ctx, src0, src1);
+            break;
+        case GGML_OP_MUL_MAT:
+            dst = ggml_mul_mat(ctx, src0, src1);
+            break;
+        default:
+            QNN_LOG_WARN("ggml op %d(%s) not supported",
n_ggml_op_type, + ggml_op_name((enum ggml_op) n_ggml_op_type)); + ggml_free(ctx); + ggml_backend_free(backend); + return 3; + } + + ggml_set_output(dst); +#ifdef GGML_USE_QNN + if (n_backend_type != QNN_BACKEND_GGML) { + buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); + if (!buffer) { + QNN_LOG_ERROR("%s: failed to allocate backend buffer\n", __func__); + ggml_free(ctx); + ggml_backend_free(backend); + return 4; + } + } +#endif + + QNN_LOG_DEBUG("creating compute graph\n"); + gf = ggml_new_graph(ctx); + ggml_build_forward_expand(gf, dst); + +#if 0 + ggml_set_f32(src0, (rand() % 100 + 1)); + ggml_set_f32(src1, (rand() % 100 + 1)); + ggml_set_f32(dst, 0.0f); +#else + if (n_backend_type != QNN_BACKEND_GGML) { + initialize_tensors(ctx); + } +#endif + + ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); + if (get_tensor_data_size(dst) < (32 * 32)) { + QNN_LOG_DEBUG("dump tensors:\n"); + TENSOR_DUMP(src0); + TENSOR_DUMP(src1); + TENSOR_DUMP(dst); + } else { + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + } + + ggml_free(ctx); + ggml_backend_buffer_free(buffer); + ggml_backend_free(backend); + + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); + + return 0; +} From 9c872cbbce2fb76b11766fb4012e9206b27726b9 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Wed, 5 Jun 2024 12:06:17 +0800 Subject: [PATCH 03/16] refine ggml-qnn-ut program and script to make reviewers happy --- tests/ggml-qnn/CMakeLists.txt | 2 +- tests/ggml-qnn/build-ggml-qnn.sh | 95 --------- tests/ggml-qnn/ggml-qnn-ut-build-run.sh | 198 ++++++++++++++++++ .../{test-qnn-ops.cpp => ggml-qnn-ut.cpp} | 0 tests/ggml-qnn/run-ggml-qnn.sh | 108 ---------- 5 files changed, 199 insertions(+), 204 deletions(-) delete mode 100755 tests/ggml-qnn/build-ggml-qnn.sh create mode 100755 tests/ggml-qnn/ggml-qnn-ut-build-run.sh rename tests/ggml-qnn/{test-qnn-ops.cpp => ggml-qnn-ut.cpp} (100%) delete mode 100755 tests/ggml-qnn/run-ggml-qnn.sh diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt index 15ad7be6f6c88..a78bdaeaf8009 100644 --- a/tests/ggml-qnn/CMakeLists.txt +++ b/tests/ggml-qnn/CMakeLists.txt @@ -21,7 +21,7 @@ set(SOURCE_FILES ../../ggml-backend.c ../../ggml-quants.c ../../ggml-qnn.cpp - test-qnn-ops.cpp + ggml-qnn-ut.cpp ) diff --git a/tests/ggml-qnn/build-ggml-qnn.sh b/tests/ggml-qnn/build-ggml-qnn.sh deleted file mode 100755 index baca02f91347d..0000000000000 --- a/tests/ggml-qnn/build-ggml-qnn.sh +++ /dev/null @@ -1,95 +0,0 @@ -#!/bin/bash - -set -e - -#https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct 
-#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools -QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/ - -ANDROID_NDK=`pwd`/android-ndk-r26c -ANDROID_PLATFORM=android-34 -TARGET=ggml-qnn-test - - -function dump_vars() -{ - echo -e "ANDROID_NDK: ${ANDROID_NDK}" - echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" -} - - -function show_pwd() -{ - echo -e "current working path:$(pwd)\n" -} - - -function check_qnn_sdk() -{ - if [ ! -d ${QNN_SDK_PATH} ]; then - echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check...\n" - exit 1 - fi -} - - -function check_and_download_ndk() -{ - is_android_ndk_exist=1 - - if [ ! -d ${ANDROID_NDK} ]; then - is_android_ndk_exist=0 - fi - - if [ ! -f ${ANDROID_NDK}/build/cmake/android.toolchain.cmake ]; then - is_android_ndk_exist=0 - fi - - if [ ${is_android_ndk_exist} -eq 0 ]; then - - if [ ! -f android-ndk-r26c-linux.zip ]; then - wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip - fi - - unzip android-ndk-r26c-linux.zip - - if [ $? -ne 0 ]; then - printf "failed to download android ndk to %s \n" "${ANDROID_NDK}" - exit 1 - fi - - printf "android ndk saved to ${ANDROID_NDK} \n\n" - else - printf "android ndk already exist:${ANDROID_NDK} \n\n" - fi -} - - -function build_arm64 -{ - cmake -H. -B./out/arm64-v8a -DTARGET_NAME=${TARGET} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH} - - cd ./out/arm64-v8a - make - - ls -lah ${TARGET} - /bin/cp ${TARGET} ../../ - cd - -} - - -function remove_temp_dir() -{ - if [ -d out ]; then - echo "remove out directory in `pwd`" - rm -rf out - fi -} - - -show_pwd -check_and_download_ndk -check_qnn_sdk -dump_vars -remove_temp_dir -build_arm64 diff --git a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh new file mode 100755 index 0000000000000..c7bff2ee9c20e --- /dev/null +++ b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh @@ -0,0 +1,198 @@ +#!/bin/bash + +set -e + +#https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct +#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools +QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/ + +ANDROID_NDK=`pwd`/android-ndk-r26c +ANDROID_PLATFORM=android-34 + +GGML_QNN_UT=ggml-qnn-ut +REMOTE_PATH=/data/local/tmp/ + + +function dump_vars() +{ + echo -e "ANDROID_NDK: ${ANDROID_NDK}" + echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" +} + + +function show_pwd() +{ + echo -e "current working path:$(pwd)\n" +} + + +function check_qnn_sdk() +{ + if [ ! -d ${QNN_SDK_PATH} ]; then + echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct...\n" + exit 1 + fi +} + + +function check_and_download_ndk() +{ + is_android_ndk_exist=1 + + if [ ! -d ${ANDROID_NDK} ]; then + is_android_ndk_exist=0 + fi + + if [ ! -f ${ANDROID_NDK}/build/cmake/android.toolchain.cmake ]; then + is_android_ndk_exist=0 + fi + + if [ ${is_android_ndk_exist} -eq 0 ]; then + + if [ ! -f android-ndk-r26c-linux.zip ]; then + wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip + fi + + unzip android-ndk-r26c-linux.zip + + if [ $? 
-ne 0 ]; then
+            printf "failed to download android ndk to %s \n" "${ANDROID_NDK}"
+            exit 1
+        fi
+
+        printf "android ndk saved to ${ANDROID_NDK} \n\n"
+    else
+        printf "android ndk already exists: ${ANDROID_NDK} \n\n"
+    fi
+}
+
+
+function build_arm64()
+{
+    cmake -H. -B./out/arm64-v8a -DTARGET_NAME=${GGML_QNN_UT} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH}
+
+    cd ./out/arm64-v8a
+    make
+
+    ls -lah ${GGML_QNN_UT}
+    /bin/cp ${GGML_QNN_UT} ../../
+    cd -
+}
+
+
+function remove_temp_dir()
+{
+    if [ -d out ]; then
+        echo "remove out directory in `pwd`"
+        rm -rf out
+    fi
+}
+
+
+function check_qnn_libs()
+{
+    #reuse the cached qnn libs on the Android phone
+    adb shell ls ${REMOTE_PATH}/libQnnCpu.so
+    if [ $? -eq 0 ]; then
+        printf "QNN libs already exist on Android phone\n"
+    else
+        adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/
+        adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/
+        adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/
+
+        #the QNN NPU(aka HTP/DSP) backend has only been verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3)
+        adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/
+        adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/
+        adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/
+        adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/
+        adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/
+    fi
+}
+
+
+function build_ggml_qnn_ut()
+{
+    show_pwd
+    check_and_download_ndk
+    check_qnn_sdk
+    dump_vars
+    remove_temp_dir
+    build_arm64
+}
+
+
+function run_ggml_qnn_ut()
+{
+    check_qnn_libs
+
+    #upload the latest ggml-qnn-ut binary
+    adb push ${GGML_QNN_UT} ${REMOTE_PATH}
+    adb shell chmod +x ${REMOTE_PATH}/${GGML_QNN_UT}
+
+    case "$ggmlop" in
+        GGML_OP_ADD)
+            echo "adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend"
+            adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend
+            ;;
+
+        GGML_OP_MUL)
+            adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL -b $qnnbackend
+            ;;
+
+        GGML_OP_MUL_MAT)
+            adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL_MAT -b $qnnbackend
+            ;;
+
+        *)
+            printf " \n$ggmlop not supported currently\n"
+            show_usage
+            exit 1
+            ;;
+    esac
+}
+
+
+function show_usage()
+{
+    echo "Usage:"
+    echo "  $0 build"
+    echo "  $0 GGML_OP_ADD     0(QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)"
+    echo "  $0 GGML_OP_MUL     0(QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)"
+    echo "  $0 GGML_OP_MUL_MAT 0(QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)"
+    echo -e "\n\n\n"
+}
+
+
+unset ggmlop
+unset qnnbackend
+
+check_qnn_sdk
+
+if [ $# == 0 ]; then
+    show_usage
+    exit 1
+elif [ $# == 1 ]; then
+    if [ "$1" == "-h" ]; then
+        #avoid uploading the command line program to the Android phone in this scenario
+        show_usage
+        exit 1
+    elif [ "$1" == "help" ]; then
+        #avoid uploading the command line program to the Android phone in this scenario
+        show_usage
+        exit 1
+    elif [ "$1" == "build" ]; then
+        build_ggml_qnn_ut
+        exit 0
+    else
+        ggmlop=$1
+        qnnbackend=0
+        run_ggml_qnn_ut
+    fi
+elif [ $# == 2 ]; then
+    ggmlop=$1
+    qnnbackend=$2
+    run_ggml_qnn_ut
+else
+    show_usage
+    exit 1
+fi
diff --git a/tests/ggml-qnn/test-qnn-ops.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp
similarity index 100%
rename from tests/ggml-qnn/test-qnn-ops.cpp
rename to
tests/ggml-qnn/ggml-qnn-ut.cpp diff --git a/tests/ggml-qnn/run-ggml-qnn.sh b/tests/ggml-qnn/run-ggml-qnn.sh deleted file mode 100755 index a4c1f22ad70cd..0000000000000 --- a/tests/ggml-qnn/run-ggml-qnn.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/bash - -#https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct -#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools -QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/ -GGML_QNN_TEST=ggml-qnn-test -REMOTE_PATH=/data/local/tmp/ - - -function check_qnn_sdk() -{ - if [ ! -d ${QNN_SDK_PATH} ]; then - echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct...\n" - exit 1 - fi -} - - -function check_qnn_libs() -{ - #reuse the cached qnn libs in Android phone - adb shell ls ${REMOTE_PATH}/libQnnCpu.so - if [ $? -eq 0 ]; then - printf "QNN libs already exist on Android phone\n" - else - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ - - #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ - fi -} - - -function show_usage() -{ - echo "Usage:" - echo " $0 GGML_OP_ADD 0/1/2" - echo " $0 GGML_OP_MUL 0/1/2" - echo " $0 GGML_OP_MUL_MAT 0/1/2" - echo -e "\n\n\n" -} - - -function main() -{ - check_qnn_libs - - #upload the latest ggml_qnn_test - adb push ${GGML_QNN_TEST} ${REMOTE_PATH} - adb shell chmod +x ${REMOTE_PATH}/${GGML_QNN_TEST} - - case "$ggmlop" in - GGML_OP_ADD) - echo "adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_ADD -b $qnnbackend" - adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_ADD -b $qnnbackend - ;; - - GGML_OP_MUL) - adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_MUL -b $qnnbackend - ;; - - GGML_OP_MUL_MAT) - adb shell ${REMOTE_PATH}/${GGML_QNN_TEST} -t GGML_OP_MUL_MAT -b $qnnbackend - ;; - - *) - printf " \n$arg not supported currently\n" - show_usage - exit 1 - ;; - esac -} - - -check_qnn_sdk - -unset ggmlop -unset qnnbackend -if [ $# == 0 ]; then - show_usage - exit 1 -elif [ $# == 1 ]; then - if [ "$1" == "-h" ]; then - #avoid upload command line program to Android phone in this scenario - show_usage - exit 1 - elif [ "$1" == "help" ]; then - #avoid upload command line program to Android phone in this scenario - show_usage - exit 1 - else - ggmlop=$1 - qnnbackend=0 - fi -elif [ $# == 2 ]; then - ggmlop=$1 - qnnbackend=$2 -else - show_usage - exit 1 -fi -main $arg From 926a8661f31c85499314c3b15f47c0709041ee07 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Wed, 5 Jun 2024 21:10:59 +0800 Subject: [PATCH 04/16] review: replace external declaration with NDK header file --- ggml-qnn.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 9319db227795d..15c6538d1870d 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -50,6 +50,9 @@ #include "ggml-backend-impl.h" +#if (defined __ANDROID__) || (defined 
ANDROID)
+#include <android/log.h>
+#endif

 // =================================================================================================
 //
@@ -58,11 +61,6 @@
 // =================================================================================================
 class qnn_instance;

-
-#if (defined __ANDROID__) || (defined ANDROID)
-extern "C" int __android_log_print(int prio, const char * tag, const char * fmt, ...)
-__attribute__((__format__(printf, 3, 4)));
-#endif
 static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...);

From dd29834c115f5c644b34fb7e60c0175b9890da29 Mon Sep 17 00:00:00 2001
From: "zhou.weiguo"
Date: Thu, 6 Jun 2024 17:12:28 +0800
Subject: [PATCH 05/16] add support for quantized data type Q8_0

---
 ggml-qnn.cpp                            | 176 +++++++++------
 ggml-qnn.h                              |   5 +-
 tests/ggml-qnn/ggml-qnn-ut-build-run.sh |  37 ++--
 tests/ggml-qnn/ggml-qnn-ut.cpp          | 274 ++++++++++++++++--------
 4 files changed, 321 insertions(+), 171 deletions(-)

diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp
index 15c6538d1870d..d0927f22e514a 100644
--- a/ggml-qnn.cpp
+++ b/ggml-qnn.cpp
@@ -72,8 +72,6 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const
 #define RPCMEM_DEFAULT_FLAGS  1
 #define RPCMEM_HEAP_ID_SYSTEM 25

-#define GGML_DUMP_TENSOR(tensor) ggml_tensor_dump(tensor, #tensor)
-
 #define GGML_QNN_LOGBUF_LEN 4096

 #define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend
@@ -195,8 +193,17 @@ static ggml_backend_t g_qnn_backend = nullptr;

 static int g_current_device = QNN_BACKEND_GGML;

-
-//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently
+//according to the QNN SDK Reference Guide,
+//CPU - Choose a non-quantized model. Quantized models are currently incompatible with the CPU backend
+//GPU - Choose a non-quantized model. Quantized models are currently incompatible with the GPU backend
+//HTP - Choose a quantized model. Quantized models are required when running on the HTP backend
+//DSP - Choose a quantized model. Quantized models are required when running on the DSP backend
+//HTA - Choose a quantized model. Quantized models are required when running on the HTA backend
+//
+//only focus on the Qualcomm CPU/GPU/NPU backends in this implementation of the QNN backend for ggml currently
+//Qualcomm CPU: Qualcomm Kryo CPU
+//Qualcomm GPU: Qualcomm Adreno GPU
+//Qualcomm NPU: aka HTP(Hexagon Tensor Processor), ~= cDSP(Compute DSP) + HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator)
 static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = {
     [QNN_BACKEND_CPU] = {.device = 0, .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}},
     [QNN_BACKEND_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}},
@@ -849,6 +856,10 @@ static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) {
             return QNN_DATATYPE_FLOAT_16;
         case GGML_TYPE_F32:
             return QNN_DATATYPE_FLOAT_32;
+        case GGML_TYPE_I8:
+            return QNN_DATATYPE_INT_8;
+        case GGML_TYPE_Q8_0:
+            return QNN_DATATYPE_SFIXED_POINT_8;
         default:
             break;

@@ -903,14 +914,8 @@ static const char * get_qnn_backend_name(int n_backend_type) {
         case 2:
             return "QNN-NPU";
         case 3:
-            return "ggml"; //the default GGML backend, used to compare performance between QNN backend and the default GGML backend
+            return "ggml"; //"fake" QNN backend, used to compare performance between the QNN backend and the original GGML

-#if 0 //QNN cDSP and HTA backend would not be used currently, focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently
-        case 3:
-            return "QNN-cDSP";
-        case 4:
-            return "QNN-HTA";
-#endif
         default:
             return "unknown";
     }
@@ -1720,7 +1725,7 @@ static void ggml_qnn_logcallback(const char * fmt,

     double ms = (double) timestamp / 1000000.0;

-    {
+    if (0) {
         std::lock_guard<std::mutex> lock(log_mutex);

         memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN);
@@ -1770,7 +1775,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
             _qnn_raw_interface.logCreate(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle);
 #endif
     if (nullptr == _qnn_log_handle) {
-        QNN_LOG_WARN("why failed to initialize qnn log\n"); //DSP backend not work on Qualcomm SoC based low-end phone
+        QNN_LOG_WARN("why failed to initialize qnn log\n"); //the NPU backend does not work on Qualcomm SoC based low-end phones
         return 4;
     } else {
         QNN_LOG_DEBUG("initialize qnn log successfully\n");
@@ -2010,14 +2015,14 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum
     const struct ggml_tensor * src0 = tensor->src[0];
     const struct ggml_tensor * src1 = tensor->src[1];

-    const int64_t ne00 = tensor->src[0]->ne[0];
-    const int64_t ne01 = tensor->src[0]->ne[1];
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];

-    const int64_t ne10 = tensor->src[1]->ne[0];
-    const int64_t ne11 = tensor->src[1]->ne[1];
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];

-    const int64_t ne0 = tensor->ne[0];
-    const int64_t ne1 = tensor->ne[1];
+    const int64_t ne0  = tensor->ne[0];
+    const int64_t ne1  = tensor->ne[1];

     GGML_UNUSED(ne0);
     GGML_UNUSED(ne1);
@@ -2057,30 +2062,15 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum
         return false;
     }

-    if (tensor->op == GGML_OP_ADD) {
-        //TODO: this is limitation
-        return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)
-               && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16)
-               && (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16);
-
+    // GPU/NPU inference will be slower than CPU inference when tensor->ne[1] < min batch size
+    if (tensor->ne[1] < 32) {
+        return false;
+    }

-    if (tensor->op == GGML_OP_MUL_MAT) {
-        //TODO: this is limitation
-        return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)
-               && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16)
-               && (src0->type == src1->type) && (src0->type == tensor->type);
+    int qtype = src0->type;
+    return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || qtype == GGML_TYPE_Q8_0)
+           && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);

-        if (tensor->ne[1] < 32) { // GPU/NPU inference will slower then CPU inference when tensor->ne[1] < min batch size
-            return false;
-        }
-
-    }
-
-    //TODO: this is limitation
-    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)
-           && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16)
-           && (src0->type == src1->type) && (src0->type == tensor->type);
 }
@@ -2129,7 +2119,7 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;

     n_begin_time = ggml_time_us();
-#if 1
+
     QNN_LOG_DEBUG("call %s\n", __func__);
     QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
           src0->name,
@@ -2147,17 +2137,23 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0));
     QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1));
     QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2));
-#endif

     QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;

-    src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type);
-    src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type);
-    dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type);
+    src0_qnn_type                = qnn_datatype_from_ggml_datatype(src0->type);
+    src1_qnn_type                = qnn_datatype_from_ggml_datatype(src1->type);
+    dst_qnn_type                 = qnn_datatype_from_ggml_datatype(dst->type);

-    std::string map_entry = std::string(ggml_op_name(ggmlop));
+    uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
+                                     (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]};
+    uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
+                                     (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]};
+    uint32_t dimensions_output[]  = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1],
+                                     (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]};
+
+    std::string map_entry         = std::string(ggml_op_name(ggmlop));
     if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) {
         graph_initialized = true;
         auto & graph_item = instance->_qnn_graph_map[map_entry];
@@ -2197,6 +2193,16 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
         QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};

+        QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
+        QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0);
+        QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type;
+        QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
+        QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1);
+        QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type;
+        QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output;
+        QNN_VER_PTR(*tensor_2)->rank =
ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + Qnn_Tensor_t tensor_inputs[] = { *tensor_0, *tensor_1 @@ -2245,6 +2251,11 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; @@ -2255,10 +2266,6 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - Qnn_Tensor_t tensor_inputs[] = { *tensor_0, *tensor_1 @@ -2337,7 +2344,6 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; n_begin_time = ggml_time_us(); -#if 1 QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, @@ -2355,17 +2361,23 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); -#endif QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - std::string map_entry = std::string(ggml_op_name(ggmlop)); + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { graph_initialized = true; auto & graph_item = instance->_qnn_graph_map[map_entry]; @@ -2401,6 +2413,16 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + 
QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + Qnn_Tensor_t tensor_inputs[] = { *tensor_0, *tensor_1 @@ -2543,7 +2565,7 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr } n_begin_time = ggml_time_us(); -#if 1 + QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, @@ -2561,11 +2583,17 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); -#endif QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + std::string map_entry = std::string(ggml_op_name(ggmlop)); if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { @@ -2606,6 +2634,16 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + Qnn_Tensor_t tensor_inputs[] = { *tensor_0, *tensor_1 @@ -3125,10 +3163,9 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t char tensor_name[GGML_MAX_NAME] = { 0 }; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%2d", idx++); - uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; - //TODO:only support FP32 & FP16 - Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; - Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + Qnn_DataType_t qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); + Qnn_TensorType_t qnn_tensor_type= QNN_TENSOR_TYPE_APP_WRITE; if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { 
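        // per the QNN SDK docs: QNN_TENSOR_TYPE_APP_WRITE marks a tensor the
        // application writes (a graph input) and QNN_TENSOR_TYPE_APP_READ one
        // the application reads back (a graph output)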
        qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE;
@@ -3365,7 +3402,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const

 //note: this function is used with the proposal/refined ggml backend subsystem in this PR:
 // https://github.com/ggerganov/llama.cpp/pull/7641
-// new ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true)
+// any ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true)
 // can follow this style for mixed inference between CPU&GPU / CPU&NPU very easily
 GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) {
     GGML_UNUSED(backend);
@@ -3481,7 +3518,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) {

 /**
  *
- * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU(aka HTP/DSP)
+ * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU
  * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer
  * @return
  */
@@ -3516,22 +3553,21 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) {
                       (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(),
                       1)) {
-            QNN_LOG_INFO("QNN DSP backend setenv successfully");
+            QNN_LOG_INFO("QNN NPU backend setenv successfully");
         } else {
-            QNN_LOG_ERROR("QNN DSP backend setenv failure");
+            QNN_LOG_ERROR("QNN NPU backend setenv failure");
         }
         if (0 == setenv("ADSP_LIBRARY_PATH",
                       (path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(),
                       1)) {
-            QNN_LOG_INFO("QNN DSP backend setenv successfully");
+            QNN_LOG_INFO("QNN NPU backend setenv successfully");
         } else {
-            QNN_LOG_ERROR("QNN DSP backend setenv failure");
+            QNN_LOG_ERROR("QNN NPU backend setenv failure");
         }
     } else {
         if (0 == setenv("LD_LIBRARY_PATH",
-                      (path +
-                      ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(),
+                      path.c_str(),
                       1)) {
             QNN_LOG_INFO("%s backend setenv successfully\n", get_qnn_backend_name(device));
         } else {
diff --git a/ggml-qnn.h b/ggml-qnn.h
index c61ebd25d9ba6..9ea3dcda62c64 100644
--- a/ggml-qnn.h
+++ b/ggml-qnn.h
@@ -10,19 +10,18 @@ extern "C" {

 #define GGML_QNN_MAX_DEVICES 3

-//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/NPU(aka HTP/DSP) backend currently
 enum QNNBackend {
     QNN_BACKEND_CPU,
     QNN_BACKEND_GPU,
     QNN_BACKEND_NPU,
-    QNN_BACKEND_GGML, //"fake" QNN backend just for compare performance between QNN and original GGML
+    QNN_BACKEND_GGML, //"fake" QNN backend, used to compare performance between QNN and the original GGML
 };

 GGML_API int ggml_backend_qnn_reg_devices(void);

 /**
  *
- * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU(aka HTP/DSP)
+ * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU
 * @param qnn_lib_path qnn library path, such as "/data/local/tmp/" on Android or specified in JNI layer
 * @return
 */
diff --git a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh
index c7bff2ee9c20e..192f2f4bda2f5 100755
--- a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh
+++ b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh
@@ -4,7 +4,8 @@ set -e

 #https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct
 #https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools
-QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/
+#QNN SDK released on 20240531
+QNN_SDK_PATH=/opt/qcom/aistack/qairt/2.23.0.240531/
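+#note: as the -/+ lines above show, the SDK root directory changed from
+#      /opt/qcom/aistack/qnn/<version>/ to /opt/qcom/aistack/qairt/<version>/ with this newer SDK release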
ANDROID_NDK=`pwd`/android-ndk-r26c ANDROID_PLATFORM=android-34 @@ -89,6 +90,23 @@ function remove_temp_dir() } +function update_qnn_libs() +{ + check_qnn_sdk + + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ + + #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ +} + + function check_qnn_libs() { #reuse the cached qnn libs in Android phone @@ -96,16 +114,7 @@ function check_qnn_libs() if [ $? -eq 0 ]; then printf "QNN libs already exist on Android phone\n" else - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ - - #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ + update_qnn_libs fi } @@ -155,7 +164,8 @@ function run_ggml_qnn_ut() function show_usage() { echo "Usage:" - echo " $0 build" + echo " $0 build (build Android command line UT program)" + echo " $0 updateqnnlibs (upload the latest QNN libs to Android phone)" echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" echo " $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" @@ -183,6 +193,9 @@ elif [ $# == 1 ]; then elif [ "$1" == "build" ]; then build_ggml_qnn_ut exit 0 + elif [ "$1" == "updateqnnlibs" ]; then + update_qnn_libs + exit 0 else ggmlop=$1 qnnbackend=0 diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 27967270bdcd4..1041252f3770f 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -87,7 +87,7 @@ static const char * get_qnn_backend_name(int n_backend_type) { case 1: return "QNN-GPU"; case 2: - return "QNN-NPU(HTP/DSP)"; + return "QNN-NPU"; case 3: return "ggml"; default: @@ -131,9 +131,54 @@ static bool ggml_graph_compute_helper( } -static void tensor_dump_elements(const ggml_tensor * tensor) { +#define QK8_0 32 +typedef struct { + uint16_t d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; + + +static inline float ggml_compute_fp16_to_fp32(uint16_t h) { + __fp16 tmp; + memcpy(&tmp, &h, sizeof(uint16_t)); + return (float)tmp; +} +#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) + +static void tensor_dump(const ggml_tensor * tensor, const char * name) { + QNN_LOG_DEBUG("dump ggml tensor %s(%s): type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = 
(%5zi, %5zi, %5zi)\n", + name, tensor->name, + tensor->type, ggml_type_name(tensor->type), + tensor->ne[0], tensor->ne[1], tensor->ne[2], + tensor->nb[0], tensor->nb[1], tensor->nb[2]); + float value = 0; std::ostringstream tmposs; + if (nullptr == tensor) { + QNN_LOG_WARN("tensor is null"); + return; + } + if (tensor->type == GGML_TYPE_I8) { + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + value = ((int8_t *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + + j * tensor->ne[0] + k]; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value + << " "; + } + tmposs << "\n"; + } + } + } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + tmposs.clear(); + tmposs.str(""); + } + } + if (tensor->type == GGML_TYPE_F32) { for (int h = 0; h < tensor->ne[3]; h++) { for (int i = 0; i < tensor->ne[2]; i++) { @@ -144,31 +189,59 @@ static void tensor_dump_elements(const ggml_tensor * tensor) { tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; } - if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { - QNN_LOG_DEBUG("%s", tmposs.str().c_str()); - } - tmposs.clear(); - tmposs.str(""); - //QNN_LOG_DEBUG("\n"); + tmposs << "\n"; } } } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + tmposs.clear(); + tmposs.str(""); + } } - //QNN_LOG_DEBUG("\n"); -} - - -static void tensor_dump(const ggml_tensor * tensor, const char * name) { - QNN_LOG_DEBUG("dump ggml tensor %s(%s)", name, tensor->name); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)", - name, - tensor->type, ggml_type_name(tensor->type), - tensor->ne[0], tensor->ne[1], tensor->ne[2], - tensor->nb[0], tensor->nb[1], tensor->nb[2]); - tensor_dump_elements(tensor); + if (tensor->type == GGML_TYPE_F16) { + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + unsigned short tmpvalue = ((unsigned short *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + + j * tensor->ne[0] + k]; + value = GGML_FP16_TO_FP32(tmpvalue); + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value + << " "; + } + tmposs << "\n"; + } + } + } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + tmposs.clear(); + tmposs.str(""); + } + } - QNN_LOG_DEBUG("\n"); + if (tensor->type == GGML_TYPE_Q8_0) { + block_q8_0 * tmp = ((block_q8_0 *)tensor->data); + for (int j = 0; j < tensor->ne[1]; j++) { + int n = tensor->ne[0] / QK8_0; //blocks per row + for (int z = 0; z < n; z++) { + const float d = GGML_FP16_TO_FP32(tmp[ j * n + z ].d); + for (int k = 0; k < QK8_0; k++) { + value = tmp[j * n + z].qs[k] * d; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value + << " "; + } + } + tmposs << "\n"; + } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("\n%s\n", tmposs.str().c_str()); + tmposs.clear(); + tmposs.str(""); + } + } } @@ -231,7 +304,8 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m t.join(); } if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { - ggml_backend_tensor_set(tensor, data.data(), 0, size 
* sizeof(float)); + //ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); + memcpy((char*)tensor->data, data.data(), size * sizeof(float)); } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); std::vector dataq(ggml_row_size(tensor->type, size)); @@ -246,10 +320,12 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m } ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im); GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size())); - ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); + //ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); + memcpy((char*)tensor->data, dataq.data(), dataq.size()); } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { // This is going to create some weird integers though. - ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); + //ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); + memcpy((char*)tensor->data, data.data(), ggml_nbytes(tensor)); } else { GGML_ASSERT(false); } @@ -276,16 +352,13 @@ static void show_usage() { } -int main(int argc, char * argv[]) { +static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { int64_t n_begin_time = 0LL; int64_t n_end_time = 0LL; int64_t n_duration = 0LL; size_t ctx_size = 0; int sizey = 4; int sizex = 4; - int num_threads = 4; - int n_backend_type = QNN_BACKEND_CPU; - int n_ggml_op_type = GGML_OP_ADD; struct ggml_context * ctx = nullptr; struct ggml_cgraph * gf = nullptr; @@ -294,50 +367,23 @@ int main(int argc, char * argv[]) { struct ggml_tensor * dst = nullptr; ggml_backend_t backend = nullptr; ggml_backend_buffer_t buffer= nullptr; - ggml_type qtype = GGML_TYPE_F32; - std::vector work_buffer; - for (int i = 1; i < argc; i++) { - if (0 == strcmp(argv[i], "-t")) { - if (i + 1 < argc) { - if (0 == memcmp(argv[i + 1], "GGML_OP_ADD", 11)) { - n_ggml_op_type = GGML_OP_ADD; - } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) { - n_ggml_op_type = GGML_OP_MUL_MAT; - } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL", 11)) { - n_ggml_op_type = GGML_OP_MUL; - } else { - show_usage(); - return 1; - } - i++; - } - } else if (0 == strcmp(argv[i], "-b")) { - if (i + 1 < argc) { - int backend = atoi(argv[i + 1]); - if (backend <= QNN_BACKEND_NPU) - n_backend_type = backend; - else { - show_usage(); - return 1; - } - i++; - } - } else { - show_usage(); - return 1; - } - } + ggml_type qtype = GGML_TYPE_I8; + qtype = GGML_TYPE_F32; + qtype = GGML_TYPE_F16; + qtype = GGML_TYPE_Q8_0; + std::vector work_buffer; QNN_LOG_DEBUG("enter qnn_ggml_op\n"); - QNN_LOG_DEBUG("ggml op:%d(%s)", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + n_begin_time = ggml_time_us(); srand(time(NULL)); ctx_size += 1024 * 1024 * 32; QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size, - (ctx_size / 1024 / 1024)); + (ctx_size / 1024 / 1024)); struct ggml_init_params params = { /*.mem_size =*/ ctx_size, @@ -349,7 +395,7 @@ int main(int argc, char * argv[]) { params.no_alloc = true; backend = ggml_backend_qnn_init(n_backend_type, "/data/local/tmp/"); if (nullptr == backend) { - QNN_LOG_ERROR("create qnn backend %d(%s) failed", n_backend_type, 
get_qnn_backend_name(n_backend_type)); + QNN_LOG_ERROR("create qnn backend %d(%s) failed\n", n_backend_type, get_qnn_backend_name(n_backend_type)); return 1; } } @@ -361,15 +407,25 @@ int main(int argc, char * argv[]) { } QNN_LOG_DEBUG("creating new tensors\n"); - QNN_LOG_DEBUG("ggml_blck_size(%s) %d", ggml_type_name(qtype), ggml_blck_size(qtype)); - QNN_LOG_DEBUG("ggml_type_size(%s) %d", ggml_type_name(qtype), ggml_type_size(qtype)); - if (qtype != GGML_TYPE_F32) { + QNN_LOG_DEBUG("ggml_blck_size(%s) %d\n", ggml_type_name(qtype), ggml_blck_size(qtype)); + QNN_LOG_DEBUG("ggml_type_size(%s) %d\n", ggml_type_name(qtype), ggml_type_size(qtype)); + if (ggml_is_quantized(qtype)) { sizex = ggml_blck_size(qtype); + + if (n_ggml_op_type == GGML_OP_MUL_MAT) { + sizex = ggml_blck_size(qtype) * 2; + } } + QNN_LOG_DEBUG("sizex %d\n", sizex); - src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); + if (n_ggml_op_type == GGML_OP_MUL) { + src0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + } else { + src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); + src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + } ggml_set_input(src0); - src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); ggml_set_input(src1); switch (n_ggml_op_type) { @@ -384,7 +440,7 @@ int main(int argc, char * argv[]) { break; default: QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, - ggml_op_name((enum ggml_op) n_ggml_op_type)); + ggml_op_name((enum ggml_op) n_ggml_op_type)); ggml_free(ctx); ggml_backend_free(backend); return 3; @@ -407,17 +463,20 @@ int main(int argc, char * argv[]) { gf = ggml_new_graph(ctx); ggml_build_forward_expand(gf, dst); -#if 0 - ggml_set_f32(src0, (rand() % 100 + 1)); - ggml_set_f32(src1, (rand() % 100 + 1)); - ggml_set_f32(dst, 0.0f); -#else if (n_backend_type != QNN_BACKEND_GGML) { initialize_tensors(ctx); + } else { + if (qtype == GGML_TYPE_F32) { + ggml_set_f32(src0, (rand() % 100 + 1)); + } else { + initialize_tensors(ctx); + } + ggml_set_f32(src1, (rand() % 100 + 1)); + //ggml_set_f32(dst, 0.0f); } -#endif ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); + if (get_tensor_data_size(dst) < (32 * 32)) { QNN_LOG_DEBUG("dump tensors:\n"); TENSOR_DUMP(src0); @@ -425,26 +484,69 @@ int main(int argc, char * argv[]) { TENSOR_DUMP(dst); } else { QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); } 
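    // possible follow-up (not implemented here): keep a copy of dst->data produced by the
    // QNN backend, recompute the same graph on identical inputs via the "fake" QNN_BACKEND_GGML
    // path, and memcmp() the two buffers instead of only eyeballing the TENSOR_DUMP output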
ggml_free(ctx); ggml_backend_buffer_free(buffer); ggml_backend_free(backend); - n_end_time = ggml_time_us(); n_duration = (n_end_time - n_begin_time) / 1000; QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); + return 0; +} + + +int main(int argc, char * argv[]) { + int num_threads = 4; + int n_backend_type = QNN_BACKEND_CPU; + int n_ggml_op_type = GGML_OP_ADD; + + for (int i = 1; i < argc; i++) { + if (0 == strcmp(argv[i], "-t")) { + if (i + 1 < argc) { + if (0 == memcmp(argv[i + 1], "GGML_OP_ADD", 11)) { + n_ggml_op_type = GGML_OP_ADD; + } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) { + n_ggml_op_type = GGML_OP_MUL_MAT; + } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL", 11)) { + n_ggml_op_type = GGML_OP_MUL; + } else { + show_usage(); + return 1; + } + i++; + } + } else if (0 == strcmp(argv[i], "-b")) { + if (i + 1 < argc) { + int backend = atoi(argv[i + 1]); + if (backend <= QNN_BACKEND_NPU) + n_backend_type = backend; + else { + show_usage(); + return 1; + } + i++; + } + } else { + show_usage(); + return 1; + } + } + + QNN_LOG_DEBUG("enter qnn_ggml_op\n"); + QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type, n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type); return 0; } From f4c53037abff299f20a1d40e1247e29d2d7b82dc Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Thu, 6 Jun 2024 20:24:03 +0800 Subject: [PATCH 06/16] review: remove unused QNN helper functions --- ggml-qnn.cpp | 404 +-------------------------------------------------- 1 file changed, 8 insertions(+), 396 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index d0927f22e514a..e81704305e988 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -54,6 +54,7 @@ #include #endif + // ================================================================================================= // // forward/external/helper declaration @@ -61,6 +62,7 @@ // ================================================================================================= class qnn_instance; + static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); @@ -74,7 +76,7 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define GGML_QNN_LOGBUF_LEN 4096 -#define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend +#define GGML_QNN_DEBUG 0 //for troubleshooting QNN backend #define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) @@ -86,6 +88,8 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define QNN_LOG_DEBUG(...) 
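 // (with GGML_QNN_DEBUG set to 0 above, QNN_LOG_DEBUG() expands to nothing, so the
 // per-op debug logging compiles out of release builds)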
#endif +#define QNN_VER_PTR(x) (&((x).v1)) + #define VALIDATE(value, status) \ do { \ @@ -98,34 +102,6 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) -#define VALIDATE_OP_CONFIG_VERSION(op, err) VALIDATE(validate_op_config_version(op), err) - -#define QNN_VER_PTR(x) (&((x).v1)) -#define QNN_OP_CFG_VALID(op_config) ((op_config).version == QNN_OPCONFIG_VERSION_1) - -#define QNN_OP_CFG_GET_NAME(op_config) get_qnn_oponfig_name(op_config) -#define QNN_OP_CFG_GET_PACKAGE_NAME(op_config) get_qnn_op_config_packagename(op_config) -#define QNN_OP_CFG_GET_TYPE_NAME(op_config) get_qnn_op_config_typename(op_config) -#define QNN_OP_CFG_GET_NUM_PARAMS(op_config) get_qnn_op_config_numparams(op_config) -#define QNN_OP_CFG_GET_PARAMS(op_config) get_qnn_op_config_params(op_config) -#define QNN_OP_CFG_GET_NUM_INPUTS(op_config) get_qnn_op_config_numinputs(op_config) -#define QNN_OP_CFG_GET_INPUTS(op_config) get_qnn_op_config_inputs(op_config) -#define QNN_OP_CFG_GET_NUM_OUTPUTS(op_config) get_qnn_op_config_numoutputs(op_config) -#define QNN_OP_CFG_GET_OUTPUTS(op_config) get_qnn_op_config_outputs(op_config) - -#define QNN_OP_CFG_SET_NAME(op_config, value) set_qnn_op_config_name(op_config, value) -#define QNN_OP_CFG_SET_PACKAGE_NAME(op_config, value) set_qnn_op_config_packagename(op_config, value) -#define QNN_OP_CFG_SET_TYPE_NAME(op_config, value) set_qnn_op_config_typename(op_config, value) - -#define QNN_OP_CFG_SET_PARAMS(op_config, num_of_params, params) \ - set_qnn_op_config_params(op_config, num_of_params, params) - -#define QNN_OP_CFG_SET_INPUTS(op_config, num_of_inputs, inputTensors) \ - set_qnn_op_config_inputs(op_config, num_of_inputs, inputTensors) - -#define QNN_OP_CFG_SET_OUTPUTS(op_config, num_of_outputs, output_tensors) \ - set_qnn_op_config_outputs(op_config, num_of_outputs, output_tensors) - #define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) #define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) #define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) @@ -135,8 +111,6 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) #define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) #define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) -#define QNN_TENSOR_GET_CLIENT_BUF(tensor) get_qnn_tensor_clientbuf(tensor) -#define QNN_TENSOR_GET_MEM_HANDLE(tensor) get_qnn_tensor_memhandle(tensor) #define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) #define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) @@ -150,7 +124,6 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) #define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) - using pfn_rpc_mem_init = void (*)(void); using pfn_rpc_mem_deinit = void (*)(void); using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); @@ -204,6 +177,7 @@ static int g_current_device = QNN_BACKEND_GGML; //Qualcomm CPU: Qualcomm Kryo CPU //Qualcomm GPU: Qualcomm Adreno GPU //Qualcomm NPU: aka HTP(Hexagon Tensor Processor), ~= cDSP(Compute DSP) + HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) + static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { [QNN_BACKEND_CPU] = {.device = 0, 
.threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, [QNN_BACKEND_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, @@ -227,221 +201,6 @@ static inline int validate_tensor_version(Qnn_Tensor_t tensor) { } -[[maybe_unused]] static inline int validate_op_config_version(Qnn_OpConfig_t op_config) { - if (op_config.version != QNN_OPCONFIG_VERSION_1) { - QNN_LOG_WARN("validate_op_config_version() op %s, got unsupported version %d\n", - op_config.v1.name, - op_config.version); - return 1; - } - return 0; -} - - -static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.name; - } - return nullptr; -} - - -[[maybe_unused]] static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t * op_config) { - return get_qnn_oponfig_name(*op_config); -} - - -static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.packageName; - } - return nullptr; -} - - -[[maybe_unused]] static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_packagename(*op_config); -} - - -static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.typeName; - } - return nullptr; -} - - -[[maybe_unused]] static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_typename(*op_config); -} - - -static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.numOfParams; - } - return 0u; -} - - -[[maybe_unused]] static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_numparams(*op_config); -} - - -static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.params; - } - return nullptr; -} - - -[[maybe_unused]] static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_params(*op_config); -} - - -static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.numOfInputs; - } - return 0u; -} - - -[[maybe_unused]] static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_numinputs(*op_config); -} - - -static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.inputTensors; - } - return nullptr; -} - - -[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_inputs(*op_config); -} - - -static inline uint32_t get_qnn_op_config_numoutputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.numOfOutputs; - } - return 0u; -} - - -[[maybe_unused]] static inline uint32_t 
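All of the get_/set_ helpers in this block follow one idiom: Qnn_OpConfig_t and Qnn_Tensor_t are versioned structs, so every accessor checks .version before touching the .v1 payload and falls back to a neutral default (nullptr or 0u) for an unknown version. A minimal sketch of the idiom, with a made-up example_tensor standing in for the real QNN types:

#include <cstdint>

struct example_tensor_v1 {
    uint32_t id;
};

struct example_tensor {
    uint32_t version;             // discriminates which union member is live
    union {
        example_tensor_v1 v1;     // a future SDK revision could add v2 here
    };
};

static inline uint32_t get_example_tensor_id(const example_tensor & t) {
    if (t.version == 1) {         // mirrors the QNN_TENSOR_VERSION_1 checks above
        return t.v1.id;
    }
    return 0u;                    // neutral default for unknown versions
}

Returning a default instead of asserting keeps these helpers safe to call against structs produced by a newer SDK than the one the backend was built with.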
get_qnn_op_config_numoutputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_numoutputs(*op_config); -} - - -static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.outputTensors; - } - return nullptr; -} - - -[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_outputs(*op_config); -} - - -static inline void set_qnn_op_config_name(Qnn_OpConfig_t & op_config, const char * name) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.name = name; - } -} - - -[[maybe_unused]] static inline void set_qnn_op_config_name(Qnn_OpConfig_t * op_config, const char * name) { - set_qnn_op_config_name(*op_config, name); -} - - -static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t & op_config, const char * package_name) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.packageName = package_name; - } -} - - -[[maybe_unused]] static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t * op_config, const char * package_name) { - set_qnn_op_config_packagename(*op_config, package_name); -} - - -static inline void set_qnn_op_config_typename(Qnn_OpConfig_t & op_config, const char * type_name) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.typeName = type_name; - } -} - - -[[maybe_unused]] static inline void set_qnn_op_config_typename(Qnn_OpConfig_t * op_config, const char * type_name) { - set_qnn_op_config_typename(*op_config, type_name); -} - - -static inline void set_qnn_op_config_params(Qnn_OpConfig_t & op_config, - uint32_t num_of_params, - Qnn_Param_t * params) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.numOfParams = num_of_params; - op_config.v1.params = params; - } -} - - -[[maybe_unused]] static inline void set_qnn_op_config_params(Qnn_OpConfig_t * op_config, - uint32_t num_of_params, - Qnn_Param_t * params) { - set_qnn_op_config_params(*op_config, num_of_params, params); -} - - -static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t & op_config, - uint32_t num_of_inputs, - Qnn_Tensor_t * input_tensors) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.numOfInputs = num_of_inputs; - op_config.v1.inputTensors = input_tensors; - } -} - - -[[maybe_unused]] static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t * op_config, - uint32_t num_of_inputs, - Qnn_Tensor_t * input_tensors) { - set_qnn_op_config_inputs(*op_config, num_of_inputs, input_tensors); -} - - -static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t & op_config, - uint32_t num_of_outputs, - Qnn_Tensor_t * output_tensors) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.numOfOutputs = num_of_outputs; - op_config.v1.outputTensors = output_tensors; - } -} - - -[[maybe_unused]] static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t * op_config, - uint32_t num_of_outputs, - Qnn_Tensor_t * output_tensors) { - set_qnn_op_config_outputs(*op_config, num_of_outputs, output_tensors); -} - - static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.id; @@ -451,11 +210,6 @@ static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { } -[[maybe_unused]] static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t * tensor) { - return get_qnn_tensorid(*tensor); -} - - static inline const char * 
get_qnn_tensorname(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.name; @@ -464,10 +218,6 @@ static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { } -static inline const char * get_qnn_tensorname(const Qnn_Tensor_t * tensor) { - return get_qnn_tensorname(*tensor); -} - static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { @@ -477,11 +227,6 @@ static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { } -[[maybe_unused]] static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t * tensor) { - return get_qnn_tensortype(*tensor); -} - - static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dataFormat; @@ -490,11 +235,6 @@ static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_ } -[[maybe_unused]] static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_dataformat(*tensor); -} - - static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dataType; @@ -503,11 +243,6 @@ static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor } -[[maybe_unused]] static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_datatype(*tensor); -} - - static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.quantizeParams; @@ -516,11 +251,6 @@ static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t } -[[maybe_unused]] static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_quantparams(*tensor); -} - - static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.rank; @@ -529,11 +259,6 @@ static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { } -[[maybe_unused]] static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_rank(*tensor); -} - - static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dimensions; @@ -542,11 +267,6 @@ static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) } -[[maybe_unused]] static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_dimensions(*tensor); -} - - static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.memType; @@ -555,37 +275,6 @@ static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & te } -[[maybe_unused]] static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_memtype(*tensor); -} - - -static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.clientBuf; - } - return QNN_CLIENT_BUFFER_INIT; -} - - -[[maybe_unused]] static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t * tensor) { - return 
get_qnn_tensor_clientbuf(*tensor); -} - - -static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memHandle; - } - return nullptr; -} - - -[[maybe_unused]] static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_memhandle(*tensor); -} - - static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.id = id; @@ -593,11 +282,6 @@ static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { } -[[maybe_unused]] static inline void set_qnn_tensor_id(Qnn_Tensor_t * tensor, uint32_t id) { - set_qnn_tensor_id(*tensor, id); -} - - static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.name = name; @@ -605,11 +289,6 @@ static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) } -[[maybe_unused]] static inline void set_qnn_tensor_name(Qnn_Tensor_t * tensor, const char * name) { - set_qnn_tensor_name(*tensor, name); -} - - static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.type = type; @@ -617,11 +296,6 @@ static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t t } -[[maybe_unused]] static inline void set_qnn_tensor_type(Qnn_Tensor_t * tensor, Qnn_TensorType_t type) { - set_qnn_tensor_type(*tensor, type); -} - - static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dataFormat = format; @@ -629,11 +303,6 @@ static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDa } -[[maybe_unused]] static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t * tensor, Qnn_TensorDataFormat_t format) { - set_qnn_tensor_dataformat(*tensor, format); -} - - static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dataType = dataType; @@ -641,11 +310,6 @@ static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t } -[[maybe_unused]] static inline void set_qnn_tensor_datatype(Qnn_Tensor_t * tensor, Qnn_DataType_t dataType) { - set_qnn_tensor_datatype(*tensor, dataType); -} - - static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.quantizeParams = params; @@ -653,11 +317,6 @@ static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_Quantiz } -[[maybe_unused]] static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t * tensor, Qnn_QuantizeParams_t params) { - set_qnn_tensor_quantparams(*tensor, params); -} - - static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.rank = rank; @@ -665,11 +324,6 @@ static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { } -[[maybe_unused]] static inline void set_qnn_tensor_rank(Qnn_Tensor_t * tensor, uint32_t rank) { - set_qnn_tensor_rank(*tensor, rank); -} - - static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dimensions = dims; @@ -677,11 +331,6 @@ static inline void 
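The memscpy() helper defined a little further down is a bounds-checked memcpy: it rejects null or zero-sized arguments and, in the usual memscpy idiom, copies at most the smaller of the destination size and the requested size, returning the byte count actually copied. A sketch of the shape of such a helper and a typical call (names here are illustrative, not code from this file):

#include <algorithm>
#include <cstddef>
#include <cstring>

size_t bounded_copy(void * dst, size_t dst_size, const void * src, size_t copy_size) {
    if (!dst || !src || !dst_size || !copy_size) return 0; // same guard as memscpy above
    size_t n = std::min(dst_size, copy_size);              // never write past dst
    memcpy(dst, src, n);
    return n;                                              // caller can detect truncation
}

// e.g. copying tensor dimensions without trusting the source rank:
//   uint32_t dims[4];
//   size_t copied = bounded_copy(dims, sizeof(dims), src_dims, src_rank * sizeof(uint32_t));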
set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * d } -[[maybe_unused]] static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t * tensor, uint32_t * dims) { - set_qnn_tensor_dimensions(*tensor, dims); -} - - static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.memType = memType; @@ -689,11 +338,6 @@ static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemTy } -[[maybe_unused]] static inline void set_qnn_tensor_memtype(Qnn_Tensor_t * tensor, Qnn_TensorMemType_t memType) { - set_qnn_tensor_memtype(*tensor, memType); -} - - static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.clientBuf = clientBuf; @@ -701,11 +345,6 @@ static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuf } -[[maybe_unused]] static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t * tensor, Qnn_ClientBuffer_t clientBuf) { - set_qnn_tensor_clientbuf(*tensor, clientBuf); -} - - static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.memHandle = handle; @@ -713,11 +352,6 @@ static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle } -[[maybe_unused]] static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t * tensor, Qnn_MemHandle_t handle) { - set_qnn_tensor_memhandle(*tensor, handle); -} - - static size_t memscpy(void * dst, size_t dstSize, const void * src, size_t copySize) { if (!dst || !src || !dstSize || !copySize) return 0; @@ -824,19 +458,6 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor) { } -[[maybe_unused]] static int free_qnn_tensors(Qnn_Tensor_t *& tensors, uint32_t num_tensors) { - int err = 0; - - // free all pointer allocations in struct - for (size_t i = 0; i < num_tensors; i++) { - free_qnn_tensor(tensors[i]); - } - free(tensors); - - return err; -} - - static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { uint32_t rank = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { @@ -3137,7 +2758,7 @@ static const char * ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffe } -[[maybe_unused]] GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { +GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_qnn_buffer_get_name; } @@ -3236,15 +2857,6 @@ GGML_CALL static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer } -[[maybe_unused]] GGML_CALL static void ggml_backend_qnn_buffer_reset(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; - for (auto * sub_buffer : ctx->sub_buffers) { - free(sub_buffer); - } - ctx->sub_buffers.clear(); -} - - static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .get_name = */ ggml_backend_qnn_buffer_get_name, /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, @@ -3402,7 +3014,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const //note: this function be used with proposal/refined ggml backend subsystem in this PR: // https://github.com/ggerganov/llama.cpp/pull/7641 -// any ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) +// new ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) // can following 
this style for mixed inference between CPU&GPU / CPU&NPU very easily GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) { GGML_UNUSED(backend); From 2fab33d8250db70e872a12af7ffd41af04592acc Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Fri, 7 Jun 2024 12:51:04 +0800 Subject: [PATCH 07/16] ggml-qnn: remove static global vars to support multi-instance simultaneously --- ggml-qnn.cpp | 250 +++++++++++++++------------------ tests/ggml-qnn/ggml-qnn-ut.cpp | 3 +- 2 files changed, 113 insertions(+), 140 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index e81704305e988..867f01625ad7f 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -76,7 +76,7 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #define GGML_QNN_LOGBUF_LEN 4096 -#define GGML_QNN_DEBUG 0 //for troubleshooting QNN backend +#define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend #define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) @@ -89,7 +89,7 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const #endif #define QNN_VER_PTR(x) (&((x).v1)) - +#define GGML_QNN_NAME "qnn" #define VALIDATE(value, status) \ do { \ @@ -135,8 +135,6 @@ using _pfn_QnnInterface_getProviders = decltype(QnnInterface_ using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); -typedef void (* ggml_qnn_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); -typedef void (* ggml_qnn_func_common_t)(const ggml_op ggml_op, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); enum class ggml_qnn_profile_level { profile_off = 0, @@ -144,7 +142,6 @@ enum class ggml_qnn_profile_level { profile_detail = 2 }; - struct ggml_backend_qnn_context { int device; int threads; @@ -156,15 +153,16 @@ struct ggml_backend_qnn_context { QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; } ; +typedef void (* ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +typedef void (* ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx, const ggml_op ggml_op, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); // ================================================================================================= // // static global variables // // ================================================================================================= -static ggml_backend_t g_qnn_backend = nullptr; - -static int g_current_device = QNN_BACKEND_GGML; +//static ggml_backend_t g_qnn_backend = nullptr; //according to the QNN SDK Reference Guide, //CPU - Choose a non-quantized model. 
Quantized models are currently incompatible with the CPU backend @@ -184,7 +182,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { [QNN_BACKEND_NPU] = {.device = 2, .threads = 1, .name = "qnn-npu", .lib = "libQnnHtp.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, }; - // ================================================================================================= // // QNN helper functions and other internal helper functions @@ -1010,7 +1007,7 @@ void qnn_instance::free_rpcmem(void * buf) { } -int32_t qnn_instance::rpcmem_to_fd(void *buf) { +int32_t qnn_instance::rpcmem_to_fd(void * buf) { int32_t mem_fd = -1; if (!is_rpcmem_initialized()) { QNN_LOG_WARN("rpc memory not initialized\n"); @@ -1168,33 +1165,6 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * _loaded_lib_handle[backend_id] = lib_handle; _backend_id = backend_id; -#if 0 //comment it for purpose of reduce size of APK - QnnSaver_Config_t outputdir_cfg; - outputdir_cfg.option = QNN_SAVER_CONFIG_OPTION_OUTPUT_DIRECTORY; - outputdir_cfg.outputDirectory = "/data/local/tmp/"; - - QnnSaver_Config_t backendid_cfg; - backendid_cfg.option = QNN_SAVER_CONFIG_OPTION_BACKEND_ID; - backendid_cfg.backendId = _backend_id; - const QnnSaver_Config_t *saverCfg[] = {&outputdir_cfg, &backendid_cfg, nullptr}; - if (0 == QnnSaver_initialize(saverCfg)) { - QNN_LOG_INFO("QnnSaver_initialize successfully"); - } else { - QNN_LOG_WARN("QnnSaver_initialize failure"); - } -#endif - auto saver_initialize = load_qnn_functionpointers<_pfn_QnnSaver_initialize *>( - _loaded_lib_handle[backend_id], "QnnSaver_initialize"); - if (nullptr != saver_initialize) { - error = saver_initialize(saver_config); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to saver_initialize,error %d", QNN_GET_ERROR_CODE(error)); - return 7; - } - } else { - QNN_LOG_WARN("saver_initialize is null\n"); - } - return 0; } @@ -1345,14 +1315,15 @@ static void ggml_qnn_logcallback(const char * fmt, } double ms = (double) timestamp / 1000000.0; - - if (0) { +#if GGML_QNN_DEBUG + { std::lock_guard lock(log_mutex); memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); } +#endif } @@ -1390,11 +1361,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); -#if 1 _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); -#else - _qnn_raw_interface.logCreate(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); -#endif if (nullptr == _qnn_log_handle) { QNN_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone return 4; @@ -1437,7 +1404,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 7; + return 6; } else { QNN_LOG_DEBUG("initialize qnn profile successfully\n"); } @@ -1456,7 +1423,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); if (nullptr == _rpc_lib_handle) { QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); - return 9; + 
return 8; } else { QNN_LOG_DEBUG("load rpcmem lib successfully\n"); set_rpcmem_initialized(true); @@ -1470,7 +1437,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { || nullptr == _pfn_rpc_mem_to_fd) { QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); dlclose(_rpc_lib_handle); - return 10; + return 9; } if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy @@ -1483,7 +1450,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { &_qnn_context_handle); if (nullptr == _qnn_context_handle) { QNN_LOG_WARN("why failed to initialize qnn context\n"); - return 8; + return 10; } else { QNN_LOG_DEBUG("initialize qnn context successfully\n"); } @@ -1695,7 +1662,7 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum } -static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; int64_t n_begin_time = 0LL; @@ -1703,7 +1670,6 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm int64_t n_duration = 0LL; qnn_instance * instance = nullptr; - struct ggml_backend_qnn_context * ctx = nullptr; std::string graph_name = "ggml_op_qnn_add"; Qnn_GraphHandle_t graph_handle = nullptr; @@ -1727,7 +1693,6 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm QNN_LOG_WARN("pls check why QNN tensor is null"); return; } - ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; if (nullptr == ctx) { QNN_LOG_WARN("pls check why backend ctx is null"); return; @@ -1755,9 +1720,9 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; @@ -1918,7 +1883,7 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm * mul_mat_f16_f32: src0 is F16 and src1 is F32. * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. 
*/ -static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; int64_t n_begin_time = 0LL; @@ -1926,7 +1891,6 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, int64_t n_duration = 0LL; qnn_instance * instance = nullptr; - struct ggml_backend_qnn_context * ctx = nullptr; std::string graph_name = "ggml_op_qnn_mul_mat"; Qnn_GraphHandle_t graph_handle = nullptr; @@ -1952,7 +1916,6 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, QNN_LOG_WARN("pls check why QNN tensor is null"); return; } - ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; if (nullptr == ctx) { QNN_LOG_WARN("pls check why backend ctx is null"); return; @@ -1979,9 +1942,9 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; @@ -2129,7 +2092,7 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, //common function for GGML OPs using QNN API -static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_op ggmlop, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; int64_t n_begin_time = 0LL; @@ -2137,7 +2100,6 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr int64_t n_duration = 0LL; qnn_instance * instance = nullptr; - struct ggml_backend_qnn_context * ctx = nullptr; std::string qnn_graph_name = "ggml_qnn_graph"; std::string qnn_op_config_name = "ggml_qnn_op_config"; @@ -2164,7 +2126,6 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr QNN_LOG_WARN("pls check why QNN tensor is null"); return; } - ctx = (struct ggml_backend_qnn_context *)g_qnn_backend->context; if (nullptr == ctx) { QNN_LOG_WARN("pls check why backend ctx is null"); return; @@ -2201,9 +2162,9 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", 
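The tensor_0 to *tensor_0 changes in these hunks follow directly from deleting the pointer overloads earlier in the series: QNN_TENSOR_GET_NAME() now expands to a call for which only the const-reference overload remains, so every call site must dereference. A reduced illustration (Obj and get_name are made-up stand-ins):

struct Obj { const char * name; };

static inline const char * get_name(const Obj & o) { return o.name; } // reference overload (kept)
// const char * get_name(const Obj * o);  // pointer overload (removed in this series)

const char * demo(Obj * p) {
    // return get_name(p);   // compiled only via the removed pointer overload
    return get_name(*p);     // hence QNN_TENSOR_GET_NAME(*tensor_0) at the call sites
}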
QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; @@ -2349,153 +2310,154 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr } -static void ggml_qnn_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_acc(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_div(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_gelu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_silu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_gelu_quick(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_tanh(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_relu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_hardswish(const 
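The signature churn in this hunk is the heart of PATCH 07: every op handler now receives the owning ggml_backend_qnn_context explicitly instead of reaching through the deleted g_qnn_backend global, which is what lets two backend instances (say qnn-cpu and qnn-npu) coexist without racing on shared state. A stripped-down sketch of the before and after calling convention (types are illustrative):

struct ctx_t    { int device; };
struct tensor_t { int dummy; };

// before: handlers found their context through a process-wide global
//   static ctx_t * g_ctx;
//   typedef void (*op_func_old)(const tensor_t *, const tensor_t *, tensor_t *);

// after: the owning backend passes its own context at every call site
typedef void (*op_func)(ctx_t * ctx, const tensor_t * a, const tensor_t * b, tensor_t * dst);

void op_add(ctx_t * ctx, const tensor_t * a, const tensor_t * b, tensor_t * dst) {
    (void) a; (void) b; (void) dst;
    (void) ctx; // per-instance state (device, handles, ...) now lives here, not in a global
}

op_func dispatch_table[] = { op_add }; // parallels the switch in ggml_qnn_compute_forward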
ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_hardswish(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_norm(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_concat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_upscale(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_pad(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_qnn_cpy(src0, dst, nullptr); +static void ggml_qnn_dup(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_qnn_cpy(ctx, src0, dst, nullptr); (void) src1; } -static void ggml_qnn_mul_mat_id(const ggml_tensor * src0, +static void 
ggml_qnn_mul_mat_id(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); @@ -2504,35 +2466,35 @@ static void ggml_qnn_mul_mat_id(const ggml_tensor * src0, } -static void ggml_qnn_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_scale(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_soft_max(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_rope(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); QNN_LOG_DEBUG("call %s\n", __func__); @@ -2541,21 +2503,21 @@ static void ggml_qnn_rope(const ggml_tensor * src0, const ggml_tensor * src1, gg } -static void ggml_qnn_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { QNN_LOG_DEBUG("call %s\n", __func__); QNN_LOG_DEBUG("call %s done\n", __func__); } -static void ggml_qnn_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); QNN_LOG_DEBUG("call %s\n", __func__); @@ -2563,7 +2525,7 @@ static void ggml_qnn_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1 } -static void ggml_qnn_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); QNN_LOG_DEBUG("call %s\n", __func__); @@ -2571,7 +2533,7 @@ static void ggml_qnn_argsort(const ggml_tensor * src0, const ggml_tensor * src1, } -static void ggml_qnn_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void 
ggml_qnn_nop(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { (void) src0; (void) src1; (void) dst; @@ -2581,7 +2543,7 @@ static void ggml_qnn_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggm } -bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_compute_params * params, struct ggml_tensor * tensor) { ggml_qnn_func_t func = nullptr; ggml_qnn_func_common_t func_common = nullptr; @@ -2715,16 +2677,21 @@ bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_t } if (nullptr != func) - func(tensor->src[0], tensor->src[1], tensor); + func(ctx, tensor->src[0], tensor->src[1], tensor); if (nullptr != func_common) - func_common(tensor->op, tensor->src[0], tensor->src[1], tensor); + func_common(ctx, tensor->op, tensor->src[0], tensor->src[1], tensor); return true; } struct ggml_backend_qnn_buffer_context { + ggml_backend_qnn_buffer_context(size_t device) : + device(device), + name(GGML_QNN_NAME + std::to_string(device)) { + } + ~ggml_backend_qnn_buffer_context() { if (buffer) { free(buffer); @@ -2749,6 +2716,14 @@ struct ggml_backend_qnn_buffer_context { size_t buffer_size = 0; std::vector sub_buffers; std::vector qnn_tensors; + size_t device; + std::string name; +}; + + +struct ggml_backend_qnn_buffer_type_context { + size_t device; + std::string name; }; @@ -2782,7 +2757,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t static int idx = 0; char tensor_name[GGML_MAX_NAME] = { 0 }; - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%2d", idx++); + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; Qnn_DataType_t qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); @@ -2888,7 +2863,8 @@ static void * ggml_qnn_host_malloc(size_t n) { GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context; + ggml_backend_qnn_buffer_type_context * buft_ctx = (ggml_backend_qnn_buffer_type_context *)buft->context; + ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context(buft_ctx->device); const size_t size_page = sysconf(_SC_PAGESIZE); @@ -2901,7 +2877,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer ctx->buffer = ggml_qnn_host_malloc(size_aligned); ctx->buffer_size = size_aligned; - ctx->backend_ctx = &g_qnn_mgr[g_current_device]; + ctx->backend_ctx = &g_qnn_mgr[buft_ctx->device]; if (nullptr == ctx->buffer) { QNN_LOG_WARN("%s: failed to allocate %.2f MiB\n", __func__, size / (1 << 20)); @@ -2968,7 +2944,6 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { if (g_qnn_mgr[ctx->device].backend != nullptr) { delete backend; - g_qnn_backend = nullptr; g_qnn_mgr[ctx->device].backend = nullptr; } QNN_LOG_INFO("leave %s", __func__ ); @@ -2995,7 +2970,7 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } - bool ok = ggml_qnn_compute_forward(¶ms, node); + bool ok = ggml_qnn_compute_forward(ctx, ¶ms, 
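A small but real fix in the hunk above: "tensor_%2d" space-pads to a minimum width of two, so generated names contain an embedded space ("tensor_ 1") until the counter reaches 10, whereas "tensor_%04d" yields fixed-width, space-free names, presumably safer as unique QNN tensor identifiers. A quick demonstration:

#include <cstdio>

int main() {
    char name[16];
    snprintf(name, sizeof(name), "tensor_%2d", 7);  // -> "tensor_ 7" (embedded space)
    printf("[%s]\n", name);
    snprintf(name, sizeof(name), "tensor_%04d", 7); // -> "tensor_0007" (zero-padded)
    printf("[%s]\n", name);
    return 0;
}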
node); if (!ok) { QNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } @@ -3017,9 +2992,9 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const // new ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) // can following this style for mixed inference between CPU&GPU / CPU&NPU very easily GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) { - GGML_UNUSED(backend); + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - return ggml_qnn_compute_forward(nullptr, (ggml_tensor*)tensor); + return ggml_qnn_compute_forward(ctx, nullptr, (ggml_tensor*)tensor); } @@ -3104,27 +3079,36 @@ void ggml_backend_qnn_get_device_description(size_t dev_num, char * description, } -ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { - if (device_index >= GGML_QNN_MAX_DEVICES) { +ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { + if (device >= GGML_QNN_MAX_DEVICES) { QNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is out of range [0, %d]\n", - device_index, GGML_QNN_MAX_DEVICES - 1); + device, GGML_QNN_MAX_DEVICES - 1); return nullptr; } - static struct ggml_backend_buffer_type ggml_backend_buffer_type_qnn = { - /* .iface = */ { - /* .get_name = */ ggml_backend_qnn_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, - /* .is_host = */ ggml_backend_qnn_buffer_is_host - }, - /* .context = */ nullptr, - }; + static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; + + static bool ggml_backend_qnn_buffer_type_initialized = false; + + if (!ggml_backend_qnn_buffer_type_initialized) { + for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) { + ggml_backend_qnn_buffer_types[i] = { + /* .iface = */ { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes + /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_qnn_buffer_is_host + }, + /* .context = */ new ggml_backend_qnn_buffer_type_context { device, GGML_QNN_NAME + std::to_string(device) }, + }; + } + ggml_backend_qnn_buffer_type_initialized = true; + } - return &ggml_backend_buffer_type_qnn; + return &ggml_backend_qnn_buffer_types[device]; } @@ -3137,8 +3121,10 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { int result = 0; - if (nullptr == qnn_lib_path) + if (nullptr == qnn_lib_path) { + QNN_LOG_ERROR("invalid qnn lib path\n"); return nullptr; + } QNN_LOG_DEBUG("device %d", device); QNN_LOG_DEBUG("qnn_lib_path %s", qnn_lib_path); @@ -3147,18 +3133,6 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { return nullptr; } - if (nullptr != g_qnn_mgr[device].backend) { - QNN_LOG_ERROR("qnn backend %d(%s) already loaded", device, 
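The buffer-type hunk above replaces the single shared ggml_backend_buffer_type with a lazily initialized per-device table. Two things worth flagging: inside the for loop the context is built from the device argument rather than the loop index i, so every slot gets stamped with whichever device is requested first; and the bool guard alone is not thread-safe. A sketch of the same pattern with both points addressed (std::call_once plus the loop index; names are illustrative, this is not the patch's code):

#include <mutex>
#include <string>

struct buffer_type_context { int device; std::string name; };
constexpr int MAX_DEVICES = 3;

buffer_type_context * get_buffer_type_context(int device) {
    static buffer_type_context table[MAX_DEVICES];
    static std::once_flag      once;
    std::call_once(once, [] {             // hardened variant of the bool flag
        for (int i = 0; i < MAX_DEVICES; i++) {
            table[i].device = i;          // loop index, so each slot names itself
            table[i].name   = "qnn" + std::to_string(i);
        }
    });
    return (device >= 0 && device < MAX_DEVICES) ? &table[device] : nullptr;
}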
get_qnn_backend_name(device)); - if (device == g_current_device) { - g_qnn_backend = g_qnn_mgr[device].backend; - QNN_LOG_INFO("re-use cached backend %d(%s)", device, get_qnn_backend_name(device)); - return g_qnn_mgr[device].backend; - } else { - QNN_LOG_INFO("delete previous backend %d(%s)", device, get_qnn_backend_name(device)); - ggml_backend_qnn_free(g_qnn_backend); - } - } - std::string path = qnn_lib_path; if (QNN_BACKEND_NPU == device) { if (0 == setenv("LD_LIBRARY_PATH", @@ -3215,8 +3189,6 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { /* .context = */ &g_qnn_mgr[device] }; g_qnn_mgr[device].backend = qnn_backend; - g_qnn_backend = g_qnn_mgr[device].backend; - g_current_device = device; return qnn_backend; } diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 1041252f3770f..eb072beae6bd4 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -133,7 +133,7 @@ static bool ggml_graph_compute_helper( #define QK8_0 32 typedef struct { - uint16_t d; // delta + uint16_t d; // delta int8_t qs[QK8_0]; // quants } block_q8_0; @@ -158,6 +158,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { QNN_LOG_WARN("tensor is null"); return; } + if (tensor->type == GGML_TYPE_I8) { for (int h = 0; h < tensor->ne[3]; h++) { for (int i = 0; i < tensor->ne[2]; i++) { From 94ee77505832bdaf5fa72fd72c2fd4031c57eefc Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Fri, 7 Jun 2024 14:56:07 +0800 Subject: [PATCH 08/16] review: remove static global vars to support multi-instance simultaneously and thread safe --- ggml-qnn.cpp | 40 ++++++++++++++-------------------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 867f01625ad7f..f45a6449ccae3 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -697,9 +697,9 @@ class qnn_interface { } private: - const QnnInterface_t *_qnn_interface = nullptr; + const QnnInterface_t * _qnn_interface = nullptr; - const QnnSystemInterface_t *_qnn_sys_interface = nullptr; + const QnnSystemInterface_t * _qnn_sys_interface = nullptr; }; @@ -848,7 +848,7 @@ class qnn_instance { return 0; } - std::string &get_qnn_graph_name() { return _graph_name; } + std::string & get_qnn_graph_name() { return _graph_name; } bool is_rpcmem_initialized() { return _rpcmem_initialized; @@ -911,7 +911,7 @@ class qnn_instance { qnn_interface _qnn_interface; - void *_system_lib_handle = nullptr; + void * _system_lib_handle = nullptr; Qnn_GraphHandle_t _qnn_graph_handle = nullptr; @@ -927,7 +927,7 @@ class qnn_instance { QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; + QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; uint32_t _qnn_power_configid = 1; uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing @@ -936,12 +936,12 @@ class qnn_instance { std::unordered_set _qnn_mem_set; - static std::mutex _init_mutex; - static std::unordered_map _loaded_lib_handle; - static std::unordered_map _lib_path_to_backend_id; - static std::unordered_map _loaded_backend; + std::mutex _init_mutex; + std::unordered_map _loaded_lib_handle; + std::unordered_map _lib_path_to_backend_id; + std::unordered_map _loaded_backend; - void *_rpc_lib_handle = nullptr; + void * _rpc_lib_handle = nullptr; std::atomic_bool _rpcmem_initialized{false}; pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; pfn_rpc_mem_free _pfn_rpc_mem_free; @@ -950,26 +950,15 @@ class qnn_instance { 
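The pfn_rpc_mem_* members listed here are bound at runtime: qnn_init() dlopen()s libcdsprpc.so and resolves each symbol with dlsym(), as the earlier hunks show. A reduced sketch of that binding for a single symbol (the "rpcmem_alloc" symbol name is inferred from the typedef, not quoted from this file):

#include <cstdint>
#include <cstdio>
#include <dlfcn.h>

using pfn_rpc_mem_alloc = void * (*)(int, uint32_t, int);

pfn_rpc_mem_alloc bind_rpc_mem_alloc() {
    void * h = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); // as qnn_init does
    if (h == nullptr) {
        fprintf(stderr, "dlopen failed: %s\n", dlerror());
        return nullptr;
    }
    auto fn = reinterpret_cast<pfn_rpc_mem_alloc>(dlsym(h, "rpcmem_alloc"));
    if (fn == nullptr) {
        fprintf(stderr, "dlsym failed: %s\n", dlerror());
        dlclose(h);
    }
    return fn; // the real code keeps h (_rpc_lib_handle) so it can dlclose at teardown
}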
pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; std::unordered_map _rpcmem_store_map; - std::string _graph_name; }; - // ================================================================================================= // // implementation of wrapper class // // ================================================================================================= -std::mutex qnn_instance::_init_mutex; - -std::unordered_map qnn_instance::_loaded_lib_handle; - -std::unordered_map qnn_instance::_lib_path_to_backend_id; - -std::unordered_map qnn_instance::_loaded_backend; - - void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { if (!_rpcmem_initialized) { QNN_LOG_WARN("rpc memory not initialized\n"); @@ -977,14 +966,13 @@ void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { } auto allocate_bytes = static_cast(bytes + alignment); - void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); if (buf == nullptr) { QNN_LOG_WARN("failed to allocate rpc memory\n"); return nullptr; } - auto aligned_buf = reinterpret_cast(align_to(alignment, - reinterpret_cast(buf))); + auto aligned_buf = reinterpret_cast(align_to(alignment,reinterpret_cast(buf))); bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; if (!status) { QNN_LOG_WARN("failed to allocate rpc memory\n"); @@ -1097,7 +1085,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * Qnn_ErrorHandle_t error = QNN_SUCCESS; QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); - void *lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); if (nullptr == lib_handle) { QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); return 1; @@ -1113,7 +1101,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * // get QnnInterface Providers std::uint32_t num_providers = 0; - const QnnInterface_t **provider_list = nullptr; + const QnnInterface_t ** provider_list = nullptr; error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); From 5d691c6cd05b4ff51f181272b8cb4df0dcb0e0ba Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Sat, 8 Jun 2024 09:22:39 +0800 Subject: [PATCH 09/16] review: put qnn's internal log inside preprocessor directive --- ggml-qnn.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index f45a6449ccae3..072003e1d76b8 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1277,6 +1277,7 @@ static void ggml_qnn_logcallback(const char * fmt, uint64_t timestamp, va_list argp) { +#if GGML_QNN_DEBUG static std::mutex log_mutex; static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN]; @@ -1303,7 +1304,6 @@ static void ggml_qnn_logcallback(const char * fmt, } double ms = (double) timestamp / 1000000.0; -#if GGML_QNN_DEBUG { std::lock_guard lock(log_mutex); From fdf0272dfb29cd640de92d6e54dce448c48a156e Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Sat, 8 Jun 2024 17:56:32 +0800 Subject: [PATCH 10/16] review: code format using clang-format + manual modification according to review comments --- ggml-qnn.cpp | 2793 +++++++++++++++++++++++++------------------------- 1 file changed, 1414 insertions(+), 1379 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 
072003e1d76b8..3c5ff332a1df2 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -54,132 +54,166 @@ #include #endif - // ================================================================================================= // -// forward/external/helper declaration +// forward declaration // // ================================================================================================= class qnn_instance; - -static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); - +static void ggml_qnn_log_internal(ggml_log_level level, const char * file, + const char * func, int line, + const char * format, ...); // ================================================================================================= // // self-defined macro / data structure // // ================================================================================================= -#define RPCMEM_DEFAULT_FLAGS 1 -#define RPCMEM_HEAP_ID_SYSTEM 25 +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 -#define GGML_QNN_LOGBUF_LEN 4096 +#define GGML_QNN_DEBUG 1 // for troubleshooting QNN backend +#define ENABLE_QNN_LOG 0 // enable/disable QNN internal log +#define GGML_QNN_LOGBUF_LEN 4096 +#define QNN_VER_PTR(x) (&((x).v1)) +#define GGML_QNN_NAME "qnn" -#define GGML_QNN_DEBUG 1 //for troubleshooting QNN backend +#define QNN_LOG_ERROR(...) \ + ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_WARN(...) \ + ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#define QNN_LOG_INFO(...) \ + ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #if GGML_QNN_DEBUG -#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_DEBUG(...) \ + ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #else #define QNN_LOG_DEBUG(...) 
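When GGML_QNN_DEBUG is 0, the #else branch just above defines QNN_LOG_DEBUG with an empty body, so every debug log site, including its argument expressions, disappears at preprocessing time and costs nothing at runtime. A self-contained illustration of the idiom (names are illustrative):

#include <cstdio>

#define MY_DEBUG 0

#if MY_DEBUG
#define LOG_DEBUG(...) fprintf(stderr, __VA_ARGS__)
#else
#define LOG_DEBUG(...)          // expands to nothing: zero runtime cost
#endif

int expensive_probe() { return 42; }

int main() {
    LOG_DEBUG("probe=%d\n", expensive_probe()); // with MY_DEBUG=0 this line vanishes,
    return 0;                                   // so expensive_probe() is never called
}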
#endif -#define QNN_VER_PTR(x) (&((x).v1)) -#define GGML_QNN_NAME "qnn" - -#define VALIDATE(value, status) \ - do { \ - status = value; \ - if (status != QNN_SUCCESS) { \ - QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ - return status; \ - } \ - } while (0) - -#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) - -#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) -#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) -#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) -#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) -#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) -#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) -#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) -#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) -#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) - -#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) -#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) -#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) -#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) -#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) -#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) -#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) -#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) -#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) set_qnn_tensor_memtype(tensor, value) -#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) -#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) - -using pfn_rpc_mem_init = void (*)(void); -using pfn_rpc_mem_deinit = void (*)(void); -using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); -using pfn_rpc_mem_free = void (*)(void *); -using pfn_rpc_mem_to_fd = int (*)(void *); - -using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); -using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); -using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); - - +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ + } while (0) + +#define VALIDATE_TENSOR_VERSION(tensor, err) \ + VALIDATE(validate_tensor_version(tensor), err) + +#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) 
set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) \ + set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) \ + set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) \ + set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) \ + set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) \ + set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) \ + set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) \ + set_qnn_tensor_memhandle(tensor, value) + +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); + +using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); +using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); +using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); enum class ggml_qnn_profile_level { - profile_off = 0, - profile_basic = 1, - profile_detail = 2 + profile_off = 0, + profile_basic = 1, + profile_detail = 2 }; struct ggml_backend_qnn_context { - int device; - int threads; - char name[GGML_MAX_NAME]; - char lib[GGML_MAX_NAME]; - qnn_instance * instance; - struct ggml_backend * backend; - QNN_INTERFACE_VER_TYPE raw_interface; + int device; + int threads; + char name[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; + struct ggml_backend * backend; + QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; -} ; - -typedef void (* ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); - -typedef void (* ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx, const ggml_op ggml_op, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +}; -// ================================================================================================= -// -// static global variables -// -// ================================================================================================= -//static ggml_backend_t g_qnn_backend = nullptr; - -//according to the QNN SDK Reference Guide, -//CPU - Choose a non-quantized model. Quantized models are currently incompatible with the CPU backend -//GPU - Choose a non-quantized model. Quantized models are currently incompatible with the GPU backend -//HTP - Choose a quantized model. Quantized models are required when running on the HTP backend -//DSP - Choose a quantized model. Quantized models are required when running on the DSP backend -//HTA - Choose a quantized model. 
Quantized models are required when running on the HTA backend +typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst); + +typedef void (*ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx, + const ggml_op ggml_op, + const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst); + +// according to the QNN SDK Reference Guide, +// CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend +// GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend +// HTP - Choose a quantized model. Quantized models are required when running on the HTP backend +// DSP - Choose a quantized model. Quantized models are required when running on the DSP backend +// HTA - Choose a quantized model. Quantized models are required when running on the HTA backend // -//only focus on Qualcomm CPU/GPU/NPU backend in this implementation of QNN backend for ggml currently -//Qualcomm CPU: Qualcomm Kryo CPU -//Qualcomm GPU: Qualcomm Adreno GPU -//Qualcomm NPU: aka HTP(Hexagon Tensor Processor), ~= cDSP(Compute DSP) + HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) +// only focus on Qualcomm CPU/GPU/NPU backend in this implementation of QNN backend for ggml currently, +// CPU: Qualcomm Kryo CPU +// GPU: Qualcomm Adreno GPU +// NPU: Qualcomm NPU: aka HTP(Hexagon Tensor Processor), ~= cDSP(Compute DSP) + +// HMX(Hexagon Matrix eXtensions)/HTA(Hexagon Tensor Accelerator) static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { - [QNN_BACKEND_CPU] = {.device = 0, .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, - [QNN_BACKEND_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, - [QNN_BACKEND_NPU] = {.device = 2, .threads = 1, .name = "qnn-npu", .lib = "libQnnHtp.so", .instance = nullptr, .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}}, + [QNN_BACKEND_CPU] = {.device = 0, + .threads = 1, + .name = "qnn-cpu", + .lib = "libQnnCpu.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}}, + + [QNN_BACKEND_GPU] = {.device = 1, + .threads = 1, + .name = "qnn-gpu", + .lib = "libQnnGpu.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}}, + + [QNN_BACKEND_NPU] = {.device = 2, + .threads = 1, + .name = "qnn-npu", + .lib = "libQnnHtp.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}}, }; // ================================================================================================= @@ -189,15 +223,14 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { // ================================================================================================= static inline int validate_tensor_version(Qnn_Tensor_t tensor) { if (tensor.version != QNN_TENSOR_VERSION_1) { - QNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", - tensor.v1.name, - tensor.version); + QNN_LOG_WARN( + "validate_tensor_version() tensor %s, got unsupported version %d\n", + tensor.v1.name, tensor.version); return 1; } return 0; } - static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { if (tensor.version == 
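
A note on the accessor family in this hunk: Qnn_Tensor_t is a versioned type, so every getter and setter gates its field access on tensor.version before touching the v1 member, and getters fall back to a benign default for unknown versions. The shape of the pattern, reduced to a hypothetical two-field struct (names invented for illustration; the real Qnn_Tensor_t comes from QnnTypes.h):

    #include <cstdint>

    struct example_tensor_v1 { uint32_t id; const char * name; };
    struct example_tensor {
        uint32_t version;                 // discriminates which union member is valid
        union { example_tensor_v1 v1; };
    };

    static inline uint32_t example_get_id(const example_tensor & t) {
        if (t.version == 1) {
            return t.v1.id;               // only touch v1 after the version check
        }
        return 0u;                        // benign default, mirrors the getters in this hunk
    }
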
QNN_TENSOR_VERSION_1) { return tensor.v1.id; @@ -206,7 +239,6 @@ static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { return 0u; } - static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.name; @@ -214,8 +246,6 @@ static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { return nullptr; } - - static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.type; @@ -223,31 +253,30 @@ static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { return QNN_TENSOR_TYPE_UNDEFINED; } - -static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { +static inline Qnn_TensorDataFormat_t + get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dataFormat; } return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; } - -static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { +static inline Qnn_DataType_t + get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dataType; } return QNN_DATATYPE_UNDEFINED; } - -static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { +static inline Qnn_QuantizeParams_t + get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.quantizeParams; } return QNN_QUANTIZE_PARAMS_INIT; } - static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.rank; @@ -255,7 +284,6 @@ static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { return 0u; } - static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dimensions; @@ -263,7 +291,6 @@ static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) return nullptr; } - static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.memType; @@ -271,109 +298,95 @@ static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & te return QNN_TENSORMEMTYPE_UNDEFINED; } - static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.id = id; } } - static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.name = name; } } - static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.type = type; } } - static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dataFormat = format; } } - static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dataType = dataType; } } - static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.quantizeParams = params; } } - static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { if 
(tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.rank = rank; } } - static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dimensions = dims; } } - -static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { +static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t mem_type) { if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memType = memType; + tensor.v1.memType = mem_type; } } - -static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { +static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t client_buf) { if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.clientBuf = clientBuf; + tensor.v1.clientBuf = client_buf; } } - static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.memHandle = handle; } } +static size_t memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { + if (!dst || !src || !dst_size || !copy_size) return 0; -static size_t memscpy(void * dst, size_t dstSize, const void * src, size_t copySize) { - if (!dst || !src || !dstSize || !copySize) - return 0; - - size_t minSize = dstSize < copySize ? dstSize : copySize; + size_t min_size = dst_size < copy_size ? dst_size : copy_size; - memcpy(dst, src, minSize); + memcpy(dst, src, min_size); - return minSize; + return min_size; } - static char * ggml_qnn_strndup(const char * source, size_t maxlen) { return ::strndup(source, maxlen); } - static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { int err = 0; VALIDATE_TENSOR_VERSION(src, err); dst.version = src.version; QNN_TENSOR_SET_NAME( - dst, ggml_qnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); - if (QNN_TENSOR_GET_NAME(dst) == nullptr) { + dst, ggml_qnn_strndup(QNN_TENSOR_GET_NAME(src), + std::string(QNN_TENSOR_GET_NAME(src)).size())); + if (nullptr == QNN_TENSOR_GET_NAME(dst)) { return 1; } QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); @@ -382,8 +395,6 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); - // Only metadata (i.e. non-static data) is copied from source to destination. 
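
One behavioural note on memscpy() above that the reflow makes easy to miss: the copy length is clamped to min(dst_size, copy_size) and the clamped byte count is returned, so an undersized destination truncates silently instead of overflowing. A hypothetical caller that wants to detect truncation (src_dims is an invented name) would compare against the return value:

    uint32_t dims[GGML_MAX_DIMS] = {};
    size_t want = 8 * sizeof(uint32_t); // deliberately larger than sizeof(dims)
    size_t got  = memscpy(dims, sizeof(dims), src_dims, want);
    if (got < want) {
        QNN_LOG_WARN("dimension copy truncated: %zu of %zu bytes\n", got, want);
    }
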
The union still - // must be initialized so that the clientBuf/memHandle do not contain garbage data if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { Qnn_ClientBuffer_t client_buf = {nullptr, 0}; QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); @@ -393,48 +404,47 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { return 1; } - Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); - Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { - // need to allocate and copy memory for scaleOffset as it is a pointer array - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_AxisScaleOffset_t &axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; - Qnn_ScaleOffset_t **scaleOffset = &axis_scale_offset.scaleOffset; - size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); - *scaleOffset = (Qnn_ScaleOffset_t *)malloc(scaleOffsetSize); - memscpy(*scaleOffset, - scaleOffsetSize, + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t **scaleOffset = & axis_scale_offset.scaleOffset; + size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scaleOffset = (Qnn_ScaleOffset_t *) malloc(scaleOffsetSize); + memscpy(*scaleOffset, scaleOffsetSize, src_qparam.axisScaleOffsetEncoding.scaleOffset, scaleOffsetSize); QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { - // need to allocate and copy memory for scaleOffset as it is a pointer array - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_BwAxisScaleOffset_t &bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; - size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); - float **scales = &bwaxis_scale_offset.scales; - int32_t **offsets = &bwaxis_scale_offset.offsets; - *scales = (float *)malloc(scaleSize); - memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, scaleSize); - - // only copy offsets if present, nullptr implies all offsets are 0 + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_BwAxisScaleOffset_t & bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; + size_t scaleSize = bwaxis_scale_offset.numElements * sizeof(float); + float ** scales = &bwaxis_scale_offset.scales; + int32_t ** offsets = &bwaxis_scale_offset.offsets; + *scales = (float *) malloc(scaleSize); + memscpy(*scales, scaleSize, src_qparam.bwAxisScaleOffsetEncoding.scales, + scaleSize); + if (bwaxis_scale_offset.offsets != nullptr) { size_t offsetSize = bwaxis_scale_offset.numElements * sizeof(int32_t); - *offsets = (int32_t *)malloc(offsetSize); - memscpy(*offsets, offsetSize, src_qparam.bwAxisScaleOffsetEncoding.offsets, offsetSize); + *offsets = (int32_t *) malloc(offsetSize); + memscpy(*offsets, offsetSize, + src_qparam.bwAxisScaleOffsetEncoding.offsets, offsetSize); } QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); } else { QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam); } - // allocate and copy memory for all the pointer members uint32_t rank = QNN_TENSOR_GET_RANK(src); QNN_TENSOR_SET_RANK(dst, rank); - size_t dim_size = rank * sizeof(uint32_t); - uint32_t * dimensions = (uint32_t *)malloc(dim_size); + 
size_t dim_size = rank * sizeof(uint32_t); + uint32_t *dimensions = (uint32_t *) malloc(dim_size); if (dimensions == nullptr) { QNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying " "tensor %s\n", QNN_TENSOR_GET_NAME(src)); return 1; } memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); @@ -443,7 +453,6 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { return err; } - static int free_qnn_tensor(Qnn_Tensor_t & tensor) { int err = 0; VALIDATE_TENSOR_VERSION(tensor, err); @@ -454,7 +463,6 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor) { return err; } - static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { uint32_t rank = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { @@ -465,44 +473,40 @@ static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { return rank; } - -//TODO: mapping more ggml data type to QNN data type -//ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +// TODO: mapping more ggml data type to QNN data type +// ref: explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { switch (ggmltype) { - case GGML_TYPE_F16: - return QNN_DATATYPE_FLOAT_16; - case GGML_TYPE_F32: - return QNN_DATATYPE_FLOAT_32; - case GGML_TYPE_I8: - return QNN_DATATYPE_INT_8; - case GGML_TYPE_Q8_0: - return QNN_DATATYPE_SFIXED_POINT_8; - default: - break; - + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; + default: + break; } return QNN_DATATYPE_UNDEFINED; } - -//TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT +// TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { switch (ggmlop) { - case GGML_OP_ADD: - return QNN_OP_ELEMENT_WISE_ADD; - case GGML_OP_MUL: - return QNN_OP_ELEMENT_WISE_MULTIPLY; - case GGML_OP_MUL_MAT: - return QNN_OP_MAT_MUL; - default: - break; + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL: + return QNN_OP_ELEMENT_WISE_MULTIPLY; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; } return nullptr; } - static uint32_t ggml_get_tensor_data_size(const ggml_tensor * tensor) { /* size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); @@ -516,86 +520,85 @@ static uint32_t ggml_get_tensor_data_size(const ggml_tensor * tensor) { return ggml_nbytes(tensor); } - -template<typename Fn> +template <typename Fn> Fn load_qnn_functionpointers(void * handle, const char * function_name) { return reinterpret_cast<Fn>(dlsym(handle, function_name)); } - static const char * get_qnn_backend_name(int n_backend_type) { switch (n_backend_type) { - case 0: - return "QNN-CPU"; - case 1: - return "QNN-GPU"; - case 2: - return "QNN-NPU"; - case 3: - return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML - - default: - return "unknown"; + case 0: + return "QNN-CPU"; + case 1: + return "QNN-GPU"; + case 2: + return "QNN-NPU"; + case 3: + return "ggml"; //"fake" QNN backend, used to compare performance between QNN backend and original GGML + default: + return "unknown"; } } - static intptr_t align_to(size_t alignment, intptr_t offset) { - return offset % alignment == 0 ? offset - : offset + - (static_cast<intptr_t>(alignment) - - offset % static_cast<intptr_t>(alignment)); + return offset % alignment == 0 + ? offset + : offset + (static_cast<intptr_t>(alignment) - + offset % static_cast<intptr_t>(alignment)); } - -static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) { +static void ggml_qnn_log_internal(ggml_log_level level, const char * file, + const char * func, int line, + const char * format, ...) { static std::mutex ggml_qnn_log_internal_mutex; - static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; + static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; { std::lock_guard<std::mutex> lock(ggml_qnn_log_internal_mutex); - va_list args; + va_list args; + va_start(args, format); - int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); - int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); + int len_prefix = + snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, + "[%s, %d]: ", func, line); + int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, + GGML_QNN_LOGBUF_LEN - len_prefix, format, args); if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { #if (defined __ANDROID__) || (defined ANDROID) - //for Android APK + // for Android APK __android_log_print(level, "ggml-qnn", "%s\n", s_ggml_qnn_log_internal_buf); #endif - //for Android command line application or WoA + // for Android command line application or WoA printf("%s\n", s_ggml_qnn_log_internal_buf); } va_end(args); } } - // ================================================================================================= // -// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI +// Engine Direct) SDK // // ================================================================================================= class qnn_interface { -#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ - template <typename... Args> \ - inline auto qnn_##F(Args... args) const { \ - return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ - std::forward<Args>(args)...); \ - } - +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template <typename... Args> inline auto qnn_##F(Args... args) const { \ + return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ + std::forward<Args>(args)...); \ + } -#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ - template <typename... Args> \ - inline auto qnn_##F(Args... args) const { \ - return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ - std::forward<Args>(args)...); \ - } +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template <typename... Args> inline auto qnn_##F(Args...
args) const { \ + return ( \ + _qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } friend class qnn_instance; -public: + public: qnn_interface() = default; // QnnBackend @@ -603,31 +606,38 @@ class qnn_interface { DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, + backendRegisterOpPackage); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, + backendValidateOpConfig); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, + backendGetApiVersion); // QnnDevice DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, + deviceGetInfrastructure); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, + deviceGetPlatformInfo); DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); // QnnContext DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, + contextGetBinarySize); DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); - DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, + contextCreateFromBinary); DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); @@ -666,17 +676,22 @@ class qnn_interface { DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); // QnnProperty - DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, + propertyHasCapability); // QnnTensor - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, + tensorCreateContextTensor); - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, + tensorCreateGraphTensor); // QnnSystem - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, + systemContextCreate); - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, + systemContextGetBinaryInfo); DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); @@ -684,67 +699,60 @@ class qnn_interface { _qnn_interface = qnn_interface; } - void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { + void set_qnn_system_interface( + const QnnSystemInterface_t * qnn_sys_interface) { _qnn_sys_interface = qnn_sys_interface; } - uint32_t get_backend_id() const { - return _qnn_interface->backendId; - } + uint32_t get_backend_id() const { return _qnn_interface->backendId; } bool is_loaded() const { return ((_qnn_sys_interface != nullptr) && 
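
To make the two shim macros above concrete, here is DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate) expanded by hand (illustration only; the real text is produced by the preprocessor):

    template <typename... Args> inline auto qnn_backend_create(Args... args) const {
        // forwards to the function pointer stored in the version-selected interface table
        return (_qnn_interface->QNN_INTERFACE_VER_NAME.backendCreate)(
            std::forward<Args>(args)...);
    }

Each generated qnn_* member is therefore a thin forwarding wrapper around one entry of the QNN interface table, which is what lets the rest of the backend call qnn_backend_create(...), qnn_graph_execute(...), etc. without repeating the table lookup.
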
(_qnn_interface != nullptr)); } -private: + private: const QnnInterface_t * _qnn_interface = nullptr; const QnnSystemInterface_t * _qnn_sys_interface = nullptr; }; - - // ================================================================================================= // // wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK // -// and -// -// resource management of QNN resources for GGML's QNN backend // ================================================================================================= class qnn_instance { -public: + public: using BackendIdType = decltype(QnnInterface_t{}.backendId); - explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, - const std::string & model_name) : - _lib_path(std::move(lib_path)), - _backend_name(std::move(backend_name)), - _model_name(std::move(model_name)) {}; + explicit qnn_instance(const std::string & lib_path, + const std::string & backend_name, + const std::string & model_name) + : _lib_path(std::move(lib_path)) + , _backend_name(std::move(backend_name)) + , _model_name(std::move(model_name)){}; - ~qnn_instance() { - } + ~qnn_instance() {} int qnn_init(const QnnSaver_Config_t ** saver_config); int qnn_finalize(); - const qnn_interface &get_qnn_interface() { + const qnn_interface & get_qnn_interface() { if (!_qnn_interface.is_loaded()) { QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } return _qnn_interface; } - - const QNN_INTERFACE_VER_TYPE &get_qnn_raw_interface() { + const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { if (!_qnn_interface.is_loaded()) { QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } return _qnn_raw_interface; } - const QNN_SYSTEM_INTERFACE_VER_TYPE &get_qnn_raw_system_interface() { + const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { if (!_qnn_interface.is_loaded()) { QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } @@ -753,24 +761,31 @@ class qnn_instance { const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } - const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + const Qnn_ProfileHandle_t get_qnn_profile_handle() { + return _qnn_profile_handle; + } - const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + const Qnn_DeviceHandle_t get_qnn_device_handle() { + return _qnn_device_handle; + } - const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + const Qnn_BackendHandle_t get_qnn_backend_handle() { + return _qnn_backend_handle; + } - const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + const Qnn_ContextHandle_t get_qnn_context_handle() { + return _qnn_context_handle; + } - const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + const QnnSystemContext_Handle_t get_qnn_system_handle() { + return _qnn_system_handle; + } const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } - - int init_qnn_graph(const char * graph_name, - bool debug, - uint8_t do_node_validation = 1, - const QnnGraph_Config_t ** graph_configs = nullptr - ); + int init_qnn_graph(const char * graph_name, bool debug, + uint8_t do_node_validation = 1, + const QnnGraph_Config_t ** graph_configs = nullptr); int finalize_qnn_graph(); @@ -782,35 +797,35 @@ class qnn_instance { return 1; } - QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); - QnnHtpDevice_PerfInfrastructure_t *htp_perfinfra = &htp_infra->perfInfra; + 
QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; uint32_t power_configid = 1; - uint32_t device_id = 0; - uint32_t core_id = 0; + uint32_t device_id = 0; + uint32_t core_id = 0; htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); - _qnn_htp_perfinfra = htp_perfinfra; + _qnn_htp_perfinfra = htp_perfinfra; _qnn_power_configid = power_configid; return 0; } - int set_rpc_polling() { if (_qnn_rpc_pollingtime > 0) { QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingTime; memset(&rpc_pollingTime, 0, sizeof(rpc_pollingTime)); rpc_pollingTime.option = - QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; rpc_pollingTime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; - const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = {&rpc_pollingTime, nullptr}; + const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = { + &rpc_pollingTime, nullptr}; if (_qnn_htp_perfinfra) { - _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, + powerConfigs); } } return 0; } - int set_high_performance_mode() { if (nullptr == _qnn_htp_perfinfra) { QNN_LOG_DEBUG("perf intra is null\n"); @@ -820,39 +835,49 @@ class qnn_instance { QnnHtpPerfInfrastructure_PowerConfig_t powerConfig; memset(&powerConfig, 0, sizeof(powerConfig)); powerConfig.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - powerConfig.dcvsV3Config.dcvsEnable = 0; + powerConfig.dcvsV3Config.dcvsEnable = 0; powerConfig.dcvsV3Config.setDcvsEnable = 1; - powerConfig.dcvsV3Config.contextId = _qnn_power_configid; + powerConfig.dcvsV3Config.contextId = _qnn_power_configid; powerConfig.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; - powerConfig.dcvsV3Config.setSleepLatency = 1; // True to consider Latency parameter otherwise False - powerConfig.dcvsV3Config.setBusParams = 1; // True to consider Bus parameter otherwise False - powerConfig.dcvsV3Config.setCoreParams = 1; // True to consider Core parameter otherwise False - powerConfig.dcvsV3Config.sleepDisable = 0; // True to consider sleep/LPM modes, False to enable - powerConfig.dcvsV3Config.setSleepDisable = 0; // True to consider sleep disable/enable parameter otherwise False - // set Sleep latency parameter + powerConfig.dcvsV3Config.setSleepLatency = + 1; // true to consider Latency parameter otherwise False + powerConfig.dcvsV3Config.setBusParams = + 1; // true to consider Bus parameter otherwise False + powerConfig.dcvsV3Config.setCoreParams = + 1; // true to consider Core parameter otherwise False + powerConfig.dcvsV3Config.sleepDisable = + 0; // true to consider sleep/LPM modes, False to enable + powerConfig.dcvsV3Config.setSleepDisable = + 0; // true to consider sleep disable/enable parameter otherwise False set sleep latency parameter uint32_t latencyValue = 40; - powerConfig.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 micro sec - // set Bus Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) - powerConfig.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - // set Core Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t 
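
Context for the HTP helpers in this region: they only make sense in sequence, since setPowerConfig() needs the perf-infrastructure pointer and power-config id produced by createPowerConfigId(). A plausible call order from the NPU init path (an assumption about intended usage; init_htp_perfinfra is a guessed name for the helper wrapping createPowerConfigId, whose declaration falls outside this hunk):

    // assumed orchestration for the NPU (HTP) backend; CPU/GPU backends skip this
    if (0 == instance->init_htp_perfinfra()) { // fills _qnn_htp_perfinfra + _qnn_power_configid
        instance->set_rpc_polling();           // optional: RPC polling interval for the HTP
        instance->set_high_performance_mode(); // DCVS v3 pinned to the max voltage corners
    }
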
enum) - powerConfig.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.sleepLatency = + latencyValue; // range 40-2000 micro sec + // set Bus Clock Parameters + powerConfig.dcvsV3Config.busVoltageCornerMin = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.busVoltageCornerTarget = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.busVoltageCornerMax = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set Core Clock Parameters + powerConfig.dcvsV3Config.coreVoltageCornerMin = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.coreVoltageCornerTarget = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + powerConfig.dcvsV3Config.coreVoltageCornerMax = + DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; // set power config with different performance parameters - const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = {&powerConfig, nullptr}; + const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = { + &powerConfig, nullptr}; _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); return 0; } - std::string & get_qnn_graph_name() { return _graph_name; } + std::string &get_qnn_graph_name() { return _graph_name; } - bool is_rpcmem_initialized() { - return _rpcmem_initialized; - } + bool is_rpcmem_initialized() { return _rpcmem_initialized; } void set_rpcmem_initialized(bool initialized) { _rpcmem_initialized = initialized; @@ -864,7 +889,7 @@ class qnn_instance { void unregister_rpcmem(); - void *alloc_rpcmem(size_t bytes, size_t alignment); + void * alloc_rpcmem(size_t bytes, size_t alignment); void free_rpcmem(void * buf); @@ -874,15 +899,17 @@ class qnn_instance { return _qnn_mem_set.count(handle) != 0U; } -public: - std::map> _qnn_graph_map; + public: + std::map> + _qnn_graph_map; -private: + private: int load_system(); int unload_system(); - int load_backend(std::string &lib_path, const QnnSaver_Config_t ** saver_config); + int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); int unload_backend(); @@ -890,24 +917,25 @@ class qnn_instance { _qnn_raw_interface = raw_interface; } - void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE &raw_interface) { + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { _qnn_raw_system_interface = raw_interface; } -private: + private: static constexpr const int _required_num_providers = 1; -private: - std::string _lib_path; - std::string _backend_name; - std::string _model_name; // prebuilt QNN model name, not used in currently + private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // prebuilt QNN model name, not used currently BackendIdType _backend_id; - bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode - bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode + bool _do_node_validations = true; // flag to indicate whether all add_node + // calls need to be validated + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; + 
ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; qnn_interface _qnn_interface; @@ -927,36 +955,35 @@ class qnn_instance { QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; - uint32_t _qnn_power_configid = 1; + QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing - QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; std::unordered_set _qnn_mem_set; - std::mutex _init_mutex; - std::unordered_map _loaded_lib_handle; + std::mutex _init_mutex; + std::unordered_map _loaded_lib_handle; std::unordered_map _lib_path_to_backend_id; std::unordered_map _loaded_backend; - void * _rpc_lib_handle = nullptr; - std::atomic_bool _rpcmem_initialized{false}; - pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; - pfn_rpc_mem_free _pfn_rpc_mem_free; - pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; - pfn_rpc_mem_init _pfn_rpc_mem_init; - pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + void * _rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{false}; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; std::unordered_map _rpcmem_store_map; std::string _graph_name; }; - // ================================================================================================= // -// implementation of wrapper class +// implementation of QNN wrapper class // // ================================================================================================= void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { @@ -965,15 +992,18 @@ void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { return nullptr; } - auto allocate_bytes = static_cast(bytes + alignment); - void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + auto allocate_bytes = static_cast(bytes + alignment); + void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, + allocate_bytes); if (buf == nullptr) { QNN_LOG_WARN("failed to allocate rpc memory\n"); return nullptr; } - auto aligned_buf = reinterpret_cast(align_to(alignment,reinterpret_cast(buf))); - bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + auto aligned_buf = reinterpret_cast( + align_to(alignment, reinterpret_cast(buf))); + bool status = + _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; if (!status) { QNN_LOG_WARN("failed to allocate rpc memory\n"); _pfn_rpc_mem_free(buf); @@ -982,7 +1012,6 @@ void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { return aligned_buf; } - void qnn_instance::free_rpcmem(void * buf) { if (!_rpcmem_initialized) { QNN_LOG_WARN("rpc memory not initialized\n"); @@ -994,7 +1023,6 @@ void qnn_instance::free_rpcmem(void * buf) { } } - int32_t qnn_instance::rpcmem_to_fd(void * buf) { int32_t mem_fd = -1; if (!is_rpcmem_initialized()) { @@ -1006,7 +1034,6 @@ int32_t qnn_instance::rpcmem_to_fd(void * buf) { return mem_fd; } - int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { if (nullptr == p_data || (nullptr == p_tensor)) { QNN_LOG_WARN("invalid param\n"); @@ -1020,10 +1047,11 @@ int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { if 
(is_rpcmem_allocated(p_data)) { QNN_LOG_WARN("rpc memory already allocated\n"); - //return 3; + // return 3; } if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { - QNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + QNN_LOG_WARN("tensor %s has been registered shared memory\n", + (QNN_VER_PTR(*p_tensor)->name)); return 4; } @@ -1033,24 +1061,23 @@ int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { return 5; } QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = { - {QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr}, - QNN_VER_PTR(*p_tensor)->dataType, - QNN_MEM_TYPE_ION, - {{mem_fd}}}; - Qnn_MemHandle_t handle = nullptr; - int error = QNN_SUCCESS; - error = _qnn_interface.qnn_mem_register( - _qnn_context_handle, - &descriptor, - /*numDescriptors=*/1, - &handle); + Qnn_MemDescriptor_t descriptor = {{QNN_VER_PTR(*p_tensor)->rank, + QNN_VER_PTR(*p_tensor)->dimensions, + nullptr}, + QNN_VER_PTR(*p_tensor)->dataType, + QNN_MEM_TYPE_ION, + {{mem_fd}}}; + Qnn_MemHandle_t handle = nullptr; + int error = QNN_SUCCESS; + error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), - strerror(error)); + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", + QNN_GET_ERROR_CODE(error), strerror(error)); return 6; } else { - QNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + QNN_LOG_INFO("tensor %s successfully register shared memory\n", + (QNN_VER_PTR(*p_tensor)->name)); } QNN_VER_PTR(*p_tensor)->memHandle = handle; _qnn_mem_set.insert(handle); @@ -1058,7 +1085,6 @@ int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { return 0; } - void qnn_instance::unregister_rpcmem() { Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -1069,47 +1095,49 @@ void qnn_instance::unregister_rpcmem() { for (auto &mem_handle : _qnn_mem_set) { error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", + QNN_GET_ERROR_CODE(error)); } } _qnn_mem_set.clear(); } - bool qnn_instance::is_rpcmem_allocated(void * buf) { return _rpcmem_store_map.count(buf) != 0U; } - -int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { +int qnn_instance::load_backend(std::string & lib_path, + const QnnSaver_Config_t ** saver_config) { Qnn_ErrorHandle_t error = QNN_SUCCESS; QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); if (nullptr == lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + QNN_LOG_WARN("can not open QNN library %s, with error: %s", + lib_path.c_str(), dlerror()); return 1; } - // load get_provider function - auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>(lib_handle, - "QnnInterface_getProviders"); + auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>( + lib_handle, "QnnInterface_getProviders"); if (nullptr == get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + QNN_LOG_WARN("can not load symbol 
QnnInterface_getProviders : %s", + dlerror()); return 2; } - // get QnnInterface Providers std::uint32_t num_providers = 0; const QnnInterface_t ** provider_list = nullptr; error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to get providers, error %d", + QNN_GET_ERROR_CODE(error)); return 3; } QNN_LOG_DEBUG("num_providers=%d\n", num_providers); if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, + _required_num_providers); return 4; } @@ -1120,10 +1148,12 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * bool found_valid_interface = false; QNN_INTERFACE_VER_TYPE qnn_interface; for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && - QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + if (QNN_API_VERSION_MAJOR == + provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= + provider_list[idx]->apiVersion.coreApiVersion.minor) { found_valid_interface = true; - qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; break; } } @@ -1136,33 +1166,34 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * } set_qnn_raw_interface(qnn_interface); - BackendIdType backend_id = provider_list[0]->backendId; + BackendIdType backend_id = provider_list[0]->backendId; _lib_path_to_backend_id[lib_path] = backend_id; if (_loaded_backend.count(backend_id) > 0) { QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", - lib_path.c_str(), backend_id); + lib_path.c_str(), backend_id); } _loaded_backend[backend_id] = provider_list[0]; if (_loaded_lib_handle.count(backend_id) > 0) { QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); + QNN_LOG_WARN("fail to close %p with error %s\n", + _loaded_lib_handle[backend_id], dlerror()); } } _loaded_lib_handle[backend_id] = lib_handle; - _backend_id = backend_id; + _backend_id = backend_id; return 0; } - int qnn_instance::unload_backend() { int dlclose_error = 0; - for (auto &it : _loaded_lib_handle) { + for (auto & it : _loaded_lib_handle) { dlclose_error = dlclose(it.second); if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, + dlerror()); } } @@ -1173,7 +1204,6 @@ int qnn_instance::unload_backend() { return 0; } - int qnn_instance::load_system() { Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -1182,14 +1212,18 @@ int qnn_instance::load_system() { _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); if (nullptr == _system_lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + QNN_LOG_WARN("can not open QNN library %s, error: %s\n", + system_lib_path.c_str(), dlerror()); return 1; } - auto * get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym( - _system_lib_handle, 
"QnnSystemInterface_getProviders")); + auto * get_providers = + reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>( + dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); if (nullptr == get_providers) { - QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); + QNN_LOG_WARN( + "can not load QNN symbol QnnSystemInterface_getProviders: %s\n", + dlerror()); return 2; } @@ -1197,12 +1231,14 @@ int qnn_instance::load_system() { const QnnSystemInterface_t ** provider_list = nullptr; error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to get providers, error %d\n", + QNN_GET_ERROR_CODE(error)); return 3; } if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, + _required_num_providers); return 4; } @@ -1215,11 +1251,12 @@ int qnn_instance::load_system() { bool found_valid_system_interface = false; for (size_t idx = 0; idx < num_providers; idx++) { if (QNN_SYSTEM_API_VERSION_MAJOR == - provider_list[idx]->systemApiVersion.major && + provider_list[idx]->systemApiVersion.major && QNN_SYSTEM_API_VERSION_MINOR <= - provider_list[idx]->systemApiVersion.minor) { + provider_list[idx]->systemApiVersion.minor) { found_valid_system_interface = true; - qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + qnn_system_interface = + provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; break; } } @@ -1243,7 +1280,6 @@ int qnn_instance::load_system() { return 0; } - int qnn_instance::unload_system() { int result = 0; @@ -1262,7 +1298,8 @@ int qnn_instance::unload_system() { int dlclose_error = dlclose(_system_lib_handle); if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", + dlerror()); return 2; } @@ -1271,36 +1308,33 @@ int qnn_instance::unload_system() { return result; } +static void ggml_qnn_logcallback(const char * fmt, QnnLog_Level_t level, + uint64_t timestamp, va_list argp) { -static void ggml_qnn_logcallback(const char * fmt, - QnnLog_Level_t level, - uint64_t timestamp, - va_list argp) { - -#if GGML_QNN_DEBUG - static std::mutex log_mutex; +#if ENABLE_QNN_LOG + static std::mutex log_mutex; static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN]; const char * log_level_desc = ""; switch (level) { - case QNN_LOG_LEVEL_ERROR: - log_level_desc = " ERROR "; - break; - case QNN_LOG_LEVEL_WARN: - log_level_desc = "WARNING"; - break; - case QNN_LOG_LEVEL_INFO: - log_level_desc = " INFO "; - break; - case QNN_LOG_LEVEL_DEBUG: - log_level_desc = " DEBUG "; - break; - case QNN_LOG_LEVEL_VERBOSE: - log_level_desc = "VERBOSE"; - break; - case QNN_LOG_LEVEL_MAX: - log_level_desc = "UNKNOWN"; - break; + case QNN_LOG_LEVEL_ERROR: + log_level_desc = "ERROR"; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = "INFO"; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = "DEBUG"; + break; + case QNN_LOG_LEVEL_VERBOSE: + log_level_desc = "VERBOSE"; + break; + case QNN_LOG_LEVEL_MAX: + log_level_desc = "UNKNOWN"; + break; } double ms = (double) timestamp / 1000000.0; @@ -1314,12 +1348,11 @@ static void ggml_qnn_logcallback(const char * fmt, #endif } - int 
qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { BackendIdType backend_id = QNN_BACKEND_ID_NULL; QNN_LOG_DEBUG("enter qni_init\n"); - const std::lock_guard lock(_init_mutex); + std::lock_guard lock(_init_mutex); if (0 != load_system()) { QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); @@ -1328,39 +1361,43 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { QNN_LOG_DEBUG("load QNN system lib successfully\n"); } - std::string bakend_lib_path = _lib_path + _backend_name; - if (0 == _lib_path_to_backend_id.count(bakend_lib_path)) { - int is_load_ok = load_backend(bakend_lib_path, saver_config); + std::string backend_lib_path = _lib_path + _backend_name; + if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { + int is_load_ok = load_backend(backend_lib_path, saver_config); if (0 != is_load_ok) { QNN_LOG_WARN("failed to load QNN backend\n"); return 2; } } - backend_id = _lib_path_to_backend_id[bakend_lib_path]; + backend_id = _lib_path_to_backend_id[backend_lib_path]; if (0 == _loaded_backend.count(backend_id) || 0 == _loaded_lib_handle.count(backend_id)) { - QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, loaded lib_handle count=%zu\n", - bakend_lib_path.c_str(), - _loaded_backend.count(backend_id), - _loaded_lib_handle.count(backend_id)); + QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, " + "loaded lib_handle count=%zu\n", + backend_lib_path.c_str(), _loaded_backend.count(backend_id), + _loaded_lib_handle.count(backend_id)); return 3; } _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); + _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, + &_qnn_log_handle); if (nullptr == _qnn_log_handle) { - QNN_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone + QNN_LOG_WARN( + "why failed to initialize qnn log\n"); // NPU backend not work on + // Qualcomm SoC equipped low-end phone return 4; } else { QNN_LOG_DEBUG("initialize qnn log successfully\n"); } std::vector temp_backend_config; - _qnn_interface.qnn_backend_create(_qnn_log_handle, temp_backend_config.empty() ? nullptr - : temp_backend_config.data(), - &_qnn_backend_handle); + _qnn_interface.qnn_backend_create( + _qnn_log_handle, + temp_backend_config.empty() ? 
nullptr : temp_backend_config.data(), + &_qnn_backend_handle); if (nullptr == _qnn_backend_handle) { QNN_LOG_WARN("why failed to initialize qnn backend\n"); return 5; @@ -1369,7 +1406,8 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } if (nullptr != _qnn_raw_interface.propertyHasCapability) { - auto qnnStatus = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + auto qnnStatus = + _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) { QNN_LOG_WARN("device property is not supported\n"); } @@ -1378,8 +1416,10 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } } - auto qnnStatus = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); - if (QNN_SUCCESS != qnnStatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnStatus) { + Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, + &_qnn_device_handle); + if (QNN_SUCCESS != qnn_status && + QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { QNN_LOG_WARN("failed to create QNN device\n"); } else { QNN_LOG_INFO("create device successfully\n"); @@ -1389,8 +1429,10 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); if (ggml_qnn_profile_level::profile_basic == _profile_level) { QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( - _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { + if (QNN_PROFILE_NO_ERROR != + _qnn_raw_interface.profileCreate(_qnn_backend_handle, + QNN_PROFILE_LEVEL_BASIC, + &_qnn_profile_handle)) { QNN_LOG_WARN("unable to create profile handle in the backend\n"); return 6; } else { @@ -1398,8 +1440,10 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } } else if (ggml_qnn_profile_level::profile_detail == _profile_level) { QNN_LOG_INFO("detailed profiling requested. 
Creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( - _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) { + if (QNN_PROFILE_NO_ERROR != + _qnn_raw_interface.profileCreate(_qnn_backend_handle, + QNN_PROFILE_LEVEL_DETAILED, + &_qnn_profile_handle)) { QNN_LOG_WARN("unable to create profile handle in the backend\n"); return 7; } else { @@ -1416,26 +1460,32 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { QNN_LOG_DEBUG("load rpcmem lib successfully\n"); set_rpcmem_initialized(true); } - _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); - _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); - _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); - _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_to_fd")); - if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free - || nullptr == _pfn_rpc_mem_to_fd) { + _pfn_rpc_mem_init = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast( + dlsym(_rpc_lib_handle, "rpcmem_to_fd")); + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || + nullptr == _pfn_rpc_mem_to_fd) { QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); dlclose(_rpc_lib_handle); return 9; } - if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy + if (nullptr != + _pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy _pfn_rpc_mem_init(); std::vector temp_context_config; - _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle, - temp_context_config.empty() ? nullptr - : temp_context_config.data(), - &_qnn_context_handle); + _qnn_interface.qnn_context_create( + _qnn_backend_handle, _qnn_device_handle, + temp_context_config.empty() ? 
nullptr : temp_context_config.data(), + &_qnn_context_handle); if (nullptr == _qnn_context_handle) { QNN_LOG_WARN("why failed to initialize qnn context\n"); return 10; @@ -1448,12 +1498,12 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { return 0; } - int qnn_instance::qnn_finalize() { - int ret_status = 0; - Qnn_ErrorHandle_t error = QNN_SUCCESS; + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; - if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC based low-end phone happy + if (nullptr != + _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy _pfn_rpc_mem_deinit(); if (dlclose(_rpc_lib_handle) != 0) { @@ -1463,11 +1513,12 @@ int qnn_instance::qnn_finalize() { } if (nullptr != _qnn_context_handle) { - error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + error = _qnn_interface.qnn_context_free(_qnn_context_handle, + _qnn_profile_handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } _qnn_context_handle = nullptr; } @@ -1476,8 +1527,8 @@ int qnn_instance::qnn_finalize() { error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } _qnn_profile_handle = nullptr; } @@ -1486,8 +1537,8 @@ int qnn_instance::qnn_finalize() { error = _qnn_interface.qnn_device_free(_qnn_device_handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } _qnn_device_handle = nullptr; } @@ -1496,17 +1547,18 @@ int qnn_instance::qnn_finalize() { error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } _qnn_backend_handle = nullptr; - } if (nullptr != _qnn_log_handle) { error = _qnn_interface.qnn_log_free(_qnn_log_handle); if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); } _qnn_log_handle = nullptr; } @@ -1518,9 +1570,9 @@ int qnn_instance::qnn_finalize() { return ret_status; } - -int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation, - const QnnGraph_Config_t ** graph_configs) { +int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, + uint8_t do_node_validation, + const QnnGraph_Config_t ** graph_configs) { int result = 0; if (nullptr == graph_name) { @@ -1534,15 +1586,16 @@ int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do } if (!do_node_validation) { - QNN_LOG_WARN("node validation disabled, backend will not perform op validation prior to adding node\n"); + QNN_LOG_WARN("node validation disabled, backend will not perform op " + "validation prior to adding node\n"); } - _graph_name = graph_name; - _debug_tensor = debug; + _graph_name = graph_name; + 
_debug_tensor = debug; _do_node_validations = do_node_validation; - result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, graph_configs, - &_qnn_graph_handle); + result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, + graph_configs, &_qnn_graph_handle); if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { QNN_LOG_WARN("failed to create graph in qnn context\n"); return 3; @@ -1553,13 +1606,12 @@ int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do } return 0; } - int qnn_instance::finalize_qnn_graph() { if (nullptr != _qnn_graph_handle) { - if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, _qnn_profile_handle, nullptr) != - QNN_GRAPH_NO_ERROR) { + if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, + _qnn_profile_handle, + nullptr) != QNN_GRAPH_NO_ERROR) { QNN_LOG_WARN("failed to finalize graph\n"); - //return 1; } } else { QNN_LOG_DEBUG("qnn graph handle is null\n"); @@ -1568,26 +1620,28 @@ int qnn_instance::finalize_qnn_graph() { return 0; } - - // ================================================================================================= // // implementation of GGML's QNN backend // // ================================================================================================= -static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dump_tensor_info) { - if (nullptr == tensor) - return false; - if (b_dump_tensor_info) { - QNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), - ggml_type_name(tensor->type)); - } - //only support the following 3 OPs currently and ensure tensor->src[0] and tensor->src[1] is not nullptr - bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT)); +static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, + const struct ggml_tensor *tensor, + bool b_dump_tensor_info) { + // only the following 3 OPs are supported currently + // a GENERAL approach that could fix this limitation is proposed in a standalone PR which refines the ggml backend + // subsystem for mixed inference between CPU&GPU / CPU&NPU, and works for ANY ggml backend + // whose ggml_backend_xxx_buffer_is_host returns true. + // this approach can be found at: + // https://github.com/ggerganov/llama.cpp/pull/7641 + // + // ensure tensor->src[0] and tensor->src[1] are not nullptr. 
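+ // minimal guard sketch (an addition for illustration, not in the original
+ // patch; it assumes the scheduler may offer ops whose sources are not
+ // populated): the checks below dereference both src[0] and src[1],
+ // so bail out early if either is missing
+ if ((nullptr == tensor->src[0]) || (nullptr == tensor->src[1])) {
+     return false;
+ }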
+ bool supported_op = + ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || + (tensor->op == GGML_OP_MUL_MAT)); if (!supported_op) { return false; } - const struct ggml_tensor * src0 = tensor->src[0]; const struct ggml_tensor * src1 = tensor->src[1]; @@ -1597,87 +1651,114 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - const int64_t ne0 = tensor->ne[0]; - const int64_t ne1 = tensor->ne[1]; + const int64_t ne20 = tensor->ne[0]; + const int64_t ne21 = tensor->ne[1]; - GGML_UNUSED(ne0); - GGML_UNUSED(ne1); + //TODO: support other quantized data type + if (ggml_is_quantized(src0->type) && (src0->type != GGML_TYPE_Q8_0)) { + return false; + } if (b_dump_tensor_info) { - QNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); - QNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); - if (tensor->op == GGML_OP_MUL_MAT) { - QNN_LOG_DEBUG("GGML_OP_MUL_MAT"); - QNN_LOG_DEBUG( - "src0 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG( - "src1 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG( - " %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - tensor->name, - tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], - tensor->nb[0], - tensor->nb[1], tensor->nb[2]); - - } - } - - if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) { + QNN_LOG_DEBUG("GGML_OP_MUL_MAT"); + QNN_LOG_DEBUG("op name:%s, tensor type:%s", + ggml_op_name(tensor->op), + ggml_type_name(tensor->type)); + QNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); + QNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); + QNN_LOG_DEBUG("src0 %15s: type = %i (%5s) ne = %5" PRIi64 + " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("src1 %15s: type = %i (%5s) ne = %5" PRIi64 + " x %5" PRIi64 " x %5" PRIi64 + ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG( + " %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + tensor->name, tensor->type, ggml_type_name(tensor->type), + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], + tensor->nb[1], tensor->nb[2]); + } + } + + if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || + tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || + tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) { return false; } - // make ggml_get_tensor_rank and QNN SDK happy + // make ggml_get_tensor_rank and QNN SDK happy if ((ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1)) { return false; } - // GPU/NPU inference will slower then CPU inference when 
tensor->ne[1] < min batch size - if (tensor->ne[1] < 32) { + if ((ne20 < 32) || (ne21 < 32) || (ne10 < 32)) { return false; } int qtype = src0->type; - return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || qtype == GGML_TYPE_Q8_0) - && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); - -} + if (tensor->op == GGML_OP_ADD) { + return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || + qtype == GGML_TYPE_Q8_0) && + (src1->type == GGML_TYPE_F32); + } + if (tensor->op == GGML_OP_MUL) { + return (qtype == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32); + } -static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; + if (tensor->op == GGML_OP_MUL_MAT) { + if (ctx->device == QNN_BACKEND_GGML) { + return (ne00 == ne10) && (src1->ne[2] % src0->ne[2] == 0) && + (src1->ne[3] % src0->ne[3] == 0); + } + if ((ctx->device == QNN_BACKEND_NPU) && (qtype == GGML_TYPE_Q8_0) && + (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32)) { + return true; + } + if (ctx->device == QNN_BACKEND_CPU || ctx->device == QNN_BACKEND_GPU) { + return (ne00 == ne10) && (ne00 == ne01); + } + return false; + } +} - qnn_instance * instance = nullptr; +static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; - std::string graph_name = "ggml_op_qnn_add"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + qnn_instance * instance = nullptr; + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { QNN_LOG_WARN("pls check why GGML tensor is null"); return; } - tensor_0 = (Qnn_Tensor_t *)src0->extra; - tensor_1 = (Qnn_Tensor_t *)src1->extra; - tensor_2 = (Qnn_Tensor_t *)dst->extra; - if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + tensor_0 = (Qnn_Tensor_t *) src0->extra; + tensor_1 = (Qnn_Tensor_t *) src1->extra; + tensor_2 = (Qnn_Tensor_t *) dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || + (nullptr == tensor_2)) { QNN_LOG_WARN("pls check why QNN tensor is null"); return; } @@ -1685,53 +1766,63 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_WARN("pls check why backend ctx is null"); return; } - instance = ctx->instance; + instance = ctx->instance; if (nullptr == instance) { QNN_LOG_WARN("pls check why qnn instance is null"); return; } - QNN_INTERFACE_VER_TYPE 
qnn_raw_interface = ctx->raw_interface; - - n_begin_time = ggml_time_us(); - - QNN_LOG_DEBUG("call %s\n", __func__); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + n_begin_time = ggml_time_us(); + + if (0) { + QNN_LOG_DEBUG("call %s\n", __func__); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), + dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3]); + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); + } QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + uint32_t dimensions_input_0[] = { + (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], 
(uint32_t) src0->ne[2], + (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = { + (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], + (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = { + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != + instance->_qnn_graph_map.end()) { graph_initialized = true; auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); + graph_handle = std::get<0>(graph_item); } uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; @@ -1739,15 +1830,16 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; if (!graph_initialized) { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + graph_name = graph_name + "_" + std::to_string(ctx->threads) + + src0->name + "_" + src1->name; QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - //QnnGraph_Config_t graph_config; - //graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - //graph_config.customConfig = strdup(graph_name.c_str()); - //const QnnGraph_Config_t * p_graph_config = &graph_config; - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " + "error = %d\n", + graph_name.c_str(), error); return; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); @@ -1763,40 +1855,31 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("error = %d\n", error); } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + ggml_get_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 - }; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, .v1 = { - "ggml_op_add", - 
QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_ELEMENT_WISE_ADD, - 0, - qnn_params, - 2, - tensor_inputs, - 1, - tensor_outputs - } - }; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, + .v1 = {"ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_ELEMENT_WISE_ADD, 0, qnn_params, + 2, tensor_inputs, 1, + tensor_outputs}}; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); @@ -1805,49 +1888,57 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + error = + qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); - tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); - - //QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + + // QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], + // src0->ne[2], src0->ne[3]); + uint32_t dimensions_input_0[] = { + (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = { + (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = { + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3]}; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + ggml_get_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; 
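+ // note: the cached Qnn_Tensor_t objects are rebound on every call
+ // because the ggml tensors backing them (data pointers and shapes)
+ // may differ between graph executions; the dimension pointers saved
+ // above are restored after graphExecute() returns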
QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 - }; - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + error = + qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } @@ -1855,52 +1946,54 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; QNN_LOG_DEBUG("duration of ggml_qnn_add : %lld milliseconds\n", n_duration); } - - /* * ggml_qnn_mul_mat was re-added as a standalone function because * the following comments came from https://github.com/ggerganov/llama.cpp/pull/1632 - * MUL_MAT take most of the compute time (about 95%). So to speed up llama, we have to focus on MUL_MAT. + * MUL_MAT takes most of the compute time (about 95%). + * So to speed up llama, we have to focus on MUL_MAT. + * * We have three kinds of MUL_MAT to compute: - * mul_mat_f32: both src0 and src1 are F32. + * mul_mat_f32: both src0 and src1 are F32. * mul_mat_f16_f32: src0 is F16 and src1 is F32. - * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. -*/ -static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - - qnn_instance * instance = nullptr; - - std::string graph_name = "ggml_op_qnn_mul_mat"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - - Qnn_Param_t qnn_params[] = {}; - - enum ggml_op ggmlop = GGML_OP_ADD; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...), and src1 is F32. 
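+ * Note (added for clarity, not part of the PR #1632 quote): in this backend
+ * all three cases are currently lowered to a single QNN_OP_MAT_MUL node,
+ * with the QNN tensor data types derived from the ggml tensor types via
+ * qnn_datatype_from_ggml_datatype().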
+ */ +static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; + + qnn_instance * instance = nullptr; + std::string graph_name = "ggml_op_qnn_mul_mat"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + + Qnn_Param_t qnn_params[] = {}; + + enum ggml_op ggmlop = GGML_OP_MUL_MAT; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { QNN_LOG_WARN("pls check why GGML tensor is null"); return; } - tensor_0 = (Qnn_Tensor_t *)src0->extra; - tensor_1 = (Qnn_Tensor_t *)src1->extra; - tensor_2 = (Qnn_Tensor_t *)dst->extra; - if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + tensor_0 = (Qnn_Tensor_t *) src0->extra; + tensor_1 = (Qnn_Tensor_t *) src1->extra; + tensor_2 = (Qnn_Tensor_t *) dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || + (nullptr == tensor_2)) { QNN_LOG_WARN("pls check why QNN tensor is null"); return; } @@ -1908,28 +2001,31 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * QNN_LOG_WARN("pls check why backend ctx is null"); return; } - instance = ctx->instance; + instance = ctx->instance; if (nullptr == instance) { QNN_LOG_WARN("pls check why qnn instance is null"); return; } - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - n_begin_time = ggml_time_us(); + n_begin_time = ggml_time_us(); QNN_LOG_DEBUG("call %s\n", __func__); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], + dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, 
%d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3]); QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); @@ -1938,22 +2034,26 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + uint32_t dimensions_input_0[] = { + (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], + (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = { + (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], + (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = { + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != + instance->_qnn_graph_map.end()) { graph_initialized = true; auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); + graph_handle = std::get<0>(graph_item); } uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; @@ -1961,11 +2061,16 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; if (!graph_initialized) { - graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; + graph_name = graph_name + "_" + std::to_string(ctx->threads) + + src0->name + "_" + src1->name; QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " + "error = %d\n", + graph_name.c_str(), error); return; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); @@ -1981,40 +2086,30 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * QNN_LOG_INFO("error = %d\n", error); } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - 
QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + ggml_get_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 - }; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, .v1 = { - "ggml_op_mul_mat", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, - 0, - qnn_params, - 2, - tensor_inputs, - 1, - tensor_outputs - } - }; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1, + .v1 = {"ggml_op_mul_mat", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, 0, qnn_params, 2, + tensor_inputs, 1, tensor_outputs}}; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); @@ -2023,48 +2118,56 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { - auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); - tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); + auto & graph_item= instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + uint32_t 
dimensions_input_0[] = { + (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = { + (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = { + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 - }; - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + error = + qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } @@ -2072,45 +2175,48 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, const ggml_tensor * QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ggml_qnn_mul_mat : %lld milliseconds\n", n_duration); + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ggml_qnn_mul_mat : %lld milliseconds\n", + n_duration); QNN_LOG_DEBUG("call %s done\n", __func__); } - -//common function for GGML OPs using QNN API -static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_op ggmlop, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - - qnn_instance * instance = nullptr; - - std::string qnn_graph_name = "ggml_qnn_graph"; - std::string qnn_op_config_name = "ggml_qnn_op_config"; - const char * qnn_op_name = nullptr; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * 
tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - - Qnn_Param_t qnn_params[] = {}; - - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; +// common function for GGML OPs using QNN API +static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, + const enum ggml_op ggmlop, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; + + qnn_instance * instance = nullptr; + std::string qnn_graph_name = "ggml_qnn_graph"; + std::string qnn_op_config_name = "ggml_qnn_op_config"; + const char * qnn_op_name = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + + Qnn_Param_t qnn_params[] = {}; + + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { QNN_LOG_WARN("pls check why GGML tensor is null"); return; } - tensor_0 = (Qnn_Tensor_t *)src0->extra; - tensor_1 = (Qnn_Tensor_t *)src1->extra; - tensor_2 = (Qnn_Tensor_t *)dst->extra; - if ((nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + tensor_0 = (Qnn_Tensor_t *) src0->extra; + tensor_1 = (Qnn_Tensor_t *) src1->extra; + tensor_2 = (Qnn_Tensor_t *) dst->extra; + if ((nullptr == tensor_0) || (nullptr == tensor_1) || + (nullptr == tensor_2)) { QNN_LOG_WARN("pls check why QNN tensor is null"); return; } @@ -2118,58 +2224,66 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_o QNN_LOG_WARN("pls check why backend ctx is null"); return; } - instance = ctx->instance; + instance = ctx->instance; if (nullptr == instance) { QNN_LOG_WARN("pls check why qnn instance is null"); return; } - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - qnn_op_name = qnn_opname_from_ggmlop(ggmlop); + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + qnn_op_name = qnn_opname_from_ggmlop(ggmlop); if (nullptr == qnn_op_name) { - QNN_LOG_WARN("pls check why can not get QNN OP name with ggml op %d(%s)", ggmlop, ggml_op_name(ggmlop)); + QNN_LOG_WARN( + "pls check why can not get QNN OP name with ggml op %d(%s)", ggmlop, + ggml_op_name(ggmlop)); return; } - n_begin_time = ggml_time_us(); + n_begin_time = ggml_time_us(); QNN_LOG_DEBUG("call %s\n", __func__); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], 
src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], + dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3]); QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); - QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - - - std::string map_entry = std::string(ggml_op_name(ggmlop)); - if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + uint32_t dimensions_input_0[] = { + (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], + (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = { + (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], + (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = { + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3]}; + + std::string map_entry = std::string(ggml_op_name(ggmlop)); + if (instance->_qnn_graph_map.find(map_entry) != + instance->_qnn_graph_map.end()) { graph_initialized = true; auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); + graph_handle = std::get<0>(graph_item); } uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; @@ -2177,13 +2291,21 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_o uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; if (!graph_initialized) { - qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) + std::to_string(ctx->threads) + src0->name + "_" + src1->name; - qnn_op_config_name = qnn_op_config_name + "_" + ggml_op_name(ggmlop) + std::to_string(ctx->threads) + 
src0->name + "_" + src1->name; + qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) + + std::to_string(ctx->threads) + src0->name + "_" + + src1->name; + qnn_op_config_name = qnn_op_config_name + "_" + ggml_op_name(ggmlop) + + std::to_string(ctx->threads) + src0->name + "_" + + src1->name; QNN_LOG_DEBUG("qnn graph name %s", qnn_graph_name.c_str()); QNN_LOG_DEBUG("qnn op_config name %s", qnn_op_config_name.c_str()); - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr, &graph_handle); + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr, + &graph_handle); if (QNN_SUCCESS != error) { - QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph name %s, error = %d\n", ggml_op_name(ggmlop), qnn_graph_name.c_str(), error); + QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph " + "name %s, error = %d\n", + ggml_op_name(ggmlop), qnn_graph_name.c_str(), error); return; } @@ -2200,40 +2322,30 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_o QNN_LOG_INFO("error = %d\n", error); } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + ggml_get_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 - }; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, .v1 = { - qnn_op_config_name.c_str(), - QNN_OP_PACKAGE_NAME_QTI_AISW, - qnn_op_name, - 0, - qnn_params, - 2, - tensor_inputs, - 1, - tensor_outputs - } - }; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1, + .v1 = {qnn_op_config_name.c_str(), + QNN_OP_PACKAGE_NAME_QTI_AISW, + qnn_op_name, 0, qnn_params, 2, + tensor_inputs, 1, tensor_outputs}}; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); @@ -2242,48 +2354,56 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_o if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 
1, nullptr, nullptr); + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); } - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); - tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + uint32_t dimensions_input_0[] = { + (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = { + (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = { + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 - }; - error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; + Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; + error = + qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = 
%d\n", error); } @@ -2291,381 +2411,310 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, const enum ggml_o QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ggml_qnn_%s : %lld milliseconds\n", ggml_op_name(ggmlop), n_duration); + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; + QNN_LOG_DEBUG("duration of ggml_qnn_%s : %lld milliseconds\n", + ggml_op_name(ggmlop), n_duration); QNN_LOG_DEBUG("call %s done\n", __func__); } - -static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_acc(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_acc(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_div(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_div(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_gelu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_gelu(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_silu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_silu(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_gelu_quick(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_gelu_quick(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_tanh(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_tanh(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const 
ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_relu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_relu(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_hardsigmoid(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_hardswish(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_hardswish(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_norm(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_norm(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_concat(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_concat(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_upscale(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_upscale(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_pad(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); 
+static void ggml_qnn_pad(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_dup(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_dup(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { ggml_qnn_cpy(ctx, src0, dst, nullptr); (void) src1; } - static void ggml_qnn_mul_mat_id(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, - ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_scale(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_scale(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_diag_mask_inf(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { } - -static void ggml_qnn_soft_max(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_soft_max(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_rope(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_rope(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); - } - -static void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static 
void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { } - -static void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); } - -static void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); } - -static void ggml_qnn_nop(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - (void) src0; - (void) src1; - (void) dst; - QNN_LOG_DEBUG("call %s\n", __func__); - - QNN_LOG_DEBUG("call %s done\n", __func__); +static void ggml_qnn_nop(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + (void)src0; + (void)src1; + (void)dst; } - -bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_compute_params * params, struct ggml_tensor * tensor) { - ggml_qnn_func_t func = nullptr; - ggml_qnn_func_common_t func_common = nullptr; +bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, + struct ggml_compute_params * params, + struct ggml_tensor * tensor) { + ggml_qnn_func_t func = nullptr; + ggml_qnn_func_common_t func_common = nullptr; switch (tensor->op) { - case GGML_OP_ADD: - func = ggml_qnn_add; + case GGML_OP_ADD: + func = ggml_qnn_add; + break; + + case GGML_OP_MUL: + func_common = ggml_qnn_hanlde_op; + break; + + case GGML_OP_MUL_MAT: + func = ggml_qnn_mul_mat; + break; + + case GGML_OP_REPEAT: + func = ggml_qnn_repeat; + break; + case GGML_OP_GET_ROWS: + func = ggml_qnn_get_rows; + break; + case GGML_OP_DUP: + func = ggml_qnn_dup; + break; + + case GGML_OP_ACC: + func = ggml_qnn_acc; + break; + + case GGML_OP_DIV: + func = ggml_qnn_div; + break; + + case GGML_OP_UNARY: + switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_GELU: + func = ggml_qnn_gelu; break; - - case GGML_OP_MUL: - func_common = ggml_qnn_hanlde_op; + case GGML_UNARY_OP_SILU: + func = ggml_qnn_silu; break; - - case GGML_OP_MUL_MAT: - func = ggml_qnn_mul_mat; + case GGML_UNARY_OP_GELU_QUICK: + func = ggml_qnn_gelu_quick; break; - - case GGML_OP_REPEAT: - func = ggml_qnn_repeat; + case GGML_UNARY_OP_TANH: + func = ggml_qnn_tanh; break; - case GGML_OP_GET_ROWS: - func = ggml_qnn_get_rows; + case GGML_UNARY_OP_RELU: + func = ggml_qnn_relu; break; - case GGML_OP_DUP: - func = ggml_qnn_dup; + case GGML_UNARY_OP_HARDSIGMOID: + func = ggml_qnn_hardsigmoid; break; - - case GGML_OP_ACC: - func = ggml_qnn_acc; - break; - - case GGML_OP_DIV: - func = ggml_qnn_div; - break; - - case GGML_OP_UNARY: - switch 
(ggml_get_unary_op(tensor)) { - case GGML_UNARY_OP_GELU: - func = ggml_qnn_gelu; - break; - case GGML_UNARY_OP_SILU: - func = ggml_qnn_silu; - break; - case GGML_UNARY_OP_GELU_QUICK: - func = ggml_qnn_gelu_quick; - break; - case GGML_UNARY_OP_TANH: - func = ggml_qnn_tanh; - break; - case GGML_UNARY_OP_RELU: - func = ggml_qnn_relu; - break; - case GGML_UNARY_OP_HARDSIGMOID: - func = ggml_qnn_hardsigmoid; - break; - case GGML_UNARY_OP_HARDSWISH: - func = ggml_qnn_hardswish; - break; - default: - return false; - } - break; - case GGML_OP_NORM: - func = ggml_qnn_norm; - break; - case GGML_OP_GROUP_NORM: - func = ggml_qnn_group_norm; - break; - case GGML_OP_CONCAT: - func = ggml_qnn_concat; - break; - case GGML_OP_UPSCALE: - func = ggml_qnn_upscale; - break; - case GGML_OP_PAD: - func = ggml_qnn_pad; - break; - case GGML_OP_LEAKY_RELU: - func = ggml_qnn_leaky_relu; - break; - case GGML_OP_RMS_NORM: - func = ggml_qnn_rms_norm; - break; - case GGML_OP_MUL_MAT_ID: - func = ggml_qnn_mul_mat_id; - break; - case GGML_OP_SCALE: - func = ggml_qnn_scale; - break; - case GGML_OP_SQR: - func = ggml_qnn_sqr; - break; - case GGML_OP_CLAMP: - func = ggml_qnn_clamp; - break; - case GGML_OP_CPY: - func = ggml_qnn_cpy; - break; - case GGML_OP_CONT: - func = ggml_qnn_dup; - break; - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - func = ggml_qnn_nop; - break; - case GGML_OP_DIAG_MASK_INF: - func = ggml_qnn_diag_mask_inf; - break; - case GGML_OP_SOFT_MAX: - func = ggml_qnn_soft_max; - break; - case GGML_OP_ROPE: - func = ggml_qnn_rope; - break; - case GGML_OP_IM2COL: - func = ggml_qnn_im2col; - break; - case GGML_OP_POOL_2D: - func = ggml_qnn_pool2d; - break; - case GGML_OP_SUM_ROWS: - func = ggml_qnn_sum_rows; - break; - case GGML_OP_ARGSORT: - func = ggml_qnn_argsort; + case GGML_UNARY_OP_HARDSWISH: + func = ggml_qnn_hardswish; break; default: return false; + } + break; + case GGML_OP_NORM: + func = ggml_qnn_norm; + break; + case GGML_OP_GROUP_NORM: + func = ggml_qnn_group_norm; + break; + case GGML_OP_CONCAT: + func = ggml_qnn_concat; + break; + case GGML_OP_UPSCALE: + func = ggml_qnn_upscale; + break; + case GGML_OP_PAD: + func = ggml_qnn_pad; + break; + case GGML_OP_LEAKY_RELU: + func = ggml_qnn_leaky_relu; + break; + case GGML_OP_RMS_NORM: + func = ggml_qnn_rms_norm; + break; + case GGML_OP_MUL_MAT_ID: + func = ggml_qnn_mul_mat_id; + break; + case GGML_OP_SCALE: + func = ggml_qnn_scale; + break; + case GGML_OP_SQR: + func = ggml_qnn_sqr; + break; + case GGML_OP_CLAMP: + func = ggml_qnn_clamp; + break; + case GGML_OP_CPY: + func = ggml_qnn_cpy; + break; + case GGML_OP_CONT: + func = ggml_qnn_dup; + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + func = ggml_qnn_nop; + break; + case GGML_OP_DIAG_MASK_INF: + func = ggml_qnn_diag_mask_inf; + break; + case GGML_OP_SOFT_MAX: + func = ggml_qnn_soft_max; + break; + case GGML_OP_ROPE: + func = ggml_qnn_rope; + break; + case GGML_OP_IM2COL: + func = ggml_qnn_im2col; + break; + case GGML_OP_POOL_2D: + func = ggml_qnn_pool2d; + break; + case GGML_OP_SUM_ROWS: + func = ggml_qnn_sum_rows; + break; + case GGML_OP_ARGSORT: + func = ggml_qnn_argsort; + break; + default: + return false; } - if (nullptr != func) - func(ctx, tensor->src[0], tensor->src[1], tensor); + if (nullptr != func) func(ctx, tensor->src[0], tensor->src[1], tensor); if (nullptr != func_common) func_common(ctx, tensor->op, tensor->src[0], tensor->src[1], tensor); 
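Note for reviewers of the dispatch above: each supported GGML op is routed to its QNN handler through the func/func_common pair, and anything unsupported returns false. A minimal host-side usage sketch follows; it is illustrative only and not part of this diff, uses the entry points added by this patch together with core ggml backend API, and assumes the default Android library path that ggml_backend_qnn_reg_init below also uses:

    // bring up the QNN NPU backend, run a graph, then release it
    ggml_backend_t backend = ggml_backend_qnn_init(QNN_BACKEND_NPU, "/data/local/tmp/");
    if (backend != nullptr) {
        ggml_backend_qnn_set_n_threads(backend, 4);
        // ... build a ggml_cgraph * cgraph, then:
        // ggml_backend_graph_compute(backend, cgraph);
        ggml_backend_free(backend);
    }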
@@ -2673,12 +2722,10 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, struct ggml_comput return true; } - struct ggml_backend_qnn_buffer_context { - ggml_backend_qnn_buffer_context(size_t device) : - device(device), - name(GGML_QNN_NAME + std::to_string(device)) { - } + ggml_backend_qnn_buffer_context(size_t device) + : device(device) + , name(GGML_QNN_NAME + std::to_string(device)) {} ~ggml_backend_qnn_buffer_context() { if (buffer) { @@ -2697,83 +2744,82 @@ struct ggml_backend_qnn_buffer_context { sub_buffers.clear(); qnn_tensors.clear(); } - void * buffer = nullptr; + void * buffer = nullptr; struct ggml_backend_qnn_context * backend_ctx = nullptr; - size_t buffer_size = 0; - std::vector<void *> sub_buffers; + size_t buffer_size = 0; + std::vector<void *> sub_buffers; std::vector<Qnn_Tensor_t *> qnn_tensors; - size_t device; - std::string name; + size_t device; + std::string name; }; - struct ggml_backend_qnn_buffer_type_context { - size_t device; + size_t device; std::string name; }; - static const char * ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { GGML_UNUSED(buffer); return "QNN"; } - GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { return buffer->iface.get_name == ggml_backend_qnn_buffer_get_name; } - GGML_CALL static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; delete ctx; } - GGML_CALL static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; return ctx->buffer; } +GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + ggml_backend_qnn_buffer_context * ctx = + (ggml_backend_qnn_buffer_context *) buffer->context; -GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - - static int idx = 0; - char tensor_name[GGML_MAX_NAME] = { 0 }; + static int idx = 0; + char tensor_name[GGML_MAX_NAME] = {0}; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); - uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; - Qnn_DataType_t qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); - Qnn_TensorType_t qnn_tensor_type= QNN_TENSOR_TYPE_APP_WRITE; + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], + (uint32_t) tensor->ne[2], + (uint32_t) tensor->ne[3]}; + Qnn_DataType_t qnn_data_type = + qnn_datatype_from_ggml_datatype(tensor->type); + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; } - Qnn_Tensor_t qnn_tensor = { - .version= QNN_TENSOR_VERSION_1, - {.v1= { - .id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = ggml_get_tensor_rank(tensor), - .dimensions = dimensions, - 
.memType = QNN_TENSORMEMTYPE_RAW, - {.clientBuf = {.data = nullptr, - .dataSize = 0}}}} - }; - Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + Qnn_Tensor_t qnn_tensor = { + .version = QNN_TENSOR_VERSION_1, + {.v1 = {.id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = + {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, + .offset = 0}}}, + .rank = ggml_get_tensor_rank(tensor), + .dimensions = dimensions, + .memType = QNN_TENSORMEMTYPE_RAW, + {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; + Qnn_Tensor_t * p_qnn_tensor = + (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); if (nullptr == p_qnn_tensor) { QNN_LOG_WARN("calloc failed"); return; @@ -2788,21 +2834,24 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t ctx->qnn_tensors.push_back(p_qnn_tensor); } - -GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor, const void * data, + size_t offset, size_t size) { GGML_UNUSED(buffer); - memcpy((char *)tensor->data + offset, data, size); + memcpy((char *) tensor->data + offset, data, size); } - -GGML_CALL static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_CALL static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * tensor, void * data, + size_t offset, size_t size) { GGML_UNUSED(buffer); - memcpy(data, (const char *)tensor->data + offset, size); + memcpy(data, (const char *) tensor->data + offset, size); } - -GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { +GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const struct ggml_tensor * src, + struct ggml_tensor * dst) { GGML_UNUSED(buffer); if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); @@ -2812,35 +2861,31 @@ GGML_CALL static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t b return false; } - GGML_CALL static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; memset(ctx->buffer, value, ctx->buffer_size); } - static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { - /* .get_name = */ ggml_backend_qnn_buffer_get_name, - /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, - /* .get_base = */ ggml_backend_qnn_buffer_get_base, - /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, - /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, - /* .clear = */ ggml_backend_qnn_buffer_clear, - /* .reset = */ nullptr, + /* .get_name = */ ggml_backend_qnn_buffer_get_name, + /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, + /* .get_base = */ ggml_backend_qnn_buffer_get_base, + /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, + /* 
.get_tensor = */ ggml_backend_qnn_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, + /* .clear = */ ggml_backend_qnn_buffer_clear, + /* .reset = */ nullptr, }; - GGML_CALL static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { return "QNN"; } - static void * ggml_qnn_host_malloc(size_t n) { void * data = nullptr; - const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); + int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); if (result != 0) { QNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); return nullptr; } @@ -2849,20 +2894,20 @@ static void * ggml_qnn_host_malloc(size_t n) { return data; } - -GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer( + ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_qnn_buffer_type_context * buft_ctx = (ggml_backend_qnn_buffer_type_context *)buft->context; ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context(buft_ctx->device); - const size_t size_page = sysconf(_SC_PAGESIZE); + size_t size_page = sysconf(_SC_PAGESIZE); size_t size_aligned = size; if ((size_aligned % size_page) != 0) { size_aligned += (size_page - (size_aligned % size_page)); } - //TODO:use pre-allocated buffer in internal memory pool - ctx->buffer = ggml_qnn_host_malloc(size_aligned); + // TODO: use pre-allocated buffer in internal memory pool + ctx->buffer = ggml_qnn_host_malloc(size_aligned); ctx->buffer_size = size_aligned; ctx->backend_ctx = &g_qnn_mgr[buft_ctx->device]; @@ -2872,53 +2917,51 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer return nullptr; } - return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); + return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); } - -GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_alignment( + ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); return 32; } - -//TODO: this value is an experimental value, works fine with whisper/llm/minicpm-v inference on Android +// TODO: this value is an experimental value, works fine with whisper/llm/minicpm-v inference on Android GGML_CALL static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); return (96 * 1024 * 1024); } - -GGML_CALL static bool ggml_backend_qnn_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, - ggml_backend_t backend) { +GGML_CALL static bool ggml_backend_qnn_buffer_type_supports_backend( + ggml_backend_buffer_type_t buft, ggml_backend_t backend) { GGML_UNUSED(buft); return ggml_backend_is_qnn(backend) || ggml_backend_is_cpu(backend); } - GGML_CALL static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); return true; } - GGML_CALL static const char * ggml_backend_qnn_name(ggml_backend_t backend) { return "QNN"; } - GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { QNN_LOG_INFO("enter %s", __func__); ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; QNN_LOG_DEBUG("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); - qnn_instance * instance = 
(qnn_instance*)g_qnn_mgr[ctx->device].instance; + qnn_instance * instance = (qnn_instance *)g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { - std::map<std::string, std::tuple<Qnn_GraphHandle_t, Qnn_Tensor_t *, Qnn_Tensor_t *, Qnn_Tensor_t *>>::iterator graph_it; - for (graph_it = instance->_qnn_graph_map.begin(); graph_it != instance->_qnn_graph_map.end(); graph_it++) { - auto & graph_item = graph_it->second; + std::map<std::string, std::tuple<Qnn_GraphHandle_t, Qnn_Tensor_t *, Qnn_Tensor_t *, Qnn_Tensor_t *>>::iterator graph_it; + for (graph_it = instance->_qnn_graph_map.begin(); + graph_it != instance->_qnn_graph_map.end(); graph_it++) { + auto & graph_item = graph_it->second; Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); GGML_UNUSED(graph_handle); QNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str()); @@ -2930,96 +2973,90 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { g_qnn_mgr[ctx->device].instance = nullptr; } - if (g_qnn_mgr[ctx->device].backend != nullptr) { + if (g_qnn_mgr[ctx->device].backend != nullptr) { delete backend; g_qnn_mgr[ctx->device].backend = nullptr; } - QNN_LOG_INFO("leave %s", __func__ ); + QNN_LOG_INFO("leave %s", __func__); } - GGML_CALL static ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; return ggml_backend_qnn_buffer_type(ctx->device); } - GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - enum ggml_status result = GGML_STATUS_SUCCESS; - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + enum ggml_status result = GGML_STATUS_SUCCESS; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; GGML_UNUSED(ctx); ggml_compute_params params = {}; - params.type = GGML_TASK_TYPE_COMPUTE; - params.ith = 0; + params.type = GGML_TASK_TYPE_COMPUTE; + params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + ggml_tensor *node = cgraph->nodes[i]; + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || + node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || + node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } bool ok = ggml_qnn_compute_forward(ctx, &params, node); if (!ok) { - QNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + QNN_LOG_DEBUG("error: op not supported %s (%s)\n", node->name, ggml_op_name(node->op)); } } return result; } +GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, + const ggml_tensor * op) { + ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *) backend->context; -GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) { - GGML_UNUSED(backend); - - return (ggml_qnn_can_handle_op(op, true)); + return (ggml_qnn_can_handle_op(ctx, op, true)); } +GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) { + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; -//note: this function be used with proposal/refined ggml backend subsystem in this PR: -// https://github.com/ggerganov/llama.cpp/pull/7641 -// new ggml backend(only using system memory: ggml_backend_xxx_buffer_is_host return true) -// can following this style for mixed inference between CPU&GPU / CPU&NPU very easily -GGML_CALL static bool 
ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) { - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - - return ggml_qnn_compute_forward(ctx, nullptr, (ggml_tensor*)tensor); + return ggml_qnn_compute_forward(ctx, nullptr, (ggml_tensor *) tensor); } - static ggml_backend_i ggml_backend_qnn_interface = { - /* .get_name = */ ggml_backend_qnn_name, - /* .free = */ ggml_backend_qnn_free, - /* .get_default_buffer_type = */ ggml_backend_qnn_get_default_buffer_type, - /* .set_tensor_async = */ nullptr, - /* .get_tensor_async = */ nullptr, - /* .cpy_tensor_async = */ nullptr, - /* .synchronize = */ nullptr, - /* .graph_plan_create = */ nullptr, - /* .graph_plan_free = */ nullptr, - /* .graph_plan_compute = */ nullptr, - /* .graph_compute = */ ggml_backend_qnn_graph_compute, - /* .supports_op = */ ggml_backend_qnn_supports_op, - /* .offload_op = */ ggml_backend_qnn_offload_op, - /* .event_new = */ nullptr, - /* .event_free = */ nullptr, - /* .event_record = */ nullptr, - /* .event_wait = */ nullptr, - /* .event_synchronize = */ nullptr, + /* .get_name = */ ggml_backend_qnn_name, + /* .free = */ ggml_backend_qnn_free, + /* .get_default_buffer_type = */ ggml_backend_qnn_get_default_buffer_type, + /* .set_tensor_async = */ nullptr, + /* .get_tensor_async = */ nullptr, + /* .cpy_tensor_async = */ nullptr, + /* .synchronize = */ nullptr, + /* .graph_plan_create = */ nullptr, + /* .graph_plan_free = */ nullptr, + /* .graph_plan_compute = */ nullptr, + /* .graph_compute = */ ggml_backend_qnn_graph_compute, + /* .supports_op = */ ggml_backend_qnn_supports_op, + /* .offload_op = */ ggml_backend_qnn_offload_op, + /* .event_new = */ nullptr, + /* .event_free = */ nullptr, + /* .event_record = */ nullptr, + /* .event_wait = */ nullptr, + /* .event_synchronize = */ nullptr, }; - static ggml_guid_t ggml_backend_qnn_guid() { - static ggml_guid guid = {0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, 0x92, 0xa3, 0xb4, 0xc5, - 0xd6, 0xe7, 0xf8, 0x09}; + static ggml_guid guid = { + 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, + 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 + }; return &guid; } - static ggml_backend_t ggml_backend_qnn_reg_init(const char * params, void * user_data) { if (nullptr == params) { - //QNN library path - //can be hardcoded to "/data/local/tmp/" for Android command line application - //or specified in JNI layer for Android APK + // QNN library path + // can be hardcoded to "/data/local/tmp/" for Android command line application + // or specified in JNI layer for Android APK params = "/data/local/tmp/"; } ggml_backend_t qnn_backend = ggml_backend_qnn_init((int) (intptr_t) user_data, params); @@ -3027,30 +3064,25 @@ static ggml_backend_t ggml_backend_qnn_reg_init(const char * params, void * user return qnn_backend; } - bool ggml_backend_is_qnn(ggml_backend_t backend) { return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); } - void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { GGML_ASSERT(ggml_backend_is_qnn(backend)); - struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *)backend->context; + struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *) backend->context; ctx->threads = n_threads; } - const char * ggml_backend_qnn_get_name(ggml_backend_t backend) { return backend->iface.get_name(backend); } - int ggml_backend_qnn_get_device_count() { return GGML_QNN_MAX_DEVICES; } - void ggml_backend_qnn_get_device_description(size_t 
dev_num, char * description, size_t description_size) { if (nullptr == description || 0 == description_size) { QNN_LOG_WARN("invalid param"); @@ -3063,14 +3095,13 @@ void ggml_backend_qnn_get_device_description(size_t dev_num, char * description, } snprintf(description, description_size, "%s", g_qnn_mgr[dev_num].name); - QNN_LOG_DEBUG("description:%s", description); } - ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { if (device >= GGML_QNN_MAX_DEVICES) { - QNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is out of range [0, %d]\n", - device, GGML_QNN_MAX_DEVICES - 1); + QNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is " + "out of range [0, %d]\n", + device, GGML_QNN_MAX_DEVICES - 1); return nullptr; } @@ -3086,11 +3117,12 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, /* .is_host = */ ggml_backend_qnn_buffer_is_host - }, - /* .context = */ new ggml_backend_qnn_buffer_type_context { device, GGML_QNN_NAME + std::to_string(device) }, + }, + /* .context = */ new ggml_backend_qnn_buffer_type_context { device, + GGML_QNN_NAME + std::to_string(device)}, }; } ggml_backend_qnn_buffer_type_initialized = true; @@ -3099,7 +3131,6 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { return &ggml_backend_qnn_buffer_types[device]; } - /** * * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU @@ -3124,8 +3155,9 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { std::string path = qnn_lib_path; if (QNN_BACKEND_NPU == device) { if (0 == setenv("LD_LIBRARY_PATH", - (path + - ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/" + "dsp:/vendor/dsp/images") + .c_str(), 1)) { QNN_LOG_INFO("QNN NPU backend setenv successfully"); } else { @@ -3133,31 +3165,35 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { } if (0 == setenv("ADSP_LIBRARY_PATH", (path + - ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/" + "rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp") + .c_str(), 1)) { QNN_LOG_INFO("QNN NPU backend setenv successfully"); } else { QNN_LOG_ERROR("QNN NPU backend setenv failure"); } } else { - if (0 == setenv("LD_LIBRARY_PATH", - path.c_str(), - 1)) { - QNN_LOG_INFO("%s backend setenv successfully\n", get_qnn_backend_name(device)); + if (0 == setenv("LD_LIBRARY_PATH", path.c_str(), 1)) { + QNN_LOG_INFO("%s backend setenv successfully\n", + get_qnn_backend_name(device)); } else { - QNN_LOG_ERROR("%s backend setenv failure\n", get_qnn_backend_name(device)); + QNN_LOG_ERROR("%s backend setenv failure\n", + get_qnn_backend_name(device)); } } qnn_instance * instance = nullptr; instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); - result = instance->qnn_init(nullptr); + result = instance->qnn_init(nullptr); if (0 != result) { - QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", 
get_qnn_backend_name(device)); + QNN_LOG_WARN( + "init qnn subsystem failed with qnn backend %s, please check why\n", + get_qnn_backend_name(device)); delete instance; return nullptr; } - qnn_interface qnn_interface = instance->get_qnn_interface(); + qnn_interface qnn_interface = instance->get_qnn_interface(); if (!qnn_interface.is_loaded()) { QNN_LOG_WARN("qnn subsystem failure\n"); delete instance; @@ -3167,29 +3203,28 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { std::string device_name = get_qnn_backend_name(device); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); instance->init_qnn_graph(device_name.c_str(), false); - g_qnn_mgr[device].instance = instance; - g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); - g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); + g_qnn_mgr[device].instance = instance; + g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); + g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); - ggml_backend_t qnn_backend = new ggml_backend{ - /* .guid = */ ggml_backend_qnn_guid(), - /* .iface = */ ggml_backend_qnn_interface, - /* .context = */ &g_qnn_mgr[device] - }; - g_qnn_mgr[device].backend = qnn_backend; + ggml_backend_t qnn_backend = + new ggml_backend{/* .guid = */ ggml_backend_qnn_guid(), + /* .iface = */ ggml_backend_qnn_interface, + /* .context = */ &g_qnn_mgr[device]}; + g_qnn_mgr[device].backend = qnn_backend; return qnn_backend; } - extern "C" GGML_CALL int ggml_backend_qnn_reg_devices(void); GGML_CALL int ggml_backend_qnn_reg_devices() { for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) { char name[GGML_MAX_NAME]; ggml_backend_qnn_get_device_description(idx, name, GGML_MAX_NAME); - ggml_backend_register(name, ggml_backend_qnn_reg_init, ggml_backend_qnn_buffer_type(idx), - (void *) (intptr_t)idx); + ggml_backend_register(name, ggml_backend_qnn_reg_init, + ggml_backend_qnn_buffer_type(idx), + (void *) (intptr_t) idx); } return GGML_QNN_MAX_DEVICES; From 3e8b61f9702a702bfe14478bdc4eb466038643dd Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Sun, 9 Jun 2024 09:06:44 +0800 Subject: [PATCH 11/16] review: fix a memory leak introduced by a review modification, which is explained in https://github.com/zhouwg/llama.cpp/pull/1 --- ggml-qnn.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 3c5ff332a1df2..d1d69afe2eef5 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -2771,6 +2771,7 @@ GGML_CALL static bool ggml_backend_buffer_is_qnn(ggml_backend_buffer_t buffer) { GGML_CALL static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + delete ctx; } @@ -3105,12 +3106,14 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { return nullptr; } + // ref: https://github.com/zhouwg/llama.cpp/pull/1 + static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; - static bool ggml_backend_qnn_buffer_type_initialized = false; - if (!ggml_backend_qnn_buffer_type_initialized) { - for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) { + for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { + auto & context = ggml_backend_qnn_buffer_type_contexts[i]; + context = { i, std::string(GGML_QNN_NAME) + std::to_string(i) };
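+            // per-device contexts now live in this function-local static array and are
+            // reused on every call instead of being heap-allocated each time, so the
+            // buffer-type contexts are no longer leaked (see the ref above)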
ggml_backend_qnn_buffer_types[i] = { /* .iface = */ { /* .get_name = */ ggml_backend_qnn_buffer_type_name, @@ -3121,8 +3124,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { /* .supports_backend = */ ggml_backend_qnn_buffer_type_supports_backend, /* .is_host = */ ggml_backend_qnn_buffer_is_host }, - /* .context = */ new ggml_backend_qnn_buffer_type_context { device, - GGML_QNN_NAME + std::to_string(device)}, + /* .context = */ & context, }; } ggml_backend_qnn_buffer_type_initialized = true; From d38d4a67d17570d3b3003397a50f873f5e143603 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Sun, 9 Jun 2024 23:49:54 +0800 Subject: [PATCH 12/16] npu: probe htp info and capacity of rpc ion memory --- ggml-qnn.cpp | 123 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 115 insertions(+), 8 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index d1d69afe2eef5..3248e244a31c2 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -152,6 +152,28 @@ enum class ggml_qnn_profile_level { profile_detail = 2 }; +enum qcom_htp_arch { + NONE = 0, + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, +}; + +enum qcom_chipset { + UNKNOWN_SM = 0, + SM8450 = 36, // v69 + SM8475 = 42, // v69 + SM8550 = 43, // v73 + SM8650 = 57, // v75 +}; + +struct qcom_socinfo { + int soc_model; + int htp_arch; + int vtcm_size_in_mb; +}; + struct ggml_backend_qnn_context { int device; int threads; @@ -216,6 +238,29 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .raw_system_interface = {}}, }; +static struct qcom_socinfo g_qnn_soc_info_table[] = { + /* Qualcomm SnapDragon 8 Gen 1 */ + [SM8450] = {.soc_model = SM8450, + .htp_arch = V69, + .vtcm_size_in_mb = 8}, + + /* Qualcomm SnapDragon 8 Gen 1+ */ + [SM8475] = {.soc_model = SM8475, + .htp_arch = V69, + .vtcm_size_in_mb = 8}, + + /* Qualcomm SnapDragon 8 Gen 2 */ + [SM8550] = {.soc_model = SM8550, + .htp_arch = V73, + .vtcm_size_in_mb = 8}, + + /* Qualcomm SnapDragon 8 Gen 3 */ + [SM8650] = {.soc_model = SM8650, + .htp_arch = V75, + .vtcm_size_in_mb = 8}, + +}; + // ================================================================================================= // // QNN helper functions and other internal helper functions @@ -485,6 +530,8 @@ static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { return QNN_DATATYPE_INT_8; case GGML_TYPE_Q8_0: return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_SFIXED_POINT_4; default: break; } @@ -527,19 +574,34 @@ Fn load_qnn_functionpointers(void * handle, const char * function_name) { static const char * get_qnn_backend_name(int n_backend_type) { switch (n_backend_type) { - case 0: + case QNN_BACKEND_CPU: return "QNN-CPU"; - case 1: + case QNN_BACKEND_GPU: return "QNN-GPU"; - case 2: + case QNN_BACKEND_NPU: return "QNN-NPU"; - case 3: + case QNN_BACKEND_GGML: return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML default: return "unknown"; } } +static const char * qnn_get_chipset_desc(uint32_t chipset_id) { + switch (chipset_id) { + case SM8450: + return "SM8450"; + case SM8475: + return "SM8475"; + case SM8550: + return "SM8550"; + case SM8650: + return "SM8650"; + default: + return "unknown"; + } +} + static intptr_t align_to(size_t alignment, intptr_t offset) { return offset % alignment == 0 ? 
offset @@ -875,7 +937,7 @@ class qnn_instance { return 0; } - std::string &get_qnn_graph_name() { return _graph_name; } + std::string & get_qnn_graph_name() { return _graph_name; } bool is_rpcmem_initialized() { return _rpcmem_initialized; } @@ -893,6 +955,8 @@ void free_rpcmem(void * buf); + size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + bool is_rpcmem_allocated(void * buf); bool is_rpcmem_registered(Qnn_MemHandle_t handle) { @@ -977,6 +1041,7 @@ pfn_rpc_mem_init _pfn_rpc_mem_init; pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; std::unordered_map<void *, void *> _rpcmem_store_map; + size_t _rpcmem_capacity = 512; std::string _graph_name; }; @@ -1493,6 +1558,46 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { QNN_LOG_DEBUG("initialize qnn context successfully\n"); } + if (_backend_name.find("Htp") != std::string::npos) { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, + infos[i].v1.deviceType, infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t chiparch = chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); + QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d, vtcm_size_in_mb:%d MB", chipinfo.socModel, + qnn_get_chipset_desc(chipinfo.socModel), chiparch, chipinfo.vtcmSize); + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + + + //TODO: faster approach to probe the accurate capacity of rpc ion memory + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + const int SIZE_IN_MB = (1 << 20); + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast<uint8_t *>(alloc_rpcmem(probe_slots[idx] * SIZE_IN_MB, 4)); + if (nullptr == rpc_buffer) { + QNN_LOG_INFO("alloc rpcmem %zu (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + if (candidate_size > _rpcmem_capacity) + _rpcmem_capacity = candidate_size; + QNN_LOG_INFO("capacity of rpc ion memory %zu MB\n", _rpcmem_capacity); + } + QNN_LOG_DEBUG("leave qni_init\n"); return 0; @@ -1654,9 +1759,11 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, const int64_t ne20 = tensor->ne[0]; const int64_t ne21 = tensor->ne[1]; - //TODO: support other quatinized data type - if (ggml_is_quantized(src0->type) && (src0->type != GGML_TYPE_Q8_0)) { - return false; + //TODO: support other quantized data type + if (ggml_is_quantized(src0->type)) { + if ((src0->type != GGML_TYPE_Q8_0) && (src0->type != GGML_TYPE_Q4_0)) { + return false; + } } if (b_dump_tensor_info) { From 5f8cfe4a1eecab1504dea1451f7d4b4e7983d7b9 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Mon, 10 Jun 2024 20:07:26 +0800 Subject: [PATCH 13/16] ggml-qnn: refine source code of ggml-qnn.cpp to make reviewers happier --- ggml-qnn.cpp | 2654 +++++++++++++++++++++++++------------------------- 1 file changed, 
1327 insertions(+), 1327 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 3248e244a31c2..43a8fcd3ea8cb 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -32,8 +33,17 @@ #include #include #include -#include +#if (defined __ANDROID__) || (defined ANDROID) +#include +#endif + +#include "ggml-qnn.h" + +#include "ggml-backend-impl.h" + +// header file of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct #include "QnnTypes.h" #include "QnnCommon.h" #include "QnnContext.h" @@ -46,14 +56,6 @@ #include "System/QnnSystemInterface.h" #include "HTP/QnnHtpDevice.h" -#include "ggml-qnn.h" - -#include "ggml-backend-impl.h" - -#if (defined __ANDROID__) || (defined ANDROID) -#include -#endif - // ================================================================================================= // // forward declaration @@ -61,96 +63,31 @@ // ================================================================================================= class qnn_instance; -static void ggml_qnn_log_internal(ggml_log_level level, const char * file, - const char * func, int line, - const char * format, ...); +struct ggml_backend_qnn_context; + +static int free_qnn_tensor(Qnn_Tensor_t & tensor); // ================================================================================================= // // self-defined macro / data structure // // ================================================================================================= -#define RPCMEM_DEFAULT_FLAGS 1 -#define RPCMEM_HEAP_ID_SYSTEM 25 - -#define GGML_QNN_DEBUG 1 // for troubleshooting QNN backend -#define ENABLE_QNN_LOG 0 // enable/disable QNN internal log -#define GGML_QNN_LOGBUF_LEN 4096 -#define QNN_VER_PTR(x) (&((x).v1)) -#define GGML_QNN_NAME "qnn" - -#define QNN_LOG_ERROR(...) \ - ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#define QNN_LOG_WARN(...) \ - ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend +#define ENABLE_QNNSDK_LOG 0 // enable/disable QNN SDK's internal log +#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info +#define QNN_LOGBUF_LEN 4096 +#define QNN_BACKEND_NAME "qnn" -#define QNN_LOG_INFO(...) \ - ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#if GGML_QNN_DEBUG -#define QNN_LOG_DEBUG(...) \ - ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#else -#define QNN_LOG_DEBUG(...) 
-#endif - -#define VALIDATE(value, status) \ - do { \ - status = value; \ - if (status != QNN_SUCCESS) { \ - QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ - return status; \ - } \ - } while (0) +typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst); -#define VALIDATE_TENSOR_VERSION(tensor, err) \ - VALIDATE(validate_tensor_version(tensor), err) - -#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) -#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) -#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) -#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) -#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) -#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) -#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) -#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) -#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) - -#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) -#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) -#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) -#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) \ - set_qnn_tensor_dataformat(tensor, value) -#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) \ - set_qnn_tensor_datatype(tensor, value) -#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) \ - set_qnn_tensor_quantparams(tensor, value) -#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) -#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) \ - set_qnn_tensor_dimensions(tensor, value) -#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) \ - set_qnn_tensor_memtype(tensor, value) -#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) \ - set_qnn_tensor_clientbuf(tensor, value) -#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) \ - set_qnn_tensor_memhandle(tensor, value) - -using pfn_rpc_mem_init = void (*)(void); -using pfn_rpc_mem_deinit = void (*)(void); -using pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); -using pfn_rpc_mem_free = void (*)(void *); -using pfn_rpc_mem_to_fd = int (*)(void *); - -using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); -using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); -using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); - -enum class ggml_qnn_profile_level { - profile_off = 0, - profile_basic = 1, - profile_detail = 2 -}; +typedef void (*ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx, + const ggml_op ggml_op, + const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst); enum qcom_htp_arch { NONE = 0, @@ -169,9 +106,36 @@ enum qcom_chipset { }; struct qcom_socinfo { - int soc_model; - int htp_arch; - int vtcm_size_in_mb; + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; +}; + +static struct qcom_socinfo g_qnn_soc_info_table[] = { + /* Qualcomm SnapDragon 8 Gen 1 */ + [SM8450] = { + .soc_model = SM8450, + .htp_arch = V69, + .vtcm_size_in_mb = 8}, + + /* Qualcomm SnapDragon 8 Gen 1+ */ + [SM8475] = { + .soc_model = SM8475, + .htp_arch = V69, + .vtcm_size_in_mb = 8}, + + /* Qualcomm SnapDragon 8 Gen 2 */ + [SM8550] = { + .soc_model = SM8550, + .htp_arch = V73, + .vtcm_size_in_mb = 8}, + + /* Qualcomm SnapDragon 8 Gen 3 */ + [SM8650] = { + .soc_model = SM8650, + .htp_arch = V75, + .vtcm_size_in_mb = 
8}, + }; struct ggml_backend_qnn_context { @@ -183,19 +147,9 @@ struct ggml_backend_qnn_context { struct ggml_backend * backend; QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; + struct qcom_socinfo socinfo; }; -typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, - ggml_tensor * dst); - -typedef void (*ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx, - const ggml_op ggml_op, - const ggml_tensor * src0, - const ggml_tensor * src1, - ggml_tensor * dst); - // according to the QNN SDK Reference Guide, // CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend // GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend @@ -217,7 +171,8 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .instance = nullptr, .backend = nullptr, .raw_interface = {}, - .raw_system_interface = {}}, + .raw_system_interface = {}, + .socinfo = {}}, [QNN_BACKEND_GPU] = {.device = 1, .threads = 1, @@ -226,7 +181,8 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .instance = nullptr, .backend = nullptr, .raw_interface = {}, - .raw_system_interface = {}}, + .raw_system_interface = {}, + .socinfo = {}}, [QNN_BACKEND_NPU] = {.device = 2, .threads = 1, @@ -235,128 +191,425 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .instance = nullptr, .backend = nullptr, .raw_interface = {}, - .raw_system_interface = {}}, + .raw_system_interface = {}, + .socinfo = {}}, }; -static struct qcom_socinfo g_qnn_soc_info_table[] = { - /* Qualcomm SnapDragon 8 Gen 1 */ - [SM8450] = {.soc_model = SM8450, - .htp_arch = V69, - .vtcm_size_in_mb = 8}, +struct ggml_backend_qnn_buffer_context { + ggml_backend_qnn_buffer_context(size_t device) + : device(device) + , name(QNN_BACKEND_NAME + std::to_string(device)) {} - /* Qualcomm SnapDragon 8 Gen 1+ */ - [SM8475] = {.soc_model = SM8475, - .htp_arch = V69, - .vtcm_size_in_mb = 8}, + ~ggml_backend_qnn_buffer_context() { + if (buffer) { + free(buffer); + } - /* Qualcomm SnapDragon 8 Gen 2 */ - [SM8550] = {.soc_model = SM8550, - .htp_arch = V73, - .vtcm_size_in_mb = 8}, + for (auto * sub_buffer : sub_buffers) { + free(sub_buffer); + } - /* Qualcomm SnapDragon 8 Gen 3 */ - [SM8650] = {.soc_model = SM8650, - .htp_arch = V75, - .vtcm_size_in_mb = 8}, + for (auto * qnn_tensor : qnn_tensors) { + free_qnn_tensor(*qnn_tensor); + free(qnn_tensor); + } + + sub_buffers.clear(); + qnn_tensors.clear(); + } + void * buffer = nullptr; + + struct ggml_backend_qnn_context * backend_ctx = nullptr; + + size_t buffer_size = 0; + std::vector sub_buffers; + std::vector qnn_tensors; + size_t device; + std::string name; +}; +struct ggml_backend_qnn_buffer_type_context { + size_t device; + std::string name; }; // ================================================================================================= // -// QNN helper functions and other internal helper functions +// QNN backend internal log function // // ================================================================================================= -static inline int validate_tensor_version(Qnn_Tensor_t tensor) { - if (tensor.version != QNN_TENSOR_VERSION_1) { - QNN_LOG_WARN( - "validate_tensor_version() tensor %s, got unsupported version %d\n", - tensor.v1.name, tensor.version); - return 1; +static void qnn_internal_log(ggml_log_level level, const char * file, + const char * func, 
int line, + const char * format, ...); +#define QNN_LOG_ERROR(...) \ + qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#define QNN_LOG_WARN(...) \ + qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#define QNN_LOG_INFO(...) \ + qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if ENABLE_QNNBACKEND_DEBUG +#define QNN_LOG_DEBUG(...) \ + qnn_internal_log(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define QNN_LOG_DEBUG(...) +#endif + +// ================================================================================================= +// +// QNN backend internal helper functions +// +// ================================================================================================= +static uint32_t qnn_get_ggml_tensor_rank(const ggml_tensor * tensor) { + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; + } } - return 0; + return rank; } -static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.id; +// TODO: mapping more ggml data type to QNN data type +// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { + switch (ggmltype) { + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_SFIXED_POINT_4; + default: + break; } - - return 0u; + return QNN_DATATYPE_UNDEFINED; } -static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.name; +// TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT +static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL: + return QNN_OP_ELEMENT_WISE_MULTIPLY; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; } return nullptr; } -static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.type; +static uint32_t qnn_get_ggml_tensor_data_size(const ggml_tensor * tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = qnn_get_ggml_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; } - return QNN_TENSOR_TYPE_UNDEFINED; + + return data_size; + */ + return ggml_nbytes(tensor); } -static inline Qnn_TensorDataFormat_t - get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataFormat; +static const char * qnn_get_backend_name(int n_backend_type) { + switch (n_backend_type) { + case QNN_BACKEND_CPU: + return "QNN-CPU"; + case QNN_BACKEND_GPU: + return "QNN-GPU"; + case QNN_BACKEND_NPU: + return "QNN-NPU"; + case QNN_BACKEND_GGML: + return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML + default: + return "unknown"; } - return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; } -static inline Qnn_DataType_t - get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { - if (tensor.version 
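
qnn_datatype_from_ggml_datatype() doubles as the support check: any ggml type it cannot map comes back as QNN_DATATYPE_UNDEFINED. A small helper in that spirit (hypothetical, not part of this patch) can gate offloading early:

    static bool ggml_qnn_type_supported(const ggml_tensor * t) {
        // only F16/F32/I8/Q8_0/Q4_0 currently map to a QNN data type
        return qnn_datatype_from_ggml_datatype(t->type) != QNN_DATATYPE_UNDEFINED;
    }
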
== QNN_TENSOR_VERSION_1) { - return tensor.v1.dataType; +static const char * qnn_get_chipset_desc(uint32_t chipset_id) { + switch (chipset_id) { + case SM8450: + return "SM8450"; + case SM8475: + return "SM8475"; + case SM8550: + return "SM8550"; + case SM8650: + return "SM8650"; + default: + return "unknown"; } - return QNN_DATATYPE_UNDEFINED; } -static inline Qnn_QuantizeParams_t - get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.quantizeParams; +static const char * qnn_get_htparch_desc(size_t htp_arch) { + switch (htp_arch) { + case V68: + return "QCOM_HTP_V68"; + case V69: + return "QCOM_HTP_V69"; + case V73: + return "QCOM_HTP_V73"; + case V75: + return "QCOM_HTP_V75"; + default: + return "unknown"; } - return QNN_QUANTIZE_PARAMS_INIT; } -static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.rank; +static void qnn_internal_log(ggml_log_level level, const char * file, + const char * func, int line, + const char * format, ...) { + static std::mutex qnn_internal_log_mutex; + static char s_qnn_internal_log_buf[QNN_LOGBUF_LEN]; + + { + std::lock_guard lock(qnn_internal_log_mutex); + va_list args; + + va_start(args, format); + int len_prefix = + snprintf(s_qnn_internal_log_buf, QNN_LOGBUF_LEN, + "[%s, %d]: ", func, line); + int len = vsnprintf(s_qnn_internal_log_buf + len_prefix, + QNN_LOGBUF_LEN - len_prefix, format, args); + if (len < (QNN_LOGBUF_LEN - len_prefix)) { +#if (defined __ANDROID__) || (defined ANDROID) + // for Android APK + __android_log_print(level, "ggml-qnn", "%s\n", s_qnn_internal_log_buf); +#endif + // for Android command line application or WoA(Windows on ARM) + printf("%s\n", s_qnn_internal_log_buf); + } + va_end(args); } - return 0u; } -static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dimensions; + +static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + QNN_LOG_WARN("invalid params\n"); + return false; } - return nullptr; -} -static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memType; + qnn_instance * instance = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + tensor_0 = (Qnn_Tensor_t *) src0->extra; + tensor_1 = (Qnn_Tensor_t *) src1->extra; + tensor_2 = (Qnn_Tensor_t *) dst->extra; + instance = ctx->instance; + if ((nullptr == instance) || (nullptr == tensor_0) || (nullptr == tensor_1) || (nullptr == tensor_2)) { + QNN_LOG_WARN("invalid params\n"); + return false; } - return QNN_TENSORMEMTYPE_UNDEFINED; + + return true; } -static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.id = id; +#define CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } while (0) + +#if ENABLE_QNNBACKEND_PERF +class qnn_perf { +public: + qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; + qnn_perf() = delete; + qnn_perf(const qnn_perf & ) = delete; + qnn_perf & operator= (const qnn_perf & ) = delete; + + 
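
qnn_perf is meant to be used as a scoped pair of calls around each offloaded op; with ENABLE_QNNBACKEND_PERF off, the stub class below compiles both calls away. Typical (hypothetical) usage:

    qnn_perf perf("ggml_qnn_mul_mat");
    perf.start();
    // ... build and execute the QNN graph ...
    perf.info();   // logs: duration of ggml_qnn_mul_mat : N milliseconds
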
void start() { + _begin_time = ggml_time_us(); } -} -static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.name = name; + void info() { + _end_time = ggml_time_us(); + _duration = (_end_time - _begin_time) / 1000; + QNN_LOG_DEBUG("duration of %s : %lld milliseconds\n", _perf_name.c_str(), _duration); } -} -static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { +private: + int64_t _begin_time = 0LL; + int64_t _end_time = 0LL; + int64_t _duration = 0LL; + std::string _perf_name; +}; +#else +class qnn_perf { +public: + qnn_perf(const std::string & perf_name) {} + qnn_perf() = delete; + qnn_perf(const qnn_perf & ) = delete; + qnn_perf & operator= (const qnn_perf & ) = delete; + + void start() {} + void info() {} +}; +#endif + +// ================================================================================================= +// +// helper data type / data structure / macros / functions of +// Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm +// ================================================================================================= +enum qnn_sdk_profile_level { + profile_off = 0, + profile_basic = 1, + profile_detail = 2 +}; + +using _pfn_rpc_mem_init = void (*)(void); +using _pfn_rpc_mem_deinit = void (*)(void); +using _pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); +using _pfn_rpc_mem_free = void (*)(void *); +using _pfn_rpc_mem_to_fd = int (*)(void *); + +using _pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); +using _pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); +using _pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); + +#define QNN_VER_PTR(x) (&((x).v1)) +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 + +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + QNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ + } while (0) + +#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) 
set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) +#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) + +static inline int validate_tensor_version(Qnn_Tensor_t tensor) { + if (tensor.version != QNN_TENSOR_VERSION_1) { + QNN_LOG_WARN( + "validate_tensor_version() tensor %s, got unsupported version %d\n", + tensor.v1.name, tensor.version); + return 1; + } + return 0; +} + +static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; + } + + return 0u; +} + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; + } + return nullptr; +} + +static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; + } + return QNN_TENSOR_TYPE_UNDEFINED; +} + +static inline Qnn_TensorDataFormat_t + get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} + +static inline Qnn_DataType_t + get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; + } + return QNN_DATATYPE_UNDEFINED; +} + +static inline Qnn_QuantizeParams_t + get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; +} + +static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; + } + return 0u; +} + +static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; + } + return nullptr; +} + +static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; +} + +static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; + } +} + +static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; + } +} + +static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.type = type; } } @@ -419,18 +672,13 @@ static size_t memscpy(void * dst, size_t dst_size, const void * src, size_t copy return min_size; } -static char * ggml_qnn_strndup(const char * source, size_t maxlen) { - return ::strndup(source, maxlen); -} - static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { int err = 0; VALIDATE_TENSOR_VERSION(src, err); dst.version = src.version; QNN_TENSOR_SET_NAME( - dst, ggml_qnn_strndup(QNN_TENSOR_GET_NAME(src), - std::string(QNN_TENSOR_GET_NAME(src)).size())); + dst, ::strndup(QNN_TENSOR_GET_NAME(src),std::string(QNN_TENSOR_GET_NAME(src)).size())); if (nullptr == QNN_TENSOR_GET_NAME(dst)) { return 1; } @@ -508,140 +756,61 @@ static int free_qnn_tensor(Qnn_Tensor_t & 
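
deep_copy_qnn_tensors() duplicates the source name with ::strndup precisely because the copy owns it: free_qnn_tensor() later releases the name (and the copied dimension array) unconditionally. A sketch of the paired lifetime, assuming QNN_TENSOR_INIT from the SDK headers:

    Qnn_Tensor_t copy = QNN_TENSOR_INIT;
    if (0 == deep_copy_qnn_tensors(src, copy)) {
        // ... hand `copy` to the graph ...
        free_qnn_tensor(copy);   // releases the strndup'ed name and the copied dimensions
    }
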
tensor) { return err; } -static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { - uint32_t rank = 0; - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { - rank++; - } - } - return rank; -} - -// TODO: mapping more ggml data type to QNN data type -// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 -static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { - switch (ggmltype) { - case GGML_TYPE_F16: - return QNN_DATATYPE_FLOAT_16; - case GGML_TYPE_F32: - return QNN_DATATYPE_FLOAT_32; - case GGML_TYPE_I8: - return QNN_DATATYPE_INT_8; - case GGML_TYPE_Q8_0: - return QNN_DATATYPE_SFIXED_POINT_8; - case GGML_TYPE_Q4_0: - return QNN_DATATYPE_SFIXED_POINT_4; - default: - break; - } - return QNN_DATATYPE_UNDEFINED; -} - -// TODO: only support GGML_OP_ADD/GGML_OP_MUL/GGML_OP_MUL_MAT -static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { - switch (ggmlop) { - case GGML_OP_ADD: - return QNN_OP_ELEMENT_WISE_ADD; - case GGML_OP_MUL: - return QNN_OP_ELEMENT_WISE_MULTIPLY; - case GGML_OP_MUL_MAT: - return QNN_OP_MAT_MUL; - default: - break; - } - - return nullptr; -} - -static uint32_t ggml_get_tensor_data_size(const ggml_tensor * tensor) { - /* - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = ggml_get_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } - - return data_size; - */ - return ggml_nbytes(tensor); -} - -template -Fn load_qnn_functionpointers(void * handle, const char * function_name) { +template Fn load_qnn_functionpointers(void * handle, const char * function_name) { return reinterpret_cast(dlsym(handle, function_name)); } -static const char * get_qnn_backend_name(int n_backend_type) { - switch (n_backend_type) { - case QNN_BACKEND_CPU: - return "QNN-CPU"; - case QNN_BACKEND_GPU: - return "QNN-GPU"; - case QNN_BACKEND_NPU: - return "QNN-NPU"; - case QNN_BACKEND_GGML: - return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML - default: - return "unknown"; - } -} - -static const char * qnn_get_chipset_desc(uint32_t chipset_id) { - switch (chipset_id) { - case SM8450: - return "SM8450"; - case SM8475: - return "SM8475"; - case SM8550: - return "SM8550"; - case SM8650: - return "SM8650"; - default: - return "unknown"; - } -} - static intptr_t align_to(size_t alignment, intptr_t offset) { return offset % alignment == 0 - ? offset - : offset + (static_cast(alignment) - - offset % static_cast(alignment)); + ? offset + : offset + (static_cast(alignment) - + offset % static_cast(alignment)); } -static void ggml_qnn_log_internal(ggml_log_level level, const char * file, - const char * func, int line, - const char * format, ...) 
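
align_to() rounds an offset up to the next multiple of the alignment and leaves already-aligned offsets untouched. Two worked values:

    intptr_t a = align_to(32, 0x1005);   // -> 0x1020: 0x1005 % 32 == 5, so 27 is added
    intptr_t b = align_to(32, 0x1020);   // -> 0x1020: already a multiple of 32
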
{ - static std::mutex ggml_qnn_log_internal_mutex; - static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; +static void qnn_sdk_logcallback(const char * fmt, QnnLog_Level_t level, + uint64_t timestamp, va_list argp) { + +#if ENABLE_QNNSDK_LOG + static std::mutex log_mutex; + static unsigned char s_ggml_qnn_logbuf[QNN_LOGBUF_LEN]; + + const char * log_level_desc = ""; + switch (level) { + case QNN_LOG_LEVEL_ERROR: + log_level_desc = "ERROR"; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = "INFO"; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = "DEBUG"; + break; + case QNN_LOG_LEVEL_VERBOSE: + log_level_desc = "VERBOSE"; + break; + case QNN_LOG_LEVEL_MAX: + log_level_desc = "UNKNOWN"; + break; + } + double ms = (double) timestamp / 1000000.0; { - std::lock_guard lock(ggml_qnn_log_internal_mutex); - va_list args; + std::lock_guard lock(log_mutex); - va_start(args, format); - int len_prefix = - snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, - "[%s, %d]: ", func, line); - int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, - GGML_QNN_LOGBUF_LEN - len_prefix, format, args); - if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { -#if (defined __ANDROID__) || (defined ANDROID) - // for Android APK - __android_log_print(level, "ggml-qnn", "%s\n", s_ggml_qnn_log_internal_buf); -#endif - // for Android command line application or WoA - printf("%s\n", s_ggml_qnn_log_internal_buf); - } - va_end(args); + memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN); + vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); + QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); } +#endif } // ================================================================================================= // -// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI -// Engine Direct) SDK -// +// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK +// ref:https://github.com/pytorch/executorch/tree/main/backends/qualcomm // ================================================================================================= class qnn_interface { @@ -778,11 +947,6 @@ class qnn_interface { const QnnSystemInterface_t * _qnn_sys_interface = nullptr; }; -// ================================================================================================= -// -// wrapper class of Qualcomm QNN(Qualcomm Neural Network, aka Qualcomm AI Engine Direct) SDK -// -// ================================================================================================= class qnn_instance { public: using BackendIdType = decltype(QnnInterface_t{}.backendId); @@ -796,44 +960,354 @@ class qnn_instance { ~qnn_instance() {} - int qnn_init(const QnnSaver_Config_t ** saver_config); + int qnn_init(const QnnSaver_Config_t ** saver_config) { + BackendIdType backend_id = QNN_BACKEND_ID_NULL; + QNN_LOG_DEBUG("enter qni_init\n"); - int qnn_finalize(); + std::lock_guard lock(_init_mutex); - const qnn_interface & get_qnn_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + if (0 != load_system()) { + QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); + return 1; + } else { + QNN_LOG_DEBUG("load QNN system lib successfully\n"); } - return _qnn_interface; - } - const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not 
loaded\n"); + std::string backend_lib_path = _lib_path + _backend_name; + if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { + int is_load_ok = load_backend(backend_lib_path, saver_config); + if (0 != is_load_ok) { + QNN_LOG_WARN("failed to load QNN backend\n"); + return 2; + } } - return _qnn_raw_interface; - } - const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { - if (!_qnn_interface.is_loaded()) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + backend_id = _lib_path_to_backend_id[backend_lib_path]; + if (0 == _loaded_backend.count(backend_id) || + 0 == _loaded_lib_handle.count(backend_id)) { + QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, " + "loaded lib_handle count=%zu\n", + backend_lib_path.c_str(), _loaded_backend.count(backend_id), + _loaded_lib_handle.count(backend_id)); + return 3; } - return _qnn_raw_system_interface; - } - const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - const Qnn_ProfileHandle_t get_qnn_profile_handle() { - return _qnn_profile_handle; - } + _qnn_interface.qnn_log_create(qnn_sdk_logcallback, _qnn_log_level, + &_qnn_log_handle); + if (nullptr == _qnn_log_handle) { + QNN_LOG_WARN( + "why failed to initialize qnn log\n"); // NPU backend not work on + // Qualcomm SoC equipped low-end phone + return 4; + } else { + QNN_LOG_DEBUG("initialize qnn log successfully\n"); + } - const Qnn_DeviceHandle_t get_qnn_device_handle() { - return _qnn_device_handle; - } + std::vector temp_backend_config; + _qnn_interface.qnn_backend_create( + _qnn_log_handle, + temp_backend_config.empty() ? nullptr : temp_backend_config.data(), + &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + QNN_LOG_WARN("why failed to initialize qnn backend\n"); + return 5; + } else { + QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + } - const Qnn_BackendHandle_t get_qnn_backend_handle() { - return _qnn_backend_handle; - } + if (nullptr != _qnn_raw_interface.propertyHasCapability) { + auto qnnStatus = + _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) { + QNN_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnStatus) { + QNN_LOG_WARN("device property is not known to backend\n"); + } + } + + Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, + &_qnn_device_handle); + if (QNN_SUCCESS != qnn_status && + QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { + QNN_LOG_WARN("failed to create QNN device\n"); + } else { + QNN_LOG_INFO("create device successfully\n"); + } + + if (qnn_sdk_profile_level::profile_off != _profile_level) { + QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (qnn_sdk_profile_level::profile_basic == _profile_level) { + QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != + _qnn_raw_interface.profileCreate(_qnn_backend_handle, + QNN_PROFILE_LEVEL_BASIC, + &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 6; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } else if (qnn_sdk_profile_level::profile_detail == _profile_level) { + QNN_LOG_INFO("detailed profiling requested. 
Creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != + _qnn_raw_interface.profileCreate(_qnn_backend_handle, + QNN_PROFILE_LEVEL_DETAILED, + &_qnn_profile_handle)) { + QNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } + } + + _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (nullptr == _rpc_lib_handle) { + QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); + return 8; + } else { + QNN_LOG_DEBUG("load rpcmem lib successfully\n"); + set_rpcmem_initialized(true); + } + __pfn_rpc_mem_init = reinterpret_cast<_pfn_rpc_mem_init>( + dlsym(_rpc_lib_handle, "rpcmem_init")); + __pfn_rpc_mem_deinit = reinterpret_cast<_pfn_rpc_mem_deinit>( + dlsym(_rpc_lib_handle, "rpcmem_deinit")); + __pfn_rpc_mem_alloc = reinterpret_cast<_pfn_rpc_mem_alloc>( + dlsym(_rpc_lib_handle, "rpcmem_alloc")); + __pfn_rpc_mem_free = reinterpret_cast<_pfn_rpc_mem_free>( + dlsym(_rpc_lib_handle, "rpcmem_free")); + __pfn_rpc_mem_to_fd = reinterpret_cast<_pfn_rpc_mem_to_fd>( + dlsym(_rpc_lib_handle, "rpcmem_to_fd")); + if (nullptr == __pfn_rpc_mem_alloc || nullptr == __pfn_rpc_mem_free || + nullptr == __pfn_rpc_mem_to_fd) { + QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); + dlclose(_rpc_lib_handle); + return 9; + } + + if (nullptr != + __pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy + __pfn_rpc_mem_init(); + + std::vector temp_context_config; + _qnn_interface.qnn_context_create( + _qnn_backend_handle, _qnn_device_handle, + temp_context_config.empty() ? nullptr : temp_context_config.data(), + &_qnn_context_handle); + if (nullptr == _qnn_context_handle) { + QNN_LOG_WARN("why failed to initialize qnn context\n"); + return 10; + } else { + QNN_LOG_DEBUG("initialize qnn context successfully\n"); + } + + if (_backend_name.find("Htp") != std::variant_npos) { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + for (int i = 0; i < p_info->v1.numHwDevices; i++) { + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, + infos[i].v1.deviceType, infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
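
Per the comments above, rpcmem_init/rpcmem_deinit are treated as optional symbols: the code only hard-fails when alloc/free/to_fd are missing, apparently because some libcdsprpc.so builds do not export the init pair. The guarded-optional-symbol pattern, in isolation:

    auto init = reinterpret_cast<_pfn_rpc_mem_init>(dlsym(handle, "rpcmem_init"));
    if (init != nullptr) {   // absent from some libcdsprpc.so builds; safe to skip
        init();
    }
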
"ON_CHIP" : ""); + QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", \ + chipinfo.socModel, qnn_get_chipset_desc(chipinfo.socModel), \ + htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize); + g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + + + //TODO: faster approach to probe the accurate capacity of rpc ion memory + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + const int SIZE_IN_MB = (1 << 20); + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * SIZE_IN_MB, 4)); + if (nullptr == rpc_buffer) { + QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + if (candidate_size > _rpcmem_capacity) + _rpcmem_capacity = candidate_size; + QNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + } + + QNN_LOG_DEBUG("leave qni_init\n"); + + return 0; + } + + int qnn_finalize() { + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (nullptr != + __pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy + __pfn_rpc_mem_deinit(); + + if (dlclose(_rpc_lib_handle) != 0) { + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); + } else { + QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); + } + + if (nullptr != _qnn_context_handle) { + error = _qnn_interface.qnn_context_free(_qnn_context_handle, + _qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_context_handle = nullptr; + } + + if (nullptr != _qnn_profile_handle) { + error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_profile_handle = nullptr; + } + + if (nullptr != _qnn_device_handle) { + error = _qnn_interface.qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_device_handle = nullptr; + } + + if (nullptr != _qnn_backend_handle) { + error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_backend_handle = nullptr; + } + + if (nullptr != _qnn_log_handle) { + error = _qnn_interface.qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), + QNN_GET_ERROR_CODE(error)); + } + _qnn_log_handle = nullptr; + } + + unload_backend(); + + unload_system(); + + return ret_status; + } + + int init_qnn_graph(const char * graph_name, bool debug, + uint8_t do_node_validation = true, + const QnnGraph_Config_t ** graph_configs = nullptr) { + int result = 0; + + if (nullptr == graph_name) { + QNN_LOG_WARN("graph name is null\n"); + return 1; + } + + if 
(!_graph_name.empty()) { + QNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); + return 2; + } + + if (!do_node_validation) { + QNN_LOG_WARN("node validation disabled, backend will not perform op " + "validation prior to adding node\n"); + } + + _graph_name = graph_name; + _debug_tensor = debug; + _do_node_validations = do_node_validation; + + result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, + graph_configs, &_qnn_graph_handle); + if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { + QNN_LOG_WARN("failed to create graph in qnn context\n"); + return 3; + } else { + QNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); + } + + return 0; + } + + int finalize_qnn_graph() { + if (nullptr != _qnn_graph_handle) { + if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, + _qnn_profile_handle, + nullptr) != QNN_GRAPH_NO_ERROR) { + QNN_LOG_WARN("finalizing graph failure\n"); + } + } else { + QNN_LOG_DEBUG("qnn graph handle is null\n"); + } + + return 0; + } + + const qnn_interface & get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + const Qnn_ProfileHandle_t get_qnn_profile_handle() { + return _qnn_profile_handle; + } + + const Qnn_DeviceHandle_t get_qnn_device_handle() { + return _qnn_device_handle; + } + + const Qnn_BackendHandle_t get_qnn_backend_handle() { + return _qnn_backend_handle; + } const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; @@ -845,12 +1319,6 @@ class qnn_instance { const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } - int init_qnn_graph(const char * graph_name, bool debug, - uint8_t do_node_validation = 1, - const QnnGraph_Config_t ** graph_configs = nullptr); - - int finalize_qnn_graph(); - int init_htp_perfinfra() { QnnDevice_Infrastructure_t device_infra = nullptr; int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); @@ -945,793 +1413,416 @@ class qnn_instance { _rpcmem_initialized = initialized; } - int32_t rpcmem_to_fd(void * buf); - - int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); - - void unregister_rpcmem(); - - void * alloc_rpcmem(size_t bytes, size_t alignment); - - void free_rpcmem(void * buf); - size_t get_rpcmem_capacity() { return _rpcmem_capacity; } - bool is_rpcmem_allocated(void * buf); - bool is_rpcmem_registered(Qnn_MemHandle_t handle) { return _qnn_mem_set.count(handle) != 0U; } - public: - std::map> - _qnn_graph_map; - - private: - int load_system(); - - int unload_system(); - - int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); - - int unload_backend(); - - void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_interface = raw_interface; - } - - void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_system_interface = raw_interface; - } - - private: - static constexpr const int 
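
init_qnn_graph()/finalize_qnn_graph() bracket the usual QNN graph lifecycle: graphCreate, then node and tensor additions through the raw interface, then graphFinalize before the graph may be executed. A sketch (call names from the QNN SDK; error handling elided):

    instance->init_qnn_graph("ggml_op_add", false /*debug*/);
    // ... tensorCreateGraphTensor() / graphAddNode() via the raw interface ...
    instance->finalize_qnn_graph();   // graphFinalize() must succeed before graphExecute()
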
_required_num_providers = 1; - - private: - std::string _lib_path; - std::string _backend_name; - std::string _model_name; // prebuilt QNN model name, not used currently - BackendIdType _backend_id; - - bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode - bool _do_node_validations = true; // flag to indicate whether all add_node - // calls need to be validated - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - - ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; - - qnn_interface _qnn_interface; - - void * _system_lib_handle = nullptr; - - Qnn_GraphHandle_t _qnn_graph_handle = nullptr; - - Qnn_LogHandle_t _qnn_log_handle = nullptr; - - Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; - - Qnn_DeviceHandle_t _qnn_device_handle = nullptr; - - Qnn_BackendHandle_t _qnn_backend_handle = nullptr; - - Qnn_ContextHandle_t _qnn_context_handle = nullptr; - - QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - - QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; - uint32_t _qnn_power_configid = 1; - uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing - - QNN_INTERFACE_VER_TYPE _qnn_raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; - - std::unordered_set _qnn_mem_set; - - std::mutex _init_mutex; - std::unordered_map _loaded_lib_handle; - std::unordered_map _lib_path_to_backend_id; - std::unordered_map _loaded_backend; - - void * _rpc_lib_handle = nullptr; - std::atomic_bool _rpcmem_initialized{false}; - pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; - pfn_rpc_mem_free _pfn_rpc_mem_free; - pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; - pfn_rpc_mem_init _pfn_rpc_mem_init; - pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; - std::unordered_map _rpcmem_store_map; - size_t _rpcmem_capacity = 512; - - std::string _graph_name; -}; - -// ================================================================================================= -// -// implementation of QNN wrapper class -// -// ================================================================================================= -void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { - if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); - return nullptr; - } - - auto allocate_bytes = static_cast(bytes + alignment); - void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, - allocate_bytes); - if (buf == nullptr) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); - return nullptr; - } - - auto aligned_buf = reinterpret_cast( - align_to(alignment, reinterpret_cast(buf))); - bool status = - _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; - if (!status) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); - _pfn_rpc_mem_free(buf); - } - - return aligned_buf; -} - -void qnn_instance::free_rpcmem(void * buf) { - if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); - } else if (0 == _rpcmem_store_map.count(buf)) { - QNN_LOG_WARN("no allocated tensor\n"); - } else { - _pfn_rpc_mem_free(_rpcmem_store_map[buf]); - _rpcmem_store_map.erase(buf); - } -} + void * alloc_rpcmem(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return nullptr; + } -int32_t qnn_instance::rpcmem_to_fd(void * buf) { - int32_t mem_fd = -1; - if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); - } else { - mem_fd = _pfn_rpc_mem_to_fd(buf); - } + auto allocate_bytes = static_cast(bytes + 
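
alloc_rpcmem() below over-allocates by `alignment` bytes, hands back a pointer rounded up with align_to(), and records the aligned-to-raw mapping so free_rpcmem() can return the original allocation to rpcmem_free(). From the caller's side:

    void * p = instance->alloc_rpcmem(ggml_nbytes(tensor), 4);   // ION-backed allocation
    if (p != nullptr) {
        // ... fill or register the buffer ...
        instance->free_rpcmem(p);   // map lookup recovers the unaligned base pointer
    }
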
alignment); + void * buf = __pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, + allocate_bytes); + if (buf == nullptr) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + return nullptr; + } - return mem_fd; -} + auto aligned_buf = reinterpret_cast( + align_to(alignment, reinterpret_cast(buf))); + bool status = + _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + QNN_LOG_WARN("failed to allocate rpc memory\n"); + __pfn_rpc_mem_free(buf); + } -int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { - if (nullptr == p_data || (nullptr == p_tensor)) { - QNN_LOG_WARN("invalid param\n"); - return 1; + return aligned_buf; } - if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); - return 2; + void free_rpcmem(void * buf) { + if (!_rpcmem_initialized) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else if (0 == _rpcmem_store_map.count(buf)) { + QNN_LOG_WARN("no allocated tensor\n"); + } else { + __pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); + } } - if (is_rpcmem_allocated(p_data)) { - QNN_LOG_WARN("rpc memory already allocated\n"); - // return 3; - } - if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { - QNN_LOG_WARN("tensor %s has been registered shared memory\n", - (QNN_VER_PTR(*p_tensor)->name)); - return 4; - } + int32_t rpcmem_to_fd(void * buf) { + int32_t mem_fd = -1; + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + } else { + mem_fd = __pfn_rpc_mem_to_fd(buf); + } - int32_t mem_fd = rpcmem_to_fd(p_data); - if (-1 == mem_fd) { - QNN_LOG_WARN("failed to get file descriptor\n"); - return 5; + return mem_fd; } - QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = {{QNN_VER_PTR(*p_tensor)->rank, - QNN_VER_PTR(*p_tensor)->dimensions, - nullptr}, - QNN_VER_PTR(*p_tensor)->dataType, - QNN_MEM_TYPE_ION, - {{mem_fd}}}; - Qnn_MemHandle_t handle = nullptr; - int error = QNN_SUCCESS; - error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, - /*numDescriptors=*/1, &handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", - QNN_GET_ERROR_CODE(error), strerror(error)); - return 6; - } else { - QNN_LOG_INFO("tensor %s successfully register shared memory\n", - (QNN_VER_PTR(*p_tensor)->name)); - } - QNN_VER_PTR(*p_tensor)->memHandle = handle; - _qnn_mem_set.insert(handle); - - return 0; -} -void qnn_instance::unregister_rpcmem() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; + int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { + if (nullptr == p_data || (nullptr == p_tensor)) { + QNN_LOG_WARN("invalid param\n"); + return 1; + } - if (_qnn_mem_set.empty()) { - QNN_LOG_WARN("no rpcmem registered\n"); - } + if (!is_rpcmem_initialized()) { + QNN_LOG_WARN("rpc memory not initialized\n"); + return 2; + } - for (auto &mem_handle : _qnn_mem_set) { - error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (is_rpcmem_allocated(p_data)) { + QNN_LOG_WARN("rpc memory already allocated\n"); + // return 3; + } + if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { + QNN_LOG_WARN("tensor %s has been registered shared memory\n", + (QNN_VER_PTR(*p_tensor)->name)); + return 4; + } + + int32_t mem_fd = rpcmem_to_fd(p_data); + if (-1 == mem_fd) { + QNN_LOG_WARN("failed to get file descriptor\n"); + return 5; + } + QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = {{QNN_VER_PTR(*p_tensor)->rank, + 
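
register_rpcmem() below is the zero-copy handshake for the NPU path: the ION buffer is converted to a file descriptor, wrapped in a Qnn_MemDescriptor_t with QNN_MEM_TYPE_ION, registered with the context, and the resulting mem handle replaces the tensor's client buffer. End to end (sketch):

    void * buf = instance->alloc_rpcmem(ggml_nbytes(t), 4);
    memcpy(buf, t->data, ggml_nbytes(t));            // stage the data once
    instance->register_rpcmem(buf, p_qnn_tensor);    // tensor now references the ION fd
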
QNN_VER_PTR(*p_tensor)->dimensions, + nullptr}, + QNN_VER_PTR(*p_tensor)->dataType, + QNN_MEM_TYPE_ION, + {{mem_fd}}}; + Qnn_MemHandle_t handle = nullptr; + int error = QNN_SUCCESS; + error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to unregister shared memory, error %d\n", - QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", + QNN_GET_ERROR_CODE(error), strerror(error)); + return 6; + } else { + QNN_LOG_INFO("tensor %s successfully register shared memory\n", + (QNN_VER_PTR(*p_tensor)->name)); } - } - _qnn_mem_set.clear(); -} - -bool qnn_instance::is_rpcmem_allocated(void * buf) { - return _rpcmem_store_map.count(buf) != 0U; -} - -int qnn_instance::load_backend(std::string & lib_path, - const QnnSaver_Config_t ** saver_config) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + QNN_VER_PTR(*p_tensor)->memHandle = handle; + _qnn_mem_set.insert(handle); - void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); - if (nullptr == lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, with error: %s", - lib_path.c_str(), dlerror()); - return 1; - } - - auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>( - lib_handle, "QnnInterface_getProviders"); - if (nullptr == get_providers) { - QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", - dlerror()); - return 2; + return 0; } - std::uint32_t num_providers = 0; - const QnnInterface_t ** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d", - QNN_GET_ERROR_CODE(error)); - return 3; - } - QNN_LOG_DEBUG("num_providers=%d\n", num_providers); - if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d", num_providers, - _required_num_providers); - return 4; - } + void unregister_rpcmem() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; - if (nullptr == provider_list) { - QNN_LOG_WARN("failed to get qnn interface providers\n"); - return 5; - } - bool found_valid_interface = false; - QNN_INTERFACE_VER_TYPE qnn_interface; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_API_VERSION_MAJOR == - provider_list[idx]->apiVersion.coreApiVersion.major && - QNN_API_VERSION_MINOR <= - provider_list[idx]->apiVersion.coreApiVersion.minor) { - found_valid_interface = true; - qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; - break; + if (_qnn_mem_set.empty()) { + QNN_LOG_WARN("no rpcmem registered\n"); } - } - if (!found_valid_interface) { - QNN_LOG_WARN("unable to find a valid qnn interface\n"); - return 6; - } else { - QNN_LOG_INFO("find a valid qnn interface\n"); - } - set_qnn_raw_interface(qnn_interface); - - BackendIdType backend_id = provider_list[0]->backendId; - _lib_path_to_backend_id[lib_path] = backend_id; - if (_loaded_backend.count(backend_id) > 0) { - QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", - lib_path.c_str(), backend_id); - } - _loaded_backend[backend_id] = provider_list[0]; - if (_loaded_lib_handle.count(backend_id) > 0) { - QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); - int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); - if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with error %s\n", - _loaded_lib_handle[backend_id], dlerror()); + for (auto & mem_handle : 
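
As elsewhere in this class, register_rpcmem() returns 0 on success and a small positive integer identifying the failing step, which keeps failure logs unambiguous:

    int rc = instance->register_rpcmem(buf, p_tensor);
    if (0 != rc) {
        QNN_LOG_WARN("register_rpcmem failed at step %d\n", rc);   // 1..6, see above
    }
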
_qnn_mem_set) { + error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to unregister shared memory, error %d\n", + QNN_GET_ERROR_CODE(error)); + } } + _qnn_mem_set.clear(); } - _loaded_lib_handle[backend_id] = lib_handle; - _backend_id = backend_id; - return 0; -} - -int qnn_instance::unload_backend() { - int dlclose_error = 0; - for (auto & it : _loaded_lib_handle) { - dlclose_error = dlclose(it.second); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, - dlerror()); - } + bool is_rpcmem_allocated(void * buf) { + return _rpcmem_store_map.count(buf) != 0U; } - _loaded_lib_handle.clear(); - _lib_path_to_backend_id.clear(); - _loaded_backend.clear(); - return 0; -} + public: + std::map> + _qnn_graph_map; -int qnn_instance::load_system() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; + private: + int load_system() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; - std::string system_lib_path = _lib_path + "libQnnSystem.so"; - QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + std::string system_lib_path = _lib_path + "libQnnSystem.so"; + QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); - _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); - if (nullptr == _system_lib_handle) { - QNN_LOG_WARN("can not open QNN library %s, error: %s\n", - system_lib_path.c_str(), dlerror()); - return 1; - } + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, error: %s\n", + system_lib_path.c_str(), dlerror()); + return 1; + } - auto * get_providers = - reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>( - dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); - if (nullptr == get_providers) { - QNN_LOG_WARN( - "can not load QNN symbol QnnSystemInterface_getProviders: %s\n", - dlerror()); - return 2; - } + auto * get_providers = + reinterpret_cast<_pfn_qnnsysteminterface_getproviders *>( + dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); + if (nullptr == get_providers) { + QNN_LOG_WARN( + "can not load QNN symbol QnnSystemInterface_getProviders: %s\n", + dlerror()); + return 2; + } - uint32_t num_providers = 0; - const QnnSystemInterface_t ** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d\n", - QNN_GET_ERROR_CODE(error)); - return 3; - } + uint32_t num_providers = 0; + const QnnSystemInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d\n", + QNN_GET_ERROR_CODE(error)); + return 3; + } - if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, - _required_num_providers); - return 4; - } + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, + _required_num_providers); + return 4; + } - if (nullptr == provider_list) { - QNN_LOG_WARN("can not get providers\n"); - return 5; - } + if (nullptr == provider_list) { + QNN_LOG_WARN("can not get providers\n"); + return 5; + } - QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; - bool found_valid_system_interface = false; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_SYSTEM_API_VERSION_MAJOR == + 
QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == provider_list[idx]->systemApiVersion.major && - QNN_SYSTEM_API_VERSION_MINOR <= + QNN_SYSTEM_API_VERSION_MINOR <= provider_list[idx]->systemApiVersion.minor) { - found_valid_system_interface = true; - qnn_system_interface = - provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; - break; + found_valid_system_interface = true; + qnn_system_interface = + provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } } - } - if (!found_valid_system_interface) { - QNN_LOG_WARN("unable to find a valid qnn system interface\n"); - return 6; - } else { - QNN_LOG_INFO("find a valid qnn system interface\n"); - } - set_qnn_raw_system_interface(qnn_system_interface); - - _qnn_interface.set_qnn_system_interface(provider_list[0]); - - _qnn_interface.qnn_system_context_create(&_qnn_system_handle); - if (nullptr == _qnn_system_handle) { - QNN_LOG_WARN("can not create QNN system contenxt\n"); - } else { - QNN_LOG_INFO("initialize qnn system successfully\n"); - } - - return 0; -} - -int qnn_instance::unload_system() { - int result = 0; + if (!found_valid_system_interface) { + QNN_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn system interface\n"); + } + set_qnn_raw_system_interface(qnn_system_interface); - if (nullptr == _system_lib_handle) { - QNN_LOG_DEBUG("system lib handle is null\n"); - return 1; - } + _qnn_interface.set_qnn_system_interface(provider_list[0]); - if (nullptr != _qnn_system_handle) { - result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); - if (result != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN system context\n"); + _qnn_interface.qnn_system_context_create(&_qnn_system_handle); + if (nullptr == _qnn_system_handle) { + QNN_LOG_WARN("can not create QNN system contenxt\n"); + } else { + QNN_LOG_INFO("initialize qnn system successfully\n"); } - _qnn_system_handle = nullptr; - } - int dlclose_error = dlclose(_system_lib_handle); - if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", - dlerror()); - return 2; + return 0; } - _system_lib_handle = nullptr; - - return result; -} + int unload_system() { + int result = 0; -static void ggml_qnn_logcallback(const char * fmt, QnnLog_Level_t level, - uint64_t timestamp, va_list argp) { + if (nullptr == _system_lib_handle) { + QNN_LOG_DEBUG("system lib handle is null\n"); + return 1; + } -#if ENABLE_QNN_LOG - static std::mutex log_mutex; - static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN]; + if (nullptr != _qnn_system_handle) { + result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); + if (result != QNN_SUCCESS) { + QNN_LOG_WARN("failed to free QNN system context\n"); + } + _qnn_system_handle = nullptr; + } - const char * log_level_desc = ""; - switch (level) { - case QNN_LOG_LEVEL_ERROR: - log_level_desc = "ERROR"; - break; - case QNN_LOG_LEVEL_WARN: - log_level_desc = "WARNING"; - break; - case QNN_LOG_LEVEL_INFO: - log_level_desc = "INFO"; - break; - case QNN_LOG_LEVEL_DEBUG: - log_level_desc = "DEBUG"; - break; - case QNN_LOG_LEVEL_VERBOSE: - log_level_desc = "VERBOSE"; - break; - case QNN_LOG_LEVEL_MAX: - log_level_desc = "UNKNOWN"; - break; - } + int dlclose_error = dlclose(_system_lib_handle); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", + dlerror()); + return 2; + } 
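
Provider selection in load_system() (and in load_backend() below) follows the usual QNN compatibility rule: the major API version must match exactly, while an equal or newer minor version is accepted. Expressed as a predicate:

    static bool api_version_ok(uint32_t got_major, uint32_t got_minor,
                               uint32_t want_major, uint32_t want_minor) {
        return (got_major == want_major) && (got_minor >= want_minor);
    }
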
- double ms = (double) timestamp / 1000000.0; - { - std::lock_guard lock(log_mutex); + _system_lib_handle = nullptr; - memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); - vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); - QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); + return result; } -#endif -} -int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { - BackendIdType backend_id = QNN_BACKEND_ID_NULL; - QNN_LOG_DEBUG("enter qni_init\n"); + int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); - std::lock_guard lock(_init_mutex); - - if (0 != load_system()) { - QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); - return 1; - } else { - QNN_LOG_DEBUG("load QNN system lib successfully\n"); - } + void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (nullptr == lib_handle) { + QNN_LOG_WARN("can not open QNN library %s, with error: %s", + lib_path.c_str(), dlerror()); + return 1; + } - std::string backend_lib_path = _lib_path + _backend_name; - if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { - int is_load_ok = load_backend(backend_lib_path, saver_config); - if (0 != is_load_ok) { - QNN_LOG_WARN("failed to load QNN backend\n"); + auto get_providers = load_qnn_functionpointers<_pfn_qnninterface_getproviders *>( + lib_handle, "QnnInterface_getProviders"); + if (nullptr == get_providers) { + QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", + dlerror()); return 2; } - } - - backend_id = _lib_path_to_backend_id[backend_lib_path]; - if (0 == _loaded_backend.count(backend_id) || - 0 == _loaded_lib_handle.count(backend_id)) { - QNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, " - "loaded lib_handle count=%zu\n", - backend_lib_path.c_str(), _loaded_backend.count(backend_id), - _loaded_lib_handle.count(backend_id)); - return 3; - } - - _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - - _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, - &_qnn_log_handle); - if (nullptr == _qnn_log_handle) { - QNN_LOG_WARN( - "why failed to initialize qnn log\n"); // NPU backend not work on - // Qualcomm SoC equipped low-end phone - return 4; - } else { - QNN_LOG_DEBUG("initialize qnn log successfully\n"); - } - std::vector temp_backend_config; - _qnn_interface.qnn_backend_create( - _qnn_log_handle, - temp_backend_config.empty() ? 
nullptr : temp_backend_config.data(), - &_qnn_backend_handle); - if (nullptr == _qnn_backend_handle) { - QNN_LOG_WARN("why failed to initialize qnn backend\n"); - return 5; - } else { - QNN_LOG_DEBUG("initialize qnn backend successfully\n"); - } - - if (nullptr != _qnn_raw_interface.propertyHasCapability) { - auto qnnStatus = - _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); - if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) { - QNN_LOG_WARN("device property is not supported\n"); + std::uint32_t num_providers = 0; + const QnnInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("failed to get providers, error %d", + QNN_GET_ERROR_CODE(error)); + return 3; } - if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnStatus) { - QNN_LOG_WARN("device property is not known to backend\n"); + QNN_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, + _required_num_providers); + return 4; } - } - Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, - &_qnn_device_handle); - if (QNN_SUCCESS != qnn_status && - QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { - QNN_LOG_WARN("failed to create QNN device\n"); - } else { - QNN_LOG_INFO("create device successfully\n"); - } - - if (ggml_qnn_profile_level::profile_off != _profile_level) { - QNN_LOG_INFO("profiling turned on; level = %d", _profile_level); - if (ggml_qnn_profile_level::profile_basic == _profile_level) { - QNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != - _qnn_raw_interface.profileCreate(_qnn_backend_handle, - QNN_PROFILE_LEVEL_BASIC, - &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 6; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } - } else if (ggml_qnn_profile_level::profile_detail == _profile_level) { - QNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != - _qnn_raw_interface.profileCreate(_qnn_backend_handle, - QNN_PROFILE_LEVEL_DETAILED, - &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 7; - } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } + if (nullptr == provider_list) { + QNN_LOG_WARN("failed to get qnn interface providers\n"); + return 5; } - } - - _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); - if (nullptr == _rpc_lib_handle) { - QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); - return 8; - } else { - QNN_LOG_DEBUG("load rpcmem lib successfully\n"); - set_rpcmem_initialized(true); - } - _pfn_rpc_mem_init = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_init")); - _pfn_rpc_mem_deinit = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_deinit")); - _pfn_rpc_mem_alloc = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_free")); - _pfn_rpc_mem_to_fd = reinterpret_cast( - dlsym(_rpc_lib_handle, "rpcmem_to_fd")); - if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || - nullptr == _pfn_rpc_mem_to_fd) { - QNN_LOG_WARN("unable to access symbols in QNN RPC lib. 
dlerror(): %s", dlerror()); - dlclose(_rpc_lib_handle); - return 9; - } - - if (nullptr != - _pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy - _pfn_rpc_mem_init(); - - std::vector temp_context_config; - _qnn_interface.qnn_context_create( - _qnn_backend_handle, _qnn_device_handle, - temp_context_config.empty() ? nullptr : temp_context_config.data(), - &_qnn_context_handle); - if (nullptr == _qnn_context_handle) { - QNN_LOG_WARN("why failed to initialize qnn context\n"); - return 10; - } else { - QNN_LOG_DEBUG("initialize qnn context successfully\n"); - } - - if (_backend_name.find("Htp") != std::variant_npos) { - const QnnDevice_PlatformInfo_t * p_info = nullptr; - _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); - QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; - for (int i = 0; i < p_info->v1.numHwDevices; i++) { - QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, - infos[i].v1.deviceType, infos[i].v1.numCores); - QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; - QnnHtpDevice_Arch_t chiparch = chipinfo.arch; - QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); - QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d, vtcm_size_in_mb:%d MB", chipinfo.socModel, - qnn_get_chipset_desc(chipinfo.socModel), chiparch, chipinfo.vtcmSize); - } - _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - - - //TODO: faster approach to probe the accurate capacity of rpc ion memory - size_t candidate_size = 0; - uint8_t * rpc_buffer = nullptr; - const int SIZE_IN_MB = (1 << 20); - size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); - for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * SIZE_IN_MB, 4)); - if (nullptr == rpc_buffer) { - QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == + provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= + provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; break; - } else { - candidate_size = probe_slots[idx]; - free_rpcmem(rpc_buffer); - rpc_buffer = nullptr; } } - if (candidate_size > _rpcmem_capacity) - _rpcmem_capacity = candidate_size; - QNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); - } - - QNN_LOG_DEBUG("leave qni_init\n"); - - return 0; -} - -int qnn_instance::qnn_finalize() { - int ret_status = 0; - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - if (nullptr != - _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy - _pfn_rpc_mem_deinit(); - - if (dlclose(_rpc_lib_handle) != 0) { - QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); - } else { - QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); - } - - if (nullptr != _qnn_context_handle) { - error = _qnn_interface.qnn_context_free(_qnn_context_handle, - _qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", - 
_qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_context_handle = nullptr; - } - if (nullptr != _qnn_profile_handle) { - error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); + if (!found_valid_interface) { + QNN_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } else { + QNN_LOG_INFO("find a valid qnn interface\n"); + } + set_qnn_raw_interface(qnn_interface); + + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", + lib_path.c_str(), backend_id); + } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); + if (dlclose_error != 0) { + QNN_LOG_WARN("fail to close %p with error %s\n", + _loaded_lib_handle[backend_id], dlerror()); + } } - _qnn_profile_handle = nullptr; + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; + + return 0; } - if (nullptr != _qnn_device_handle) { - error = _qnn_interface.qnn_device_free(_qnn_device_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); + int unload_backend() { + int dlclose_error = 0; + for (auto & it : _loaded_lib_handle) { + dlclose_error = dlclose(it.second); + if (dlclose_error != 0) { + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, + dlerror()); + } } - _qnn_device_handle = nullptr; + + _loaded_lib_handle.clear(); + _lib_path_to_backend_id.clear(); + _loaded_backend.clear(); + + return 0; } - if (nullptr != _qnn_backend_handle) { - error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_backend_handle = nullptr; + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_interface = raw_interface; } - if (nullptr != _qnn_log_handle) { - error = _qnn_interface.qnn_log_free(_qnn_log_handle); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), - QNN_GET_ERROR_CODE(error)); - } - _qnn_log_handle = nullptr; + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_system_interface = raw_interface; } - unload_backend(); + private: + static constexpr const int _required_num_providers = 1; - unload_system(); + private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // prebuilt QNN model name, not used currently + BackendIdType _backend_id; - return ret_status; -} + bool _debug_tensor = false; + bool _do_node_validations = true; -int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, - uint8_t do_node_validation, - const QnnGraph_Config_t ** graph_configs) { - int result = 0; + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - if (nullptr == graph_name) { - QNN_LOG_WARN("graph name is null\n"); - return 1; - } + qnn_sdk_profile_level _profile_level = 
qnn_sdk_profile_level::profile_detail;
-    if (!_graph_name.empty()) {
-        QNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name);
-        return 2;
-    }
+    qnn_interface _qnn_interface;

-    if (!do_node_validation) {
-        QNN_LOG_WARN("node validation disabled, backend will not perform op "
-                     "validation prior to adding node\n");
-    }
+    void * _system_lib_handle = nullptr;

-    _graph_name = graph_name;
-    _debug_tensor = debug;
-    _do_node_validations = do_node_validation;
+    Qnn_GraphHandle_t _qnn_graph_handle = nullptr;

-    result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name,
-                                            graph_configs, &_qnn_graph_handle);
-    if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) {
-        QNN_LOG_WARN("failed to create graph in qnn context\n");
-        return 3;
-    } else {
-        QNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle);
-    }
+    Qnn_LogHandle_t _qnn_log_handle = nullptr;

-    return 0;
-}
+    Qnn_ProfileHandle_t _qnn_profile_handle = nullptr;

-int qnn_instance::finalize_qnn_graph() {
-    if (nullptr != _qnn_graph_handle) {
-        if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle,
-                                             _qnn_profile_handle,
-                                             nullptr) != QNN_GRAPH_NO_ERROR) {
-            QNN_LOG_WARN("finalizing graph failure\n");
-        }
-    } else {
-        QNN_LOG_DEBUG("qnn graph handle is null\n");
-    }
+    Qnn_DeviceHandle_t _qnn_device_handle = nullptr;

-    return 0;
-}
+    Qnn_BackendHandle_t _qnn_backend_handle = nullptr;
+
+    Qnn_ContextHandle_t _qnn_context_handle = nullptr;
+
+    QnnSystemContext_Handle_t _qnn_system_handle = nullptr;
+
+    QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr;
+    uint32_t _qnn_power_configid = 1;
+    uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing
+
+    QNN_INTERFACE_VER_TYPE _qnn_raw_interface;
+    QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface;
+
+    std::unordered_set _qnn_mem_set;
+
+    std::mutex _init_mutex;
+    std::unordered_map _loaded_lib_handle;
+    std::unordered_map _lib_path_to_backend_id;
+    std::unordered_map _loaded_backend;
+
+    void * _rpc_lib_handle = nullptr;
+    std::atomic_bool _rpcmem_initialized{false};
+    _pfn_rpc_mem_alloc __pfn_rpc_mem_alloc;
+    _pfn_rpc_mem_free __pfn_rpc_mem_free;
+    _pfn_rpc_mem_to_fd __pfn_rpc_mem_to_fd;
+    _pfn_rpc_mem_init __pfn_rpc_mem_init;
+    _pfn_rpc_mem_deinit __pfn_rpc_mem_deinit;
+    std::unordered_map _rpcmem_store_map;
+    size_t _rpcmem_capacity = 512;
+
+    std::string _graph_name;
+};

 // =================================================================================================
 //
-// implementation of GGML's QNN backend
+// implementation of QNN backend for GGML
 //
 // =================================================================================================
-static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx,
-                                   const struct ggml_tensor *tensor,
+static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx,
+                                   const struct ggml_tensor * tensor,
                                    bool b_dump_tensor_info) {
     // only support the following 3 OPs currently
     // provide a GENERAL approach could fix this problem in a standalone PR of refine ggml backend
@@ -1739,23 +1830,18 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx,
     // which the backend's ggml_backend_xxx_buffer_is_host return true.
     // this approach could be found:
     // https://github.com/ggerganov/llama.cpp/pull/7641
-    //
-    // ensure tensor->src[0] and tensor->src[1] is not nullptr.
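For orientation before the rest of this hunk: stripped of logging, the support check this series keeps reshaping amounts to a small predicate. Accept only the three ops that have QNN implementations, require both source tensors to be present, and reject degenerate shapes. A condensed sketch of that logic, using only ggml's public types (qnn_supports is an illustrative name; the real function is ggml_qnn_can_handle_op above):

    // sketch only: order the cheap rejects first
    static bool qnn_supports(const struct ggml_tensor * t) {
        if (t->op != GGML_OP_ADD && t->op != GGML_OP_MUL &&
            t->op != GGML_OP_MUL_MAT) {
            return false;                 // only these 3 ops are implemented
        }
        const struct ggml_tensor * src0 = t->src[0];
        const struct ggml_tensor * src1 = t->src[1];
        if (src0 == nullptr || src1 == nullptr) {
            return false;                 // all 3 ops take two sources
        }
        // QNN expects rank >= 2 here, so 1-wide leading dims are rejected
        return src0->ne[0] > 1 && src0->ne[1] > 1 &&
               src1->ne[0] > 1 && src1->ne[1] > 1;
    }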
- bool supported_op = - ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || - (tensor->op == GGML_OP_MUL_MAT)); + bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) + || (tensor->op == GGML_OP_MUL_MAT)); if (!supported_op) { return false; } + const struct ggml_tensor * src0 = tensor->src[0]; const struct ggml_tensor * src1 = tensor->src[1]; - const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; - const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - const int64_t ne20 = tensor->ne[0]; const int64_t ne21 = tensor->ne[1]; @@ -1801,15 +1887,11 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, return false; } - // make ggml_get_tensor_rank and QNN SDK happy + // make qnn_get_ggml_tensor_rank and QNN SDK happy if ((ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1)) { return false; } - if ((ne20 < 32) || (ne21 < 32) || (ne10 < 32)) { - return false; - } - int qtype = src0->type; if (tensor->op == GGML_OP_ADD) { return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || @@ -1837,75 +1919,32 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context *ctx, } } + static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - - qnn_instance * instance = nullptr; - std::string graph_name = "ggml_op_qnn_add"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; - - if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { - QNN_LOG_WARN("pls check why GGML tensor is null"); - return; - } + qnn_instance * instance = nullptr; + std::string graph_name = "ggml_op_qnn_add"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + CHECK_PARAMS(ctx, src0, src1, dst); tensor_0 = (Qnn_Tensor_t *) src0->extra; tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; - if ((nullptr == tensor_0) || (nullptr == tensor_1) || - (nullptr == tensor_2)) { - QNN_LOG_WARN("pls check why QNN tensor is null"); - return; - } - if (nullptr == ctx) { - QNN_LOG_WARN("pls check why backend ctx is null"); - return; - } instance = ctx->instance; - if (nullptr == instance) { - QNN_LOG_WARN("pls check why qnn instance is null"); - return; - } - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - - n_begin_time = ggml_time_us(); - - if (0) { - QNN_LOG_DEBUG("call %s\n", __func__); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" 
PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), - dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); - } + qnn_perf perf("ggml_qnn_add"); + perf.start(); + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; @@ -1947,36 +1986,39 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " "error = %d\n", graph_name.c_str(), error); - return; + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - ggml_get_tensor_data_size(src0)}; + qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - ggml_get_tensor_data_size(src1)}; + qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - ggml_get_tensor_data_size(dst)}; + qnn_get_ggml_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; @@ -1990,17 +2032,19 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } - error = - qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } auto graph_item = 
std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; @@ -2011,8 +2055,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src tensor_1 = std::get<2>(graph_item); tensor_2 = std::get<3>(graph_item); - // QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], - // src0->ne[2], src0->ne[3]); uint32_t dimensions_input_0[] = { (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; @@ -2024,38 +2066,61 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - ggml_get_tensor_data_size(src0)}; + qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - ggml_get_tensor_data_size(src1)}; + qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - ggml_get_tensor_data_size(dst)}; + qnn_get_ggml_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; - error = - qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, tensor_outputs,1, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } } +failure: + if (QNN_SUCCESS != error) { + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), + dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3]); + } + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ggml_qnn_add : %lld milliseconds\n", n_duration); + + perf.info(); } /* @@ -2074,69 +2139,32 @@ 
static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - - qnn_instance * instance = nullptr; - std::string graph_name = "ggml_op_qnn_mul_mat"; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - - Qnn_Param_t qnn_params[] = {}; - - enum ggml_op ggmlop = GGML_OP_MUL_MAT; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; - - if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { - QNN_LOG_WARN("pls check why GGML tensor is null"); - return; - } + qnn_instance * instance = nullptr; + std::string graph_name = "ggml_op_qnn_mul_mat"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_MUL_MAT; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + CHECK_PARAMS(ctx, src0, src1, dst); tensor_0 = (Qnn_Tensor_t *) src0->extra; tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; - if ((nullptr == tensor_0) || (nullptr == tensor_1) || - (nullptr == tensor_2)) { - QNN_LOG_WARN("pls check why QNN tensor is null"); - return; - } - if (nullptr == ctx) { - QNN_LOG_WARN("pls check why backend ctx is null"); - return; - } instance = ctx->instance; - if (nullptr == instance) { - QNN_LOG_WARN("pls check why qnn instance is null"); - return; - } - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + qnn_perf perf("ggml_qnn_mul_mat"); + perf.start(); - n_begin_time = ggml_time_us(); - QNN_LOG_DEBUG("call %s\n", __func__); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], - dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); + tensor_0 = (Qnn_Tensor_t *) src0->extra; + tensor_1 = (Qnn_Tensor_t *) src1->extra; + tensor_2 = (Qnn_Tensor_t *) dst->extra; + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; @@ -2178,36 +2206,39 @@ static void 
ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " "error = %d\n", graph_name.c_str(), error); - return; + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - ggml_get_tensor_data_size(src0)}; + qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - ggml_get_tensor_data_size(src1)}; + qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - ggml_get_tensor_data_size(dst)}; + qnn_get_ggml_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; @@ -2220,10 +2251,12 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, @@ -2231,6 +2264,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; @@ -2241,7 +2275,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, tensor_1 = std::get<2>(graph_item); tensor_2 = std::get<3>(graph_item); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); uint32_t dimensions_input_0[] = { (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; @@ -2252,41 +2285,60 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); 
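The tensor refresh seen throughout this function, and again in the graph-reuse path below, is always the same three steps: rebuild the uint32_t dimension array from ne[], refresh the rank, and point clientBuf at the ggml buffer. A sketch of a helper capturing that pattern, assuming only the qnn_get_ggml_tensor_rank()/qnn_get_ggml_tensor_data_size() helpers this patch renames to (bind_ggml_to_qnn itself is illustrative, not part of the patch):

    // illustrative: mirror a ggml tensor into an existing Qnn_Tensor_t
    static void bind_ggml_to_qnn(Qnn_Tensor_t * qt, const ggml_tensor * t,
                                 uint32_t dims[4]) {
        for (int i = 0; i < 4; i++) {
            dims[i] = (uint32_t) t->ne[i];    // ggml ne[] is int64_t
        }
        QNN_VER_PTR(*qt)->dimensions = dims;  // must outlive graph execution
        QNN_VER_PTR(*qt)->rank       = qnn_get_ggml_tensor_rank(t);
        QNN_VER_PTR(*qt)->clientBuf  = {t->data,
                                        qnn_get_ggml_tensor_data_size(t)};
    }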
QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - ggml_get_tensor_data_size(src0)}; + qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - ggml_get_tensor_data_size(src1)}; + qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - ggml_get_tensor_data_size(dst)}; + qnn_get_ggml_tensor_data_size(dst)}; Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; - error = - qnn_raw_interface.graphExecute(graph_handle, + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } } +failure: + if (QNN_SUCCESS != error) { + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], + dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3]); + } + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ggml_qnn_mul_mat : %lld milliseconds\n", - n_duration); - QNN_LOG_DEBUG("call %s done\n", __func__); + perf.info(); } // common function for GGML OPs using QNN API @@ -2296,10 +2348,6 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - qnn_instance * instance = nullptr; std::string qnn_graph_name = "ggml_qnn_graph"; std::string qnn_op_config_name = "ggml_qnn_op_config"; @@ -2308,73 +2356,39 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, Qnn_Tensor_t * tensor_0 = nullptr; Qnn_Tensor_t * tensor_1 = nullptr; Qnn_Tensor_t * tensor_2 = nullptr; - Qnn_Param_t qnn_params[] = {}; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; - if ((nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { - QNN_LOG_WARN("pls check why GGML tensor is null"); - return; - } + CHECK_PARAMS(ctx, src0, src1, dst); tensor_0 = (Qnn_Tensor_t *) src0->extra; 
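CHECK_PARAMS above replaces the repeated null checks that every op handler used to carry. Its definition is outside this excerpt; given the qnn_is_valid_params() helper introduced later in this series, a plausible shape is the following sketch (the actual macro in the tree may differ in detail):

    // assumed definition, mirroring the removed per-handler checks
    #define CHECK_PARAMS(ctx, src0, src1, dst)                          \
        do {                                                            \
            if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) {   \
                return;                                                 \
            }                                                           \
        } while (0)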
tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; - if ((nullptr == tensor_0) || (nullptr == tensor_1) || - (nullptr == tensor_2)) { - QNN_LOG_WARN("pls check why QNN tensor is null"); - return; - } - if (nullptr == ctx) { - QNN_LOG_WARN("pls check why backend ctx is null"); - return; - } instance = ctx->instance; - if (nullptr == instance) { - QNN_LOG_WARN("pls check why qnn instance is null"); + qnn_perf perf(ggml_op_name(ggmlop)); + perf.start(); + + qnn_op_name = qnn_opname_from_ggmlop(ggmlop); + if (nullptr == qnn_op_name) { + QNN_LOG_WARN("ggml op %d(%s) not supported currently", ggmlop, ggml_op_name(ggmlop)); return; } + + tensor_0 = (Qnn_Tensor_t *) src0->extra; + tensor_1 = (Qnn_Tensor_t *) src1->extra; + tensor_2 = (Qnn_Tensor_t *) dst->extra; + instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - qnn_op_name = qnn_opname_from_ggmlop(ggmlop); - if (nullptr == qnn_op_name) { - QNN_LOG_WARN( - "pls check why can not get QNN OP name with ggml op %d(%s)", ggmlop, - ggml_op_name(ggmlop)); - return; - } - - n_begin_time = ggml_time_us(); - - QNN_LOG_DEBUG("call %s\n", __func__); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, src0->type, ggml_type_name(src0->type), - src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], - src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, src1->type, ggml_type_name(src1->type), - src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], - src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 - " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], - dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); - QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); - QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); - QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + uint32_t dimensions_input_0[] = { (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; @@ -2413,37 +2427,40 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph " "name %s, error = %d\n", ggml_op_name(ggmlop), qnn_graph_name.c_str(), error); - return; + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - ggml_get_tensor_data_size(src0)}; + 
qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - ggml_get_tensor_data_size(src1)}; + qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - ggml_get_tensor_data_size(dst)}; + qnn_get_ggml_tensor_data_size(dst)}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; @@ -2456,10 +2473,12 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, @@ -2467,6 +2486,7 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; @@ -2477,7 +2497,6 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, tensor_1 = std::get<2>(graph_item); tensor_2 = std::get<3>(graph_item); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); uint32_t dimensions_input_0[] = { (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; @@ -2488,21 +2507,21 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->rank = qnn_get_ggml_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - ggml_get_tensor_data_size(src0)}; + qnn_get_ggml_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - ggml_get_tensor_data_size(src1)}; + qnn_get_ggml_tensor_data_size(src1)}; QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - ggml_get_tensor_data_size(dst)}; + qnn_get_ggml_tensor_data_size(dst)}; Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; @@ -2513,16 
+2532,36 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); + goto failure; } } +failure: + if (QNN_SUCCESS != error) { + QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); + QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1)); + QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2)); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, src0->type, ggml_type_name(src0->type), + src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], + src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, src1->type, ggml_type_name(src1->type), + src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], + src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 + " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], + dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); + QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3]); + } + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; - QNN_LOG_DEBUG("duration of ggml_qnn_%s : %lld milliseconds\n", - ggml_op_name(ggmlop), n_duration); - QNN_LOG_DEBUG("call %s done\n", __func__); + perf.info(); } static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, @@ -2829,44 +2868,6 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx, return true; } -struct ggml_backend_qnn_buffer_context { - ggml_backend_qnn_buffer_context(size_t device) - : device(device) - , name(GGML_QNN_NAME + std::to_string(device)) {} - - ~ggml_backend_qnn_buffer_context() { - if (buffer) { - free(buffer); - } - - for (auto * sub_buffer : sub_buffers) { - free(sub_buffer); - } - - for (auto * qnn_tensor : qnn_tensors) { - free_qnn_tensor(*qnn_tensor); - free(qnn_tensor); - } - - sub_buffers.clear(); - qnn_tensors.clear(); - } - void * buffer = nullptr; - - struct ggml_backend_qnn_context * backend_ctx = nullptr; - - size_t buffer_size = 0; - std::vector sub_buffers; - std::vector qnn_tensors; - size_t device; - std::string name; -}; - -struct ggml_backend_qnn_buffer_type_context { - size_t device; - std::string name; -}; - static const char * ggml_backend_qnn_buffer_get_name(ggml_backend_buffer_t buffer) { GGML_UNUSED(buffer); return "QNN"; @@ -2922,7 +2923,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t QNN_QUANTIZATION_ENCODING_UNDEFINED, {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = ggml_get_tensor_rank(tensor), + .rank = qnn_get_ggml_tensor_rank(tensor), .dimensions = dimensions, .memType = QNN_TENSORMEMTYPE_RAW, {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; @@ -3122,7 +3123,7 @@ GGML_CALL static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) { ggml_backend_qnn_context *ctx = (ggml_backend_qnn_context *) backend->context; - return (ggml_qnn_can_handle_op(ctx, op, true)); + return (ggml_qnn_can_handle_op(ctx, op, false)); } GGML_CALL static bool ggml_backend_qnn_offload_op(ggml_backend_t backend,const ggml_tensor * tensor) { @@ 
-3213,14 +3214,13 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { return nullptr; } - //ref:https://github.com/zhouwg/llama.cpp/pull/1 static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; static bool ggml_backend_qnn_buffer_type_initialized = false; if (!ggml_backend_qnn_buffer_type_initialized) { for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { auto & context = ggml_backend_qnn_buffer_type_contexts[i]; - context = { i, std::string(GGML_QNN_NAME) + std::to_string(i) }; + context = { i, std::string(QNN_BACKEND_NAME) + std::to_string(i) }; ggml_backend_qnn_buffer_types[i] = { /* .iface = */ { /* .get_name = */ ggml_backend_qnn_buffer_type_name, @@ -3285,10 +3285,10 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { } else { if (0 == setenv("LD_LIBRARY_PATH", path.c_str(), 1)) { QNN_LOG_INFO("%s backend setenv successfully\n", - get_qnn_backend_name(device)); + qnn_get_backend_name(device)); } else { QNN_LOG_ERROR("%s backend setenv failure\n", - get_qnn_backend_name(device)); + qnn_get_backend_name(device)); } } @@ -3298,7 +3298,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { if (0 != result) { QNN_LOG_WARN( "init qnn subsystem failed with qnn backend %s, pls check why\n", - get_qnn_backend_name(device)); + qnn_get_backend_name(device)); delete instance; return nullptr; } @@ -3309,7 +3309,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { return nullptr; } - std::string device_name = get_qnn_backend_name(device); + std::string device_name = qnn_get_backend_name(device); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); instance->init_qnn_graph(device_name.c_str(), false); g_qnn_mgr[device].instance = instance; From 5269e082aa479de382fefde7518a84036c1b6b7f Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Tue, 11 Jun 2024 23:05:00 +0800 Subject: [PATCH 14/16] ggml-qnn: refine ggml inference using QNN NPU --- ggml-qnn.cpp | 250 ++++++++++++------------ tests/ggml-qnn/ggml-qnn-ut-build-run.sh | 10 +- tests/ggml-qnn/ggml-qnn-ut.cpp | 42 ++-- 3 files changed, 149 insertions(+), 153 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 43a8fcd3ea8cb..4700e145112d6 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -55,6 +55,7 @@ #include "Saver/QnnSaver.h" #include "System/QnnSystemInterface.h" #include "HTP/QnnHtpDevice.h" +#include // ================================================================================================= // @@ -72,9 +73,16 @@ static int free_qnn_tensor(Qnn_Tensor_t & tensor); // self-defined macro / data structure // // ================================================================================================= -#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend +#ifdef NDEBUG +#define ENABLE_QNNBACKEND_DEBUG 0 // for troubleshooting QNN backend #define ENABLE_QNNSDK_LOG 0 // enable/disable QNN SDK's internal log #define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info +#else +#define ENABLE_QNNBACKEND_DEBUG 1 // for troubleshooting QNN backend +#define ENABLE_QNNSDK_LOG 1 // enable/disable QNN SDK's internal log +#define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info +#endif + #define QNN_LOGBUF_LEN 4096 #define QNN_BACKEND_NAME "qnn" @@ -393,7 +401,6 @@ static void qnn_internal_log(ggml_log_level level, const char * file, } } - static bool 
qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { @@ -438,8 +445,8 @@ class qnn_perf { void info() { _end_time = ggml_time_us(); - _duration = (_end_time - _begin_time) / 1000; - QNN_LOG_DEBUG("duration of %s : %lld milliseconds\n", _perf_name.c_str(), _duration); + _duration = (_end_time - _begin_time); + QNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); } private: @@ -473,15 +480,15 @@ enum qnn_sdk_profile_level { profile_detail = 2 }; -using _pfn_rpc_mem_init = void (*)(void); -using _pfn_rpc_mem_deinit = void (*)(void); -using _pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); -using _pfn_rpc_mem_free = void (*)(void *); -using _pfn_rpc_mem_to_fd = int (*)(void *); +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*) (int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); -using _pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); -using _pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); -using _pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); +using pfn_qnnsaver_initialize = decltype(QnnSaver_initialize); +using pfn_qnninterface_getproviders = decltype(QnnInterface_getProviders); +using pfn_qnnsysteminterface_getproviders = decltype(QnnSystemInterface_getProviders); #define QNN_VER_PTR(x) (&((x).v1)) #define RPCMEM_DEFAULT_FLAGS 1 @@ -702,7 +709,7 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; - Qnn_ScaleOffset_t **scaleOffset = & axis_scale_offset.scaleOffset; + Qnn_ScaleOffset_t ** scaleOffset = & axis_scale_offset.scaleOffset; size_t scaleOffsetSize = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); *scaleOffset = (Qnn_ScaleOffset_t *) malloc(scaleOffsetSize); memscpy(*scaleOffset, scaleOffsetSize, @@ -732,8 +739,8 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { uint32_t rank = QNN_TENSOR_GET_RANK(src); QNN_TENSOR_SET_RANK(dst, rank); - size_t dim_size = rank * sizeof(uint32_t); - uint32_t *dimensions = (uint32_t *) malloc(dim_size); + size_t dim_size = rank * sizeof(uint32_t); + uint32_t * dimensions = (uint32_t *) malloc(dim_size); if (dimensions == nullptr) { QNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying " "tensor %s\n", @@ -1072,26 +1079,26 @@ class qnn_instance { QNN_LOG_DEBUG("load rpcmem lib successfully\n"); set_rpcmem_initialized(true); } - __pfn_rpc_mem_init = reinterpret_cast<_pfn_rpc_mem_init>( + _pfn_rpc_mem_init = reinterpret_cast( dlsym(_rpc_lib_handle, "rpcmem_init")); - __pfn_rpc_mem_deinit = reinterpret_cast<_pfn_rpc_mem_deinit>( + _pfn_rpc_mem_deinit = reinterpret_cast( dlsym(_rpc_lib_handle, "rpcmem_deinit")); - __pfn_rpc_mem_alloc = reinterpret_cast<_pfn_rpc_mem_alloc>( + _pfn_rpc_mem_alloc = reinterpret_cast( dlsym(_rpc_lib_handle, "rpcmem_alloc")); - __pfn_rpc_mem_free = reinterpret_cast<_pfn_rpc_mem_free>( + _pfn_rpc_mem_free = reinterpret_cast( dlsym(_rpc_lib_handle, "rpcmem_free")); - __pfn_rpc_mem_to_fd = reinterpret_cast<_pfn_rpc_mem_to_fd>( + _pfn_rpc_mem_to_fd = reinterpret_cast( 
dlsym(_rpc_lib_handle, "rpcmem_to_fd")); - if (nullptr == __pfn_rpc_mem_alloc || nullptr == __pfn_rpc_mem_free || - nullptr == __pfn_rpc_mem_to_fd) { + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || + nullptr == _pfn_rpc_mem_to_fd) { QNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); dlclose(_rpc_lib_handle); return 9; } if (nullptr != - __pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy - __pfn_rpc_mem_init(); + _pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy + _pfn_rpc_mem_init(); std::vector temp_context_config; _qnn_interface.qnn_context_create( @@ -1124,7 +1131,6 @@ class qnn_instance { } _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - //TODO: faster approach to probe the accurate capacity of rpc ion memory size_t candidate_size = 0; uint8_t * rpc_buffer = nullptr; @@ -1145,6 +1151,16 @@ class qnn_instance { if (candidate_size > _rpcmem_capacity) _rpcmem_capacity = candidate_size; QNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + + if (0 != init_htp_perfinfra()) { + QNN_LOG_WARN("initialize HTP performance failure"); + } + if (0 != set_rpc_polling()) { + QNN_LOG_WARN("set RPC polling failure"); + } + if (0 != set_high_performance_mode()) { + QNN_LOG_WARN("set HTP high performance mode failure"); + } } QNN_LOG_DEBUG("leave qni_init\n"); @@ -1156,9 +1172,8 @@ class qnn_instance { int ret_status = 0; Qnn_ErrorHandle_t error = QNN_SUCCESS; - if (nullptr != - __pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy - __pfn_rpc_mem_deinit(); + if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's SoC equipped low-end phone happy + _pfn_rpc_mem_deinit(); if (dlclose(_rpc_lib_handle) != 0) { QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); @@ -1325,6 +1340,8 @@ class qnn_instance { if (error != QNN_SUCCESS) { QNN_LOG_WARN("failed to get qnn device infra\n"); return 1; + } else { + QNN_LOG_INFO("HTP backend perf_infrastructure creation ok\n"); } QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); @@ -1333,6 +1350,11 @@ class qnn_instance { uint32_t device_id = 0; uint32_t core_id = 0; htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) { + QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType); + } else { + QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType); + } _qnn_htp_perfinfra = htp_perfinfra; _qnn_power_configid = power_configid; @@ -1343,14 +1365,17 @@ class qnn_instance { if (_qnn_rpc_pollingtime > 0) { QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingTime; memset(&rpc_pollingTime, 0, sizeof(rpc_pollingTime)); - rpc_pollingTime.option = - QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + rpc_pollingTime.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; rpc_pollingTime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; - const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = { - &rpc_pollingTime, nullptr}; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_ControlLatency; + memset(&rpc_ControlLatency, 0, sizeof(rpc_ControlLatency)); + rpc_ControlLatency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; + rpc_ControlLatency.rpcControlLatencyConfig = 40; + + const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = {&rpc_pollingTime, &rpc_ControlLatency, nullptr}; 
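Note that setPowerConfig() consumes a null-terminated array of config pointers, which is why the polling-time and control-latency knobs above can be applied in a single call just below. A minimal standalone sketch of the same idiom, using only names that appear in this hunk (set_polling_us itself is a hypothetical helper):

    #include <cstring> // memset

    // sketch: apply one RPC polling-time config through the HTP perf infra
    static void set_polling_us(QnnHtpDevice_PerfInfrastructure_t * infra,
                               uint32_t power_configid, uint32_t polling_us) {
        QnnHtpPerfInfrastructure_PowerConfig_t cfg;
        memset(&cfg, 0, sizeof(cfg));
        cfg.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME;
        cfg.rpcPollingTimeConfig = polling_us;  // valid range is 0-10000 us
        const QnnHtpPerfInfrastructure_PowerConfig_t * cfgs[] = {&cfg, nullptr};
        infra->setPowerConfig(power_configid, cfgs);
    }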
if (_qnn_htp_perfinfra) { - _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, - powerConfigs); + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); } } return 0; @@ -1426,7 +1451,7 @@ class qnn_instance { } auto allocate_bytes = static_cast(bytes + alignment); - void * buf = __pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, + void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); if (buf == nullptr) { QNN_LOG_WARN("failed to allocate rpc memory\n"); @@ -1439,7 +1464,7 @@ class qnn_instance { _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; if (!status) { QNN_LOG_WARN("failed to allocate rpc memory\n"); - __pfn_rpc_mem_free(buf); + _pfn_rpc_mem_free(buf); } return aligned_buf; @@ -1451,7 +1476,7 @@ class qnn_instance { } else if (0 == _rpcmem_store_map.count(buf)) { QNN_LOG_WARN("no allocated tensor\n"); } else { - __pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); _rpcmem_store_map.erase(buf); } } @@ -1461,7 +1486,7 @@ class qnn_instance { if (!is_rpcmem_initialized()) { QNN_LOG_WARN("rpc memory not initialized\n"); } else { - mem_fd = __pfn_rpc_mem_to_fd(buf); + mem_fd = _pfn_rpc_mem_to_fd(buf); } return mem_fd; @@ -1560,7 +1585,7 @@ class qnn_instance { } auto * get_providers = - reinterpret_cast<_pfn_qnnsysteminterface_getproviders *>( + reinterpret_cast( dlsym(_system_lib_handle, "QnnSystemInterface_getProviders")); if (nullptr == get_providers) { QNN_LOG_WARN( @@ -1661,7 +1686,7 @@ class qnn_instance { return 1; } - auto get_providers = load_qnn_functionpointers<_pfn_qnninterface_getproviders *>( + auto get_providers = load_qnn_functionpointers( lib_handle, "QnnInterface_getProviders"); if (nullptr == get_providers) { QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", @@ -1805,11 +1830,11 @@ class qnn_instance { void * _rpc_lib_handle = nullptr; std::atomic_bool _rpcmem_initialized{false}; - _pfn_rpc_mem_alloc __pfn_rpc_mem_alloc; - _pfn_rpc_mem_free __pfn_rpc_mem_free; - _pfn_rpc_mem_to_fd __pfn_rpc_mem_to_fd; - _pfn_rpc_mem_init __pfn_rpc_mem_init; - _pfn_rpc_mem_deinit __pfn_rpc_mem_deinit; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; std::unordered_map _rpcmem_store_map; size_t _rpcmem_capacity = 512; @@ -1824,101 +1849,63 @@ class qnn_instance { static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, const struct ggml_tensor * tensor, bool b_dump_tensor_info) { - // only support the following 3 OPs currently - // provide a GENERAL approach could fix this problem in a standalone PR of refine ggml backend - // subsystem for mixed inference between CPU&GPU / CPU&NPU easily for ANY ggml backends - // which the backend's ggml_backend_xxx_buffer_is_host return true. 
-    // this approach could be found:
-    // https://github.com/ggerganov/llama.cpp/pull/7641
-    bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL)
-                         || (tensor->op == GGML_OP_MUL_MAT));
-    if (!supported_op) {
+    if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE ||
+        tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW ||
+        tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) {
         return false;
     }

     const struct ggml_tensor * src0 = tensor->src[0];
     const struct ggml_tensor * src1 = tensor->src[1];
+    if (nullptr == src0 || nullptr == src1) {
+        return false;
+    }
+
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
-    const int64_t ne20 = tensor->ne[0];
-    const int64_t ne21 = tensor->ne[1];
-
-    //TODO: support other quantized data type
-    if (ggml_is_quantized(src0->type)) {
-        if ((src0->type != GGML_TYPE_Q8_0) && (src0->type != GGML_TYPE_Q4_0)) {
-            return false;
-        }
-    }
-
-    if (b_dump_tensor_info) {
-        if (tensor->op == GGML_OP_MUL_MAT) {
-            QNN_LOG_DEBUG("GGML_OP_MUL_MAT");
-            QNN_LOG_DEBUG("op name:%s, tensor type:%s",
-                          ggml_op_name(tensor->op),
-                          ggml_type_name(tensor->type));
-            QNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type));
-            QNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type));
-            QNN_LOG_DEBUG("src0 %15s: type = %i (%5s) ne = %5" PRIi64
-                          " x %5" PRIi64 " x %5" PRIi64
-                          ", nb = (%5zi, %5zi, %5zi)\n",
-                          src0->name, src0->type, ggml_type_name(src0->type),
-                          src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0],
-                          src0->nb[1], src0->nb[2]);
-            QNN_LOG_DEBUG("src1 %15s: type = %i (%5s) ne = %5" PRIi64
-                          " x %5" PRIi64 " x %5" PRIi64
-                          ", nb = (%5zi, %5zi, %5zi)\n",
-                          src1->name, src1->type, ggml_type_name(src1->type),
-                          src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0],
-                          src1->nb[1], src1->nb[2]);
-            QNN_LOG_DEBUG(
-                " %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                tensor->name, tensor->type, ggml_type_name(tensor->type),
-                tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0],
-                tensor->nb[1], tensor->nb[2]);
-        }
-    }
-    if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE ||
-        tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW ||
-        tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) {
+    // make qnn_get_ggml_tensor_rank and QNN SDK happy
+    if (ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1) {
         return false;
     }

-    // make qnn_get_ggml_tensor_rank and QNN SDK happy
-    if ((ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1)) {
+    // TODO: support other GGML OPs using the QNN API
+    // a GENERAL approach could fix this problem in a standalone PR that refines the ggml backend
+    // subsystem for mixed inference between CPU&GPU / CPU&NPU for ANY ggml backend
+    // whose ggml_backend_xxx_buffer_is_host returns true.
+ // this approach could be found: + // https://github.com/ggerganov/llama.cpp/pull/7641 + bool supported_op = false; + supported_op = (tensor->op == GGML_OP_ADD); + supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT)); + if (!supported_op) { return false; } - int qtype = src0->type; - if (tensor->op == GGML_OP_ADD) { - return (qtype == GGML_TYPE_F32 || qtype == GGML_TYPE_F16 || - qtype == GGML_TYPE_Q8_0) && - (src1->type == GGML_TYPE_F32); + //TODO: support other quantized data type + if (ggml_is_quantized(src0->type)) { + if (src0->type != GGML_TYPE_Q8_0 && src0->type != GGML_TYPE_Q4_0) { + return false; + } } + int qtype = src0->type; if (tensor->op == GGML_OP_MUL) { return (qtype == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32); } if (tensor->op == GGML_OP_MUL_MAT) { - if (ctx->device == QNN_BACKEND_GGML) { - return (ne00 == ne10) && (src1->ne[2] % src0->ne[2] == 0) && - (src1->ne[3] % src0->ne[3] == 0); - } - if ((ctx->device == QNN_BACKEND_NPU) && (qtype == GGML_TYPE_Q8_0) && - (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32)) { + if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) { + return false; + } else { return true; } - if (ctx->device == QNN_BACKEND_CPU || ctx->device == QNN_BACKEND_GPU) { - return (ne00 == ne10) && (ne00 == ne01); - } - return false; } -} + return true; +} static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -1978,10 +1965,25 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; - QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); + QNN_LOG_INFO("graph name %s", graph_name.c_str()); + if (ctx->device == QNN_BACKEND_NPU) { + QnnHtpGraph_CustomConfig_t custom_config; + custom_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + custom_config.numHvxThreads = 8; + + QnnGraph_Config_t graph_config; + graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_config.customConfig = &custom_config; + const QnnGraph_Config_t * p_graphconfig[] = {&graph_config, NULL}; + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, + &graph_handle); + } else { + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); + } + if (QNN_SUCCESS != error) { QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " "error = %d\n", @@ -2112,8 +2114,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); } QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; @@ -2198,7 +2198,7 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + src0->name + "_" + src1->name; - QNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + QNN_LOG_INFO("graph name %s", graph_name.c_str()); error = qnn_raw_interface.graphCreate( instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, 
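For readers following the control flow in the ggml_qnn_add / ggml_qnn_mul_mat hunks around this point: a QNN graph is created, finalized and cached on the first call for a given op and tensor signature, and later calls only refresh the buffers and re-execute. A reduced sketch of that build-once cache, with opaque handles standing in for the QNN types (the helper name is illustrative, not part of the backend):

    #include <map>
    #include <string>
    #include <tuple>

    using graph_handle_t  = void *;
    using tensor_handle_t = void *;
    using graph_record_t  = std::tuple<graph_handle_t, tensor_handle_t,
                                       tensor_handle_t, tensor_handle_t>;

    // keyed like instance->_qnn_graph_map: one record per op + tensor-name signature
    static std::map<std::string, graph_record_t> s_graph_cache;

    graph_handle_t get_or_create_graph(const std::string & key) {
        auto it = s_graph_cache.find(key);
        if (it != s_graph_cache.end()) {
            return std::get<0>(it->second); // hit: skip graphCreate/graphFinalize
        }
        graph_handle_t handle = nullptr;    // graphCreate + tensor setup + finalize go here
        s_graph_cache[key] = std::make_tuple(handle, nullptr, nullptr, nullptr);
        return handle;
    }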
&graph_handle); @@ -2331,8 +2331,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); - QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], - src0->ne[3]); } QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; @@ -2894,7 +2892,6 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t Qnn_ErrorHandle_t error = QNN_SUCCESS; ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; - static int idx = 0; char tensor_name[GGML_MAX_NAME] = {0}; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); @@ -3061,7 +3058,7 @@ GGML_CALL static const char * ggml_backend_qnn_name(ggml_backend_t backend) { GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { QNN_LOG_INFO("enter %s", __func__); ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - QNN_LOG_DEBUG("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); + QNN_LOG_INFO("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); qnn_instance * instance = (qnn_instance *)g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { @@ -3073,7 +3070,7 @@ GGML_CALL static void ggml_backend_qnn_free(ggml_backend_t backend) { auto & graph_item = graph_it->second; Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); GGML_UNUSED(graph_handle); - QNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str()); + QNN_LOG_INFO("graph type:%s", graph_it->first.c_str()); } instance->_qnn_graph_map.clear(); @@ -3104,7 +3101,7 @@ GGML_CALL static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backe params.type = GGML_TASK_TYPE_COMPUTE; params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor *node = cgraph->nodes[i]; + ggml_tensor * node = cgraph->nodes[i]; if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { @@ -3213,7 +3210,6 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { device, GGML_QNN_MAX_DEVICES - 1); return nullptr; } - static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; static bool ggml_backend_qnn_buffer_type_initialized = false; diff --git a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh index 192f2f4bda2f5..4c21be5a41fa2 100755 --- a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh +++ b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh @@ -12,6 +12,8 @@ ANDROID_PLATFORM=android-34 GGML_QNN_UT=ggml-qnn-ut REMOTE_PATH=/data/local/tmp/ +BUILDTYPE=Debug +BUILDTYPE=Release function dump_vars() @@ -70,7 +72,7 @@ function check_and_download_ndk() function build_arm64 { - cmake -H. -B./out/arm64-v8a -DTARGET_NAME=${GGML_QNN_UT} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH} + cmake -H. 
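One thing worth flagging in the ggml_backend_qnn_buffer_init_tensor hunk above: it deletes the `static int idx = 0;` declaration while the `snprintf(..., idx++)` that consumes it survives, which cannot compile until a later hunk in this series reinstates the counter. Where the counter stays, an atomic variant keeps the generated names unique even if buffers are ever initialized from more than one thread; a small sketch (the function name is illustrative):

    #include <atomic>
    #include <cstddef>
    #include <cstdio>

    // produces "tensor_0000", "tensor_0001", ... like the backend's naming scheme
    const char * next_tensor_name(char * buf, size_t buflen) {
        static std::atomic<int> counter{0};
        std::snprintf(buf, buflen, "tensor_%04d", counter.fetch_add(1));
        return buf;
    }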
-B./out/arm64-v8a -DTARGET_NAME=${GGML_QNN_UT} -DCMAKE_BUILD_TYPE=${BUILDTYPE} -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=${ANDROID_PLATFORM} -DANDROID_NDK=${ANDROID_NDK} -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DQNN_SDK_PATH=${QNN_SDK_PATH} cd ./out/arm64-v8a make @@ -166,9 +168,9 @@ function show_usage() echo "Usage:" echo " $0 build (build Android command line UT program)" echo " $0 updateqnnlibs (upload the latest QNN libs to Android phone)" - echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" - echo " $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" - echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU)" + echo " $0 GGML_OP_ADD 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" + echo " $0 GGML_OP_MUL 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" + echo " $0 GGML_OP_MUL_MAT 0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)" echo -e "\n\n\n" } diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index eb072beae6bd4..9af433ceb6690 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -72,14 +72,12 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { - //for Android command line application or WoA printf("%s\n", s_ggml_qnn_log_internal_buf); } va_end(args); } } - static const char * get_qnn_backend_name(int n_backend_type) { switch (n_backend_type) { case 0: @@ -95,7 +93,6 @@ static const char * get_qnn_backend_name(int n_backend_type) { } } - static bool ggml_graph_compute_helper( struct ggml_backend * backend, struct ggml_cgraph * graph, @@ -123,26 +120,25 @@ static bool ggml_graph_compute_helper( } #endif - //a new approch of mixed inference if (nullptr != backend) return ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS; else return ggml_graph_compute(graph, &plan); } - #define QK8_0 32 + typedef struct { uint16_t d; // delta int8_t qs[QK8_0]; // quants } block_q8_0; - static inline float ggml_compute_fp16_to_fp32(uint16_t h) { __fp16 tmp; memcpy(&tmp, &h, sizeof(uint16_t)); return (float)tmp; } + #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) static void tensor_dump(const ggml_tensor * tensor, const char * name) { @@ -245,7 +241,6 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) { } } - static uint32_t get_tensor_rank(const ggml_tensor * tensor) { uint32_t rank = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { @@ -256,7 +251,6 @@ static uint32_t get_tensor_rank(const ggml_tensor * tensor) { return rank; } - static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); size_t n_dims = get_tensor_rank(tensor); @@ -270,7 +264,6 @@ static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { return ggml_nbytes(tensor); } - //ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20 static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { // static RNG initialization (revisit if n_threads stops being constant) @@ -305,8 +298,11 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m t.join(); } if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { - 
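The unit test's ggml_compute_fp16_to_fp32() above leans on the `__fp16` compiler extension, which is fine for the arm64-v8a target but not portable. A standalone bit-level conversion, assuming IEEE binary16 storage, that covers normals, subnormals, infinities and NaNs:

    #include <cstdint>
    #include <cstring>

    float fp16_to_fp32(uint16_t h) {
        uint32_t sign = (uint32_t)(h & 0x8000) << 16;
        uint32_t exp  = (h >> 10) & 0x1F;
        uint32_t mant = h & 0x3FF;
        uint32_t bits;
        if (exp == 0) {
            if (mant == 0) {
                bits = sign;                              // +/- zero
            } else {
                // subnormal half: renormalize the mantissa
                exp = 127 - 15 + 1;
                while ((mant & 0x400) == 0) { mant <<= 1; --exp; }
                mant &= 0x3FF;
                bits = sign | (exp << 23) | (mant << 13);
            }
        } else if (exp == 0x1F) {
            bits = sign | 0x7F800000 | (mant << 13);      // inf / NaN
        } else {
            bits = sign | ((exp - 15 + 127) << 23) | (mant << 13);
        }
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    }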
//ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); +#ifdef GGML_USE_QNN memcpy((char*)tensor->data, data.data(), size * sizeof(float)); +#else + ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); +#endif } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); std::vector dataq(ggml_row_size(tensor->type, size)); @@ -321,18 +317,23 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m } ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im); GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size())); - //ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); +#ifdef GGML_USE_QNN memcpy((char*)tensor->data, dataq.data(), dataq.size()); +#else + ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); +#endif } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { // This is going to create some weird integers though. - //ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); +#ifdef GGML_USE_QNN memcpy((char*)tensor->data, data.data(), ggml_nbytes(tensor)); +#else + ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); +#endif } else { GGML_ASSERT(false); } } - //ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L310 static void initialize_tensors(ggml_context * ctx) { for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { @@ -340,19 +341,17 @@ static void initialize_tensors(ggml_context * ctx) { } } - static void show_usage() { printf(" " \ "\nUsage: test_qnn_ops [options]\n" \ "\n" \ "Options:\n" \ " -t GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MULMAT\n" \ - " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU)\n" \ + " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(ggml)\n" \ " ?/h print usage infomation\n\n" ); } - static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { int64_t n_begin_time = 0LL; int64_t n_end_time = 0LL; @@ -369,16 +368,15 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { ggml_backend_t backend = nullptr; ggml_backend_buffer_t buffer= nullptr; - ggml_type qtype = GGML_TYPE_I8; - qtype = GGML_TYPE_F32; + ggml_type qtype = GGML_TYPE_I8; qtype = GGML_TYPE_F16; qtype = GGML_TYPE_Q8_0; + qtype = GGML_TYPE_F32; std::vector work_buffer; QNN_LOG_DEBUG("enter qnn_ggml_op\n"); QNN_LOG_DEBUG("ggml op:%d(%s)\n", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); - n_begin_time = ggml_time_us(); srand(time(NULL)); @@ -473,7 +471,6 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { initialize_tensors(ctx); } ggml_set_f32(src1, (rand() % 100 + 1)); - //ggml_set_f32(dst, 0.0f); } ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); @@ -501,13 +498,13 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { ggml_free(ctx); ggml_backend_buffer_free(buffer); ggml_backend_free(backend); + n_end_time = ggml_time_us(); n_duration = (n_end_time - n_begin_time) / 1000; QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), get_qnn_backend_name(n_backend_type), n_duration); return 0; } - int main(int argc, char * argv[]) { int num_threads = 4; int n_backend_type = 
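The `#ifdef GGML_USE_QNN` branches above encode a general rule: when the backend buffer is host-visible, a plain memcpy into tensor->data is enough, otherwise the data has to go through the backend's own setter. A hypothetical helper expressing that split (the parameter names and the callback are illustrative, not ggml API):

    #include <cstddef>
    #include <cstring>

    template <typename setter_fn_t>
    void upload_tensor_data(void * host_dst, const void * src, size_t nbytes,
                            bool buffer_is_host, setter_fn_t backend_set) {
        if (buffer_is_host) {
            std::memcpy(host_dst, src, nbytes); // the GGML_USE_QNN memcpy branch
        } else {
            backend_set(src, nbytes);           // the ggml_backend_tensor_set branch
        }
    }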
QNN_BACKEND_CPU; @@ -531,7 +528,7 @@ int main(int argc, char * argv[]) { } else if (0 == strcmp(argv[i], "-b")) { if (i + 1 < argc) { int backend = atoi(argv[i + 1]); - if (backend <= QNN_BACKEND_NPU) + if (backend <= QNN_BACKEND_GGML) n_backend_type = backend; else { show_usage(); @@ -549,5 +546,6 @@ int main(int argc, char * argv[]) { QNN_LOG_DEBUG("backend %d, ggml op:%d(%s)", n_backend_type, n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); qnn_op_ut(num_threads, n_backend_type, n_ggml_op_type); + return 0; } From faaa86b7e4925c0ea38480cc1b88e1a52097e221 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Wed, 12 Jun 2024 16:30:50 +0800 Subject: [PATCH 15/16] ggml-qnn: refine ggml inference using QNN NPU --- ggml-qnn.cpp | 668 ++++++++++++++++++++++++--------- tests/ggml-qnn/CMakeLists.txt | 8 +- tests/ggml-qnn/ggml-qnn-ut.cpp | 3 +- 3 files changed, 507 insertions(+), 172 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index 4700e145112d6..f59c54fcacd97 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -1001,12 +1001,10 @@ class qnn_instance { _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - _qnn_interface.qnn_log_create(qnn_sdk_logcallback, _qnn_log_level, - &_qnn_log_handle); + _qnn_interface.qnn_log_create(qnn_sdk_logcallback, _qnn_log_level, &_qnn_log_handle); if (nullptr == _qnn_log_handle) { - QNN_LOG_WARN( - "why failed to initialize qnn log\n"); // NPU backend not work on - // Qualcomm SoC equipped low-end phone + // NPU backend not work on Qualcomm SoC equipped low-end phone + QNN_LOG_WARN("why failed to initialize qnn log\n"); return 4; } else { QNN_LOG_DEBUG("initialize qnn log successfully\n"); @@ -1025,23 +1023,62 @@ class qnn_instance { } if (nullptr != _qnn_raw_interface.propertyHasCapability) { - auto qnnStatus = + Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); - if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) { + if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { QNN_LOG_WARN("device property is not supported\n"); } - if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnStatus) { + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { QNN_LOG_WARN("device property is not known to backend\n"); } } - Qnn_ErrorHandle_t qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, - &_qnn_device_handle); + Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; + if (_backend_name.find("Htp") != std::variant_npos) { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = { }; + for (int i = 0; i < p_info->v1.numHwDevices; i++) { + QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, + infos[i].v1.deviceType, infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; + QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
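On the `-b` handling in main() above: the bound check was widened from QNN_BACKEND_NPU to QNN_BACKEND_GGML to admit the new ggml fallback, but only the upper bound is tested, so a negative value from atoi would slip through. A sketch with both ends checked (the enum values are assumed to match the script's 0/1/2/3 convention):

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    enum backend_index { BACKEND_QNN_CPU = 0, BACKEND_QNN_GPU, BACKEND_QNN_NPU, BACKEND_GGML };

    int parse_backend_arg(int argc, char ** argv, int fallback) {
        for (int i = 1; i + 1 < argc; ++i) {
            if (std::strcmp(argv[i], "-b") == 0) {
                int b = std::atoi(argv[i + 1]);
                if (b >= BACKEND_QNN_CPU && b <= BACKEND_GGML) {
                    return b;            // in range: accept
                }
                std::fprintf(stderr, "invalid backend index %d\n", b);
                return fallback;         // out of range: keep the default
            }
        }
        return fallback;
    }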
"ON_CHIP" : ""); + QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", \ + chipinfo.socModel, qnn_get_chipset_desc(chipinfo.socModel), \ + htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize); + g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + + QnnHtpDevice_CustomConfig_t soc_customconfig; + soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + soc_customconfig.socModel = chipinfo.socModel; + QnnDevice_Config_t soc_devconfig; + soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + soc_devconfig.customConfig = &soc_customconfig; + + QnnHtpDevice_CustomConfig_t arch_customconfig; + arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + arch_customconfig.arch.arch = chipinfo.arch; + arch_customconfig.arch.deviceId = 0; // Id of device to be used. If single device is used by default 0. + QnnDevice_Config_t arch_devconfig; + arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + arch_devconfig.customConfig = &arch_customconfig; + + const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, NULL}; + qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); + } else { + qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); + } if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { QNN_LOG_WARN("failed to create QNN device\n"); } else { - QNN_LOG_INFO("create device successfully\n"); + QNN_LOG_INFO("create QNN device successfully\n"); } if (qnn_sdk_profile_level::profile_off != _profile_level) { @@ -1096,9 +1133,9 @@ class qnn_instance { return 9; } - if (nullptr != - _pfn_rpc_mem_init) // make Qualcomm's SoC equipped low-end phone happy + if (nullptr != _pfn_rpc_mem_init) { // make Qualcomm's SoC equipped low-end phone happy _pfn_rpc_mem_init(); + } std::vector temp_context_config; _qnn_interface.qnn_context_create( @@ -1113,32 +1150,14 @@ class qnn_instance { } if (_backend_name.find("Htp") != std::variant_npos) { - const QnnDevice_PlatformInfo_t * p_info = nullptr; - _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); - QNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; - for (int i = 0; i < p_info->v1.numHwDevices; i++) { - QNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, - infos[i].v1.deviceType, infos[i].v1.numCores); - QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; - QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; - QNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); - QNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", \ - chipinfo.socModel, qnn_get_chipset_desc(chipinfo.socModel), \ - htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize); - g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; - } - _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - //TODO: faster approach to probe the accurate capacity of rpc ion memory size_t candidate_size = 0; uint8_t * rpc_buffer = nullptr; - const int SIZE_IN_MB = (1 << 20); + const int size_in_mb = (1 << 20); size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * SIZE_IN_MB, 4)); + rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, 4)); if (nullptr == rpc_buffer) { QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); break; @@ -1150,7 +1169,7 @@ class qnn_instance { } if (candidate_size > _rpcmem_capacity) _rpcmem_capacity = candidate_size; - QNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity); if (0 != init_htp_perfinfra()) { QNN_LOG_WARN("initialize HTP performance failure"); @@ -1181,6 +1200,10 @@ class qnn_instance { QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); } + if (_backend_name.find("Htp") != std::variant_npos) { + _qnn_htp_perfinfra->destroyPowerConfigId(_qnn_power_configid); + } + if (nullptr != _qnn_context_handle) { error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); @@ -1239,6 +1262,9 @@ class qnn_instance { return ret_status; } + //keep it for further usage of offload the entire cgraph to a single QNN DAG directly + //which was used in Qualcomm's dedicated AI technology +#if 0 int init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation = true, const QnnGraph_Config_t ** graph_configs = nullptr) { @@ -1288,6 +1314,7 @@ class qnn_instance { return 0; } +#endif const qnn_interface & get_qnn_interface() { if (!_qnn_interface.is_loaded()) { @@ -1362,70 +1389,86 @@ class qnn_instance { } int set_rpc_polling() { - if (_qnn_rpc_pollingtime > 0) { - QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingTime; - memset(&rpc_pollingTime, 0, sizeof(rpc_pollingTime)); - rpc_pollingTime.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; - rpc_pollingTime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; - - QnnHtpPerfInfrastructure_PowerConfig_t rpc_ControlLatency; - memset(&rpc_ControlLatency, 0, sizeof(rpc_ControlLatency)); - rpc_ControlLatency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; - rpc_ControlLatency.rpcControlLatencyConfig = 40; - - const QnnHtpPerfInfrastructure_PowerConfig_t * powerConfigs[] = {&rpc_pollingTime, &rpc_ControlLatency, nullptr}; - if (_qnn_htp_perfinfra) { - _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs); + if (_qnn_htp_perfinfra) { + QnnHtpPerfInfrastructure_PowerConfig_t rpc_polling_time; + memset(&rpc_polling_time, 0, sizeof(rpc_polling_time)); + rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + //use rpc polling time recommended 0-10000 us + rpc_polling_time.rpcPollingTimeConfig = 9999; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_control_latency; + memset(&rpc_control_latency, 0, 
sizeof(rpc_control_latency)); + rpc_control_latency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; + //use rpc control latency recommended 100 us, refer hexagon sdk + rpc_control_latency.rpcControlLatencyConfig = 100; + + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { + &rpc_polling_time, + &rpc_control_latency, + nullptr}; + Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig( + _qnn_power_configid, + power_configs); + if (qnn_status != QNN_SUCCESS) { + QNN_LOG_WARN("set htp perf failed\n"); + } else { + QNN_LOG_INFO("set htp perf ok\n"); } + } else { + QNN_LOG_WARN("can't set htp perf\n"); } + return 0; } int set_high_performance_mode() { if (nullptr == _qnn_htp_perfinfra) { - QNN_LOG_DEBUG("perf intra is null\n"); + QNN_LOG_WARN("perf intra is null\n"); return 1; } - QnnHtpPerfInfrastructure_PowerConfig_t powerConfig; - memset(&powerConfig, 0, sizeof(powerConfig)); - powerConfig.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - powerConfig.dcvsV3Config.dcvsEnable = 0; - powerConfig.dcvsV3Config.setDcvsEnable = 1; - powerConfig.dcvsV3Config.contextId = _qnn_power_configid; - powerConfig.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; - powerConfig.dcvsV3Config.setSleepLatency = - 1; // true to consider Latency parameter otherwise False - powerConfig.dcvsV3Config.setBusParams = - 1; // true to consider Bus parameter otherwise False - powerConfig.dcvsV3Config.setCoreParams = - 1; // true to consider Core parameter otherwise False - powerConfig.dcvsV3Config.sleepDisable = - 0; // true to consider sleep/LPM modes, False to enable - powerConfig.dcvsV3Config.setSleepDisable = - 0; // true to consider sleep disable/enable parameter otherwise False set sleep latency parameter - uint32_t latencyValue = 40; - powerConfig.dcvsV3Config.sleepLatency = - latencyValue; // range 40-2000 micro sec + QnnHtpPerfInfrastructure_PowerConfig_t power_config; + memset(&power_config, 0, sizeof(power_config)); + power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; + power_config.dcvsV3Config.dcvsEnable = 0; + power_config.dcvsV3Config.setDcvsEnable = 1; + power_config.dcvsV3Config.contextId = _qnn_power_configid; + power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + power_config.dcvsV3Config.setSleepLatency = + 1; // true to consider Latency parameter otherwise false + power_config.dcvsV3Config.sleepLatency = 10; + power_config.dcvsV3Config.setBusParams = + 1; // true to consider Bus parameter otherwise false + power_config.dcvsV3Config.setCoreParams = + 1; // true to consider Core parameter otherwise false + power_config.dcvsV3Config.sleepDisable = + 1; // true to consider sleep/LPM modes, false to enable + power_config.dcvsV3Config.setSleepDisable = + 1; // true to consider sleep disable/enable parameter otherwise false set sleep latency parameter // set Bus Clock Parameters - powerConfig.dcvsV3Config.busVoltageCornerMin = + power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.busVoltageCornerTarget = + power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - powerConfig.dcvsV3Config.busVoltageCornerMax = + power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; // set Core Clock Parameters - powerConfig.dcvsV3Config.coreVoltageCornerMin = + power_config.dcvsV3Config.coreVoltageCornerMin = 
             DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
-        powerConfig.dcvsV3Config.coreVoltageCornerTarget =
+        power_config.dcvsV3Config.coreVoltageCornerTarget =
             DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
-        powerConfig.dcvsV3Config.coreVoltageCornerMax =
+        power_config.dcvsV3Config.coreVoltageCornerMax =
             DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;

         // set power config with different performance parameters
-        const QnnHtpPerfInfrastructure_PowerConfig_t *powerConfigs[] = {
-            &powerConfig, nullptr};
-
-        _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, powerConfigs);
+        const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {
+            &power_config, nullptr};
+        Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS;
+        qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs);
+        if (qnn_status != QNN_SUCCESS) {
+            QNN_LOG_WARN("set htp high performance mode failed\n");
+        } else {
+            QNN_LOG_INFO("set htp high performance mode ok\n");
+        }

         return 0;
     }
@@ -1505,7 +1548,7 @@ class qnn_instance {

         if (is_rpcmem_allocated(p_data)) {
             QNN_LOG_WARN("rpc memory already allocated\n");
-            // return 3;
+            return 3;
         }
         if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) {
             QNN_LOG_WARN("tensor %s has been registered shared memory\n",
@@ -1518,7 +1561,7 @@ class qnn_instance {
             QNN_LOG_WARN("failed to get file descriptor\n");
             return 5;
         }
-        QNN_LOG_DEBUG("mem_fd %d\n", mem_fd);
+        QNN_LOG_INFO("mem_fd %d\n", mem_fd);
         Qnn_MemDescriptor_t descriptor = {{QNN_VER_PTR(*p_tensor)->rank,
                                            QNN_VER_PTR(*p_tensor)->dimensions,
                                            nullptr},
@@ -1538,11 +1581,24 @@ class qnn_instance {
                          (QNN_VER_PTR(*p_tensor)->name));
         }
         QNN_VER_PTR(*p_tensor)->memHandle = handle;
-        _qnn_mem_set.insert(handle);
+        _qnn_mem_set.insert((std::pair<void *, Qnn_MemHandle_t>(p_data, handle)));

         return 0;
     }

+    void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) {
+        for (std::unordered_map<void *, Qnn_MemHandle_t>::iterator it = _qnn_mem_set.begin();
+             it != _qnn_mem_set.end();
+             it++) {
+            if (it->second == mem_handle) {
+                return it->first;
+            }
+        }
+        QNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle);
+        return nullptr;
+    }
+
     void unregister_rpcmem() {
         Qnn_ErrorHandle_t error = QNN_SUCCESS;
@@ -1550,7 +1606,10 @@
             QNN_LOG_WARN("no rpcmem registered\n");
         }

-        for (auto & mem_handle : _qnn_mem_set) {
+        for (std::unordered_map<void *, Qnn_MemHandle_t>::iterator it = _qnn_mem_set.begin();
+             it != _qnn_mem_set.end();
+             it++) {
+            Qnn_MemHandle_t mem_handle = it->second;
             error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1);
             if (error != QNN_SUCCESS) {
                 QNN_LOG_WARN("failed to unregister shared memory, error %d\n",
@@ -1561,7 +1620,7 @@
     }

     bool is_rpcmem_allocated(void * buf) {
-        return _rpcmem_store_map.count(buf) != 0U;
+        return _qnn_mem_set.count(buf) != 0U;
     }

@@ -1686,8 +1745,9 @@ class qnn_instance {
             return 1;
         }

-        auto get_providers = load_qnn_functionpointers<pfn_qnninterface_getproviders *>(
-            lib_handle, "QnnInterface_getProviders");
+        auto get_providers =
+            load_qnn_functionpointers<pfn_qnninterface_getproviders *>(
+                lib_handle, "QnnInterface_getProviders");
         if (nullptr == get_providers) {
             QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s",
                          dlerror());
@@ -1786,7 +1846,7 @@ class qnn_instance {
   private:
     std::string _lib_path;
     std::string _backend_name;
-    std::string _model_name; // prebuilt QNN model name, not used currently
+    std::string _model_name; // Qualcomm's dedicated prebuilt model name, keep it for further usage
     BackendIdType _backend_id;

     bool _debug_tensor = false;
@@ -1816,12 +1876,11 @@ class qnn_instance {
     QnnHtpDevice_PerfInfrastructure_t *
_qnn_htp_perfinfra = nullptr; uint32_t _qnn_power_configid = 1; - uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing QNN_INTERFACE_VER_TYPE _qnn_raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; - std::unordered_set _qnn_mem_set; + std::unordered_map _qnn_mem_set; std::mutex _init_mutex; std::unordered_map _loaded_lib_handle; @@ -1898,9 +1957,8 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, if (tensor->op == GGML_OP_MUL_MAT) { if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) { - return false; - } else { - return true; + //make mul_mat with QNN RPC happy + //return false; } } @@ -1964,17 +2022,29 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + - src0->name + "_" + src1->name; + "_" + src0->name + "_" + src1->name; QNN_LOG_INFO("graph name %s", graph_name.c_str()); if (ctx->device == QNN_BACKEND_NPU) { - QnnHtpGraph_CustomConfig_t custom_config; - custom_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - custom_config.numHvxThreads = 8; - - QnnGraph_Config_t graph_config; - graph_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_config.customConfig = &custom_config; - const QnnGraph_Config_t * p_graphconfig[] = {&graph_config, NULL}; + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + /* + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC + */ + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL}; error = qnn_raw_interface.graphCreate( instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, &graph_handle); @@ -1989,7 +2059,21 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src "error = %d\n", graph_name.c_str(), error); goto failure; + } else { + QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); } + + if (ctx->device == QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); @@ -2006,13 +2090,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src goto failure; } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - 
qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; @@ -2023,6 +2100,46 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + if (ctx->device != QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + qnn_get_ggml_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + qnn_get_ggml_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + qnn_get_ggml_tensor_data_size(dst)}; + } else { + uint8_t * qnn_buffer_0 = nullptr; + uint8_t * qnn_buffer_1 = nullptr; + uint8_t * qnn_buffer_2 = nullptr; + qnn_instance * instance = ctx->instance; + + qnn_buffer_0 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src0), 4)); + if (nullptr == qnn_buffer_0) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_0, tensor_0); + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + + qnn_buffer_1 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src1), 4)); + if (nullptr == qnn_buffer_1) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_1, tensor_1); + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + + qnn_buffer_2 = static_cast(instance->alloc_rpcmem(ggml_nbytes(dst), 4)); + if (nullptr == qnn_buffer_2) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_2, tensor_2); + } + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; Qnn_OpConfig_t op_config = { @@ -2048,6 +2165,12 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("error = %d\n", error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_2)->memHandle)); + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { @@ -2067,13 +2190,6 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; @@ -2084,6 +2200,25 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + if (ctx->device != QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->clientBuf = 
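The NPU-only branches above all follow one staging pattern: copy each ggml tensor into a registered rpcmem buffer before graphExecute, then copy the output staging buffer back into dst->data afterwards. A self-contained sketch of that round trip, with std::vector standing in for the shared ION buffers and a callback standing in for graph execution:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    struct staged_buffers {
        std::vector<uint8_t> in0, in1, out;
    };

    void staged_execute(const uint8_t * src0, size_t n0,
                        const uint8_t * src1, size_t n1,
                        uint8_t * dst, size_t ndst,
                        void (*run_graph)(staged_buffers &)) {
        staged_buffers s;
        s.in0.assign(src0, src0 + n0); // memcpy into the "shared" input buffers
        s.in1.assign(src1, src1 + n1);
        s.out.resize(ndst);
        run_graph(s);                  // graphExecute runs against these buffers
        std::memcpy(dst, s.out.data(), ndst); // copy the result back to the ggml tensor
    }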
{src0->data, + qnn_get_ggml_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + qnn_get_ggml_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + qnn_get_ggml_tensor_data_size(dst)}; + } else { + uint8_t * qnn_buffer_0 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_0)->memHandle)); + if (nullptr != qnn_buffer_0) + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + + uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_1)->memHandle)); + if (nullptr != qnn_buffer_1) + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2, @@ -2093,7 +2228,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QNN_LOG_INFO("error = %d\n", error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_2)->memHandle)); + if (nullptr != qnn_buffer_2) + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } } + failure: if (QNN_SUCCESS != error) { QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); @@ -2197,17 +2340,55 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, if (!graph_initialized) { graph_name = graph_name + "_" + std::to_string(ctx->threads) + - src0->name + "_" + src1->name; + "_" + src0->name + "_" + src1->name; QNN_LOG_INFO("graph name %s", graph_name.c_str()); - error = qnn_raw_interface.graphCreate( - instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, - &graph_handle); + if (ctx->device == QNN_BACKEND_NPU) { + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + /* + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC + */ + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL}; + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, + &graph_handle); + } else { + error = qnn_raw_interface.graphCreate( + instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, + &graph_handle); + } if (QNN_SUCCESS != error) { QNN_LOG_INFO("can't create qnn graph handle with graph name %s, " "error = %d\n", graph_name.c_str(), error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + 
QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); @@ -2224,13 +2405,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, goto failure; } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; @@ -2241,6 +2415,46 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + if (ctx->device != QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + qnn_get_ggml_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + qnn_get_ggml_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + qnn_get_ggml_tensor_data_size(dst)}; + } else { + uint8_t * qnn_buffer_0 = nullptr; + uint8_t * qnn_buffer_1 = nullptr; + uint8_t * qnn_buffer_2 = nullptr; + qnn_instance * instance = ctx->instance; + + qnn_buffer_0 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src0), 4)); + if (nullptr == qnn_buffer_0) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_0, tensor_0); + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + + qnn_buffer_1 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src1), 4)); + if (nullptr == qnn_buffer_1) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_1, tensor_1); + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + + qnn_buffer_2 = static_cast(instance->alloc_rpcmem(ggml_nbytes(dst), 4)); + if (nullptr == qnn_buffer_2) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_2, tensor_2); + } + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1, @@ -2266,6 +2480,13 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("error = %d\n", error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_2)->memHandle)); + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { @@ -2294,12 +2515,24 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; + if (ctx->device != 
QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + qnn_get_ggml_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + qnn_get_ggml_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + qnn_get_ggml_tensor_data_size(dst)}; + } else { + uint8_t * qnn_buffer_0 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_0)->memHandle)); + if (nullptr != qnn_buffer_0) + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + + uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_1)->memHandle)); + if (nullptr != qnn_buffer_1) + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; @@ -2311,7 +2544,15 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("error = %d\n", error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_2)->memHandle)); + if (nullptr != qnn_buffer_2) + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } } + failure: if (QNN_SUCCESS != error) { QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); @@ -2428,6 +2669,17 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, goto failure; } + if (ctx->device == QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0}; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); @@ -2444,13 +2696,6 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, goto failure; } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; QNN_VER_PTR(*tensor_0)->rank = qnn_get_ggml_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; @@ -2461,6 +2706,46 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + if (ctx->device != QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + qnn_get_ggml_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + qnn_get_ggml_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + qnn_get_ggml_tensor_data_size(dst)}; + } else { + uint8_t * qnn_buffer_0 = nullptr; + uint8_t * qnn_buffer_1 = nullptr; + uint8_t * qnn_buffer_2 = nullptr; + qnn_instance * instance = ctx->instance; + + qnn_buffer_0 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src0), 4)); + if (nullptr == qnn_buffer_0) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_0, tensor_0); + memcpy(qnn_buffer_0, src0->data, 
ggml_nbytes(src0)); + + qnn_buffer_1 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src1), 4)); + if (nullptr == qnn_buffer_1) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_1, tensor_1); + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + + qnn_buffer_2 = static_cast(instance->alloc_rpcmem(ggml_nbytes(dst), 4)); + if (nullptr == qnn_buffer_2) { + QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + } else { + QNN_LOG_INFO("alloc rpcmem successfully\n"); + } + instance->register_rpcmem(qnn_buffer_2, tensor_2); + } + Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1, @@ -2486,6 +2771,13 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("error = %d\n", error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_2)->memHandle)); + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { @@ -2514,17 +2806,28 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, QNN_VER_PTR(*tensor_2)->rank = qnn_get_ggml_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, - qnn_get_ggml_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, - qnn_get_ggml_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, - qnn_get_ggml_tensor_data_size(dst)}; + if (ctx->device != QNN_BACKEND_NPU) { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, + qnn_get_ggml_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, + qnn_get_ggml_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, + qnn_get_ggml_tensor_data_size(dst)}; + } else { + uint8_t * qnn_buffer_0 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_0)->memHandle)); + if (nullptr != qnn_buffer_0) + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + + uint8_t * qnn_buffer_1 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_1)->memHandle)); + if (nullptr != qnn_buffer_1) + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; - error = - qnn_raw_interface.graphExecute(graph_handle, + error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); @@ -2532,7 +2835,15 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx, QNN_LOG_INFO("error = %d\n", error); goto failure; } + + if (ctx->device == QNN_BACKEND_NPU) { + uint8_t * qnn_buffer_2 = static_cast(ctx->instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*tensor_2)->memHandle)); + if (nullptr != qnn_buffer_2) + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } } + failure: if (QNN_SUCCESS != error) { QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0)); @@ -2889,9 +3200,9 @@ GGML_CALL static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t b GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - 
ggml_backend_qnn_buffer_context * ctx = - (ggml_backend_qnn_buffer_context *) buffer->context; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *) buffer->context; + static int idx = 0; char tensor_name[GGML_MAX_NAME] = {0}; snprintf(tensor_name, GGML_MAX_NAME, "tensor_%04d", idx++); @@ -2908,22 +3219,43 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; } - Qnn_Tensor_t qnn_tensor = { - .version = QNN_TENSOR_VERSION_1, - {.v1 = {.id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = - {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, - .offset = 0}}}, - .rank = qnn_get_ggml_tensor_rank(tensor), - .dimensions = dimensions, - .memType = QNN_TENSORMEMTYPE_RAW, - {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; + Qnn_Tensor_t qnn_tensor = QNN_TENSOR_INIT; + + if (ctx->device != QNN_BACKEND_GPU) { + qnn_tensor = { + .version = QNN_TENSOR_VERSION_1, + {.v1 = {.id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = + {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, + .offset = 0}}}, + .rank = qnn_get_ggml_tensor_rank(tensor), + .dimensions = dimensions, + .memType = QNN_TENSORMEMTYPE_RAW, + {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; + } else { + qnn_tensor = { + .version = QNN_TENSOR_VERSION_1, + {.v1 = {.id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = + {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, + .offset = 0}}}, + .rank = qnn_get_ggml_tensor_rank(tensor), + .dimensions = dimensions, + .memType = QNN_TENSORMEMTYPE_MEMHANDLE, + {.clientBuf = {.data = nullptr, .dataSize = 0}}}}}; + } Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); if (nullptr == p_qnn_tensor) { @@ -2933,7 +3265,7 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t error = deep_copy_qnn_tensors(qnn_tensor, *p_qnn_tensor); if (error != QNN_SUCCESS) { free(p_qnn_tensor); - QNN_LOG_DEBUG("init tensor failed"); + QNN_LOG_WARN("init tensor failed"); return; } tensor->extra = p_qnn_tensor; @@ -3210,6 +3542,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device) { device, GGML_QNN_MAX_DEVICES - 1); return nullptr; } + static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; static bool ggml_backend_qnn_buffer_type_initialized = false; @@ -3307,7 +3640,6 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { std::string device_name = qnn_get_backend_name(device); QNN_LOG_INFO("qnn device name %s", device_name.c_str()); - instance->init_qnn_graph(device_name.c_str(), false); g_qnn_mgr[device].instance = instance; g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); g_qnn_mgr[device].raw_system_interface = 
instance->get_qnn_raw_system_interface(); diff --git a/tests/ggml-qnn/CMakeLists.txt b/tests/ggml-qnn/CMakeLists.txt index a78bdaeaf8009..bf061e6c7c3a1 100644 --- a/tests/ggml-qnn/CMakeLists.txt +++ b/tests/ggml-qnn/CMakeLists.txt @@ -6,8 +6,8 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_POSITION_INDEPENDENT_CODE ON) -#set to ON if target Android phone is based on Qualcomm Snapdragon 8 Gen 3 -set(TARGET_SNAPDRAGON_8_GEN3 OFF) +#set to OFF if target Android phone is not equipped with Qualcomm Snapdragon 8 Gen 3 +set(TARGET_SNAPDRAGON_8_GEN3 ON) set(QNN_INC_PATH ${QNN_SDK_PATH}/include/QNN) set(QNN_LIB_PATH ${QNN_SDK_PATH}/lib/aarch64-android) @@ -35,6 +35,8 @@ add_definitions(-DGGML_USE_QNN) if(CMAKE_BUILD_TYPE STREQUAL "Release") add_definitions(-DNDEBUG) add_definitions(-O3) +else() +add_definitions(-O3) endif() if (TARGET_SNAPDRAGON_8_GEN3) @@ -44,7 +46,7 @@ add_definitions(-mcpu=cortex-x1) add_definitions(-mtune=cortex-x1) else() -# the below build optimization might be works well on ALL mainstream Android phone based on Qualcomm mobile SoC +# the below build optimization might be works well on ALL Android phone equipped with Qualcomm mainstream mobile SoC add_definitions(-mcpu=cortex-a72) endif() diff --git a/tests/ggml-qnn/ggml-qnn-ut.cpp b/tests/ggml-qnn/ggml-qnn-ut.cpp index 9af433ceb6690..0abfc62073f08 100644 --- a/tests/ggml-qnn/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn/ggml-qnn-ut.cpp @@ -415,7 +415,8 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { sizex = ggml_blck_size(qtype) * 2; } } - QNN_LOG_DEBUG("sizex %d\n", sizex); + QNN_LOG_DEBUG("sizex: %d\n", sizex); + QNN_LOG_DEBUG("sizey: %d\n", sizey); if (n_ggml_op_type == GGML_OP_MUL) { src0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); From 5598fbd15dfd7e0483ca544c4c8a86aca6c79ea2 Mon Sep 17 00:00:00 2001 From: "zhou.weiguo" Date: Thu, 13 Jun 2024 15:41:53 +0800 Subject: [PATCH 16/16] review: make a MVP(Minimum Viable PR) style PR in upstream --- ggml-qnn.cpp | 597 +++++++----------------- tests/ggml-qnn/ggml-qnn-ut-build-run.sh | 10 +- tests/ggml-qnn/ggml-qnn-ut.cpp | 17 +- 3 files changed, 183 insertions(+), 441 deletions(-) diff --git a/ggml-qnn.cpp b/ggml-qnn.cpp index f59c54fcacd97..f268c7f0e825a 100644 --- a/ggml-qnn.cpp +++ b/ggml-qnn.cpp @@ -55,7 +55,7 @@ #include "Saver/QnnSaver.h" #include "System/QnnSystemInterface.h" #include "HTP/QnnHtpDevice.h" -#include +#include "HTP/QnnHtpGraph.h" // ================================================================================================= // @@ -91,12 +91,6 @@ typedef void (*ggml_qnn_func_t)(ggml_backend_qnn_context * ctx, const ggml_tensor * src1, ggml_tensor * dst); -typedef void (*ggml_qnn_func_common_t)(ggml_backend_qnn_context * ctx, - const ggml_op ggml_op, - const ggml_tensor * src0, - const ggml_tensor * src1, - ggml_tensor * dst); - enum qcom_htp_arch { NONE = 0, V68 = 68, @@ -424,6 +418,7 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tenso return true; } +#ifndef NDEBUG #define CHECK_PARAMS(ctx, src0, src1, dst) \ do { \ if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ @@ -431,6 +426,10 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tenso } \ } while (0) +#else +#define CHECK_PARAMS(ctx, src0, src1, dst) +#endif + #if ENABLE_QNNBACKEND_PERF class qnn_perf { public: @@ -446,7 +445,7 @@ class qnn_perf { void info() { _end_time = ggml_time_us(); _duration = (_end_time - _begin_time); - QNN_LOG_DEBUG("duration of %s : %lld 
microseconds\n", _perf_name.c_str(), _duration); + QNN_LOG_INFO("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); } private: @@ -809,7 +808,7 @@ static void qnn_sdk_logcallback(const char * fmt, QnnLog_Level_t level, memset(s_ggml_qnn_logbuf, 0, QNN_LOGBUF_LEN); vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), QNN_LOGBUF_LEN, fmt, argp); - QNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); + QNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); } #endif } @@ -1069,7 +1068,7 @@ class qnn_instance { arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; arch_devconfig.customConfig = &arch_customconfig; - const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, NULL}; + const QnnDevice_Config_t * p_deviceconfig[] = {&soc_devconfig, &arch_devconfig, nullptr}; qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); } else { qnn_status = _qnn_raw_interface.deviceCreate(_qnn_log_handle, nullptr, &_qnn_device_handle); @@ -1137,10 +1136,14 @@ class qnn_instance { _pfn_rpc_mem_init(); } - std::vector temp_context_config; + /* TODO: not used, keep it for further usage + QnnContext_Config_t qnn_context_config = QNN_CONTEXT_CONFIG_INIT; + qnn_context_config.priority = QNN_PRIORITY_DEFAULT; + const QnnContext_Config_t * context_configs[] = {&qnn_context_config, nullptr}; + */ _qnn_interface.qnn_context_create( _qnn_backend_handle, _qnn_device_handle, - temp_context_config.empty() ? nullptr : temp_context_config.data(), + nullptr, &_qnn_context_handle); if (nullptr == _qnn_context_handle) { QNN_LOG_WARN("why failed to initialize qnn context\n"); @@ -1157,9 +1160,11 @@ class qnn_instance { size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, 4)); + rpc_buffer = static_cast(alloc_rpcmem( + probe_slots[idx] * size_in_mb, 4)); if (nullptr == rpc_buffer) { - QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + QNN_LOG_INFO("alloc rpcmem %d (MB) failure, %s\n", + probe_slots[idx], strerror(errno)); break; } else { candidate_size = probe_slots[idx]; @@ -1262,8 +1267,8 @@ class qnn_instance { return ret_status; } - //keep it for further usage of offload the entire cgraph to a single QNN DAG directly - //which was used in Qualcomm's dedicated AI technology + //TODO:keep it for further usage of offload the entire cgraph to a single QNN DAG directly + // which was used in Qualcomm's dedicated AI technology #if 0 int init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation = true, @@ -1430,13 +1435,14 @@ class qnn_instance { QnnHtpPerfInfrastructure_PowerConfig_t power_config; memset(&power_config, 0, sizeof(power_config)); power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - power_config.dcvsV3Config.dcvsEnable = 0; + power_config.dcvsV3Config.setDcvsEnable = 1; + power_config.dcvsV3Config.dcvsEnable = 0; power_config.dcvsV3Config.contextId = _qnn_power_configid; power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; power_config.dcvsV3Config.setSleepLatency = 1; // true to consider Latency parameter otherwise false - power_config.dcvsV3Config.sleepLatency = 10; + power_config.dcvsV3Config.sleepLatency = 40; power_config.dcvsV3Config.setBusParams = 1; // true to consider Bus 
@@ -1430,13 +1435,14 @@ class qnn_instance {
         QnnHtpPerfInfrastructure_PowerConfig_t power_config;
         memset(&power_config, 0, sizeof(power_config));
         power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3;
-        power_config.dcvsV3Config.dcvsEnable = 0;
+        power_config.dcvsV3Config.setDcvsEnable = 1;
+        power_config.dcvsV3Config.dcvsEnable    = 0;
         power_config.dcvsV3Config.contextId  = _qnn_power_configid;
         power_config.dcvsV3Config.powerMode  = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE;
         power_config.dcvsV3Config.setSleepLatency =
             1; // true to consider Latency parameter otherwise false
-        power_config.dcvsV3Config.sleepLatency = 10;
+        power_config.dcvsV3Config.sleepLatency = 40;
         power_config.dcvsV3Config.setBusParams =
             1; // true to consider Bus parameter otherwise false
         power_config.dcvsV3Config.setCoreParams =
@@ -1459,6 +1465,7 @@ class qnn_instance {
             DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
         power_config.dcvsV3Config.coreVoltageCornerMax =
             DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
+
         // set power config with different performance parameters
         const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {
             &power_config, nullptr};
@@ -1550,6 +1557,7 @@ class qnn_instance {
             QNN_LOG_WARN("rpc memory already allocated\n");
             return 3;
         }
+
         if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) {
             QNN_LOG_WARN("tensor %s has been registered shared memory\n",
                          (QNN_VER_PTR(*p_tensor)->name));
@@ -1710,7 +1718,7 @@ class qnn_instance {
         int result = 0;
 
         if (nullptr == _system_lib_handle) {
-            QNN_LOG_DEBUG("system lib handle is null\n");
+            QNN_LOG_WARN("system lib handle is null\n");
             return 1;
         }
 
@@ -1724,8 +1732,7 @@ class qnn_instance {
 
         int dlclose_error = dlclose(_system_lib_handle);
         if (dlclose_error != 0) {
-            QNN_LOG_WARN("failed to close QnnSystem library, error %s\n",
-                         dlerror());
+            QNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror());
             return 2;
         }
 
@@ -1740,8 +1747,7 @@ class qnn_instance {
 
         void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL);
         if (nullptr == lib_handle) {
-            QNN_LOG_WARN("can not open QNN library %s, with error: %s",
-                         lib_path.c_str(), dlerror());
+            QNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror());
             return 1;
         }
 
@@ -1749,8 +1755,7 @@ class qnn_instance {
         auto get_providers =
             load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>(
                 lib_handle, "QnnInterface_getProviders");
         if (nullptr == get_providers) {
-            QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s",
-                         dlerror());
+            QNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror());
             return 2;
         }
 
@@ -1758,14 +1763,12 @@ class qnn_instance {
         const QnnInterface_t ** provider_list = nullptr;
         error = get_providers(&provider_list, &num_providers);
         if (error != QNN_SUCCESS) {
-            QNN_LOG_WARN("failed to get providers, error %d",
-                         QNN_GET_ERROR_CODE(error));
+            QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error));
             return 3;
         }
         QNN_LOG_DEBUG("num_providers=%d\n", num_providers);
         if (num_providers != _required_num_providers) {
-            QNN_LOG_WARN("providers is %d instead of required %d", num_providers,
-                         _required_num_providers);
+            QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers);
            return 4;
         }
 
@@ -1797,16 +1800,14 @@ class qnn_instance {
         BackendIdType backend_id          = provider_list[0]->backendId;
         _lib_path_to_backend_id[lib_path] = backend_id;
         if (_loaded_backend.count(backend_id) > 0) {
-            QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n",
-                         lib_path.c_str(), backend_id);
+            QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id);
         }
         _loaded_backend[backend_id] = provider_list[0];
         if (_loaded_lib_handle.count(backend_id) > 0) {
             QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]);
             int dlclose_error = dlclose(_loaded_lib_handle[backend_id]);
             if (dlclose_error != 0) {
-                QNN_LOG_WARN("fail to close %p with error %s\n",
-                             _loaded_lib_handle[backend_id], dlerror());
+                QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror());
             }
         }
         _loaded_lib_handle[backend_id] = lib_handle;
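The load_backend path above follows the standard QNN dynamic-loading recipe: dlopen the backend library, resolve QnnInterface_getProviders, and take a provider from the returned list. A condensed standalone sketch; the function-pointer typedef mirrors the QnnInterface.h declaration and should be treated as an assumption here, and error handling is trimmed:

    #include <dlfcn.h>
    #include "QnnInterface.h"

    // sketch: resolve the raw interface table from a QNN backend library
    typedef Qnn_ErrorHandle_t (*pfn_qnninterface_getproviders)(
        const QnnInterface_t *** provider_list, uint32_t * num_providers);

    static const QnnInterface_t * load_first_provider(const char * lib_path) {
        void * handle = dlopen(lib_path, RTLD_NOW | RTLD_GLOBAL);
        if (nullptr == handle) return nullptr;

        auto get_providers = reinterpret_cast<pfn_qnninterface_getproviders>(
            dlsym(handle, "QnnInterface_getProviders"));
        if (nullptr == get_providers) return nullptr;

        const QnnInterface_t ** providers = nullptr;
        uint32_t num_providers = 0;
        if (QNN_SUCCESS != get_providers(&providers, &num_providers) ||
            0 == num_providers) {
            return nullptr;
        }
        return providers[0]; // the real code also validates the provider's API version
    }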
error %s\n", it.first, - dlerror()); + QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); } } @@ -1924,7 +1924,6 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, const int64_t ne01 = src0->ne[1]; const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - // make qnn_get_ggml_tensor_rank and QNN SDK happy if (ne00 <= 1 || ne01 <= 1 || ne10 <= 1 || ne11 <= 1) { return false; @@ -1932,13 +1931,13 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, // TODO: support other GGML OPs using QNN API // a GENERAL approach could fix this problem in a standalone PR of refine ggml backend - // subsystem for mixed inference between CPU&GPU / CPU&NPU easily for ANY ggml backends - // which the backend's ggml_backend_xxx_buffer_is_host return true. - // this approach could be found: + // subsystem for hybrid inference between CPU&GPU / CPU&NPU easily(less the 100 LoC and no + // side-effect to the existing codes) for ANY ggml backends which the backend's + // ggml_backend_xxx_buffer_is_host return true. this approach could be found at: // https://github.com/ggerganov/llama.cpp/pull/7641 bool supported_op = false; supported_op = (tensor->op == GGML_OP_ADD); - supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT)); + supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT)); if (!supported_op) { return false; } @@ -1950,14 +1949,9 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, } } - int qtype = src0->type; - if (tensor->op == GGML_OP_MUL) { - return (qtype == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32); - } - if (tensor->op == GGML_OP_MUL_MAT) { if (ne00 <= 32 || ne01 <= 32 || ne10 <= 32 || ne11 <= 32) { - //make mul_mat with QNN RPC happy + //comment it for make UT of mul_mat with QNN RPC happy //return false; } } @@ -1965,6 +1959,8 @@ static bool ggml_qnn_can_handle_op(ggml_backend_qnn_context * ctx, return true; } +//TODO: this function can be removed later because there are duplicated codes with ggml_qnn_mul_mat +// keep it for illustrate how to implement a specified GGMPL OP using QNN API + QNN RPC static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -1986,10 +1982,11 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src tensor_1 = (Qnn_Tensor_t *) src1->extra; tensor_2 = (Qnn_Tensor_t *) dst->extra; instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + qnn_perf perf("ggml_qnn_add"); perf.start(); - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; @@ -2034,17 +2031,31 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src QnnHtpGraph_CustomConfig_t dlbc_config; dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - /* dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC - */ - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC QnnGraph_Config_t graph_dlbc_config; 
@@ -2034,17 +2031,31 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
 
             QnnHtpGraph_CustomConfig_t dlbc_config;
             dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
-            /*
             dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC;
-            dlbc_config.optimizationOption.floatValue = 1; // set to 0 to turn off DLBC
-            */
-            dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
-
+            dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC
             QnnGraph_Config_t graph_dlbc_config;
             graph_dlbc_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
             graph_dlbc_config.customConfig = &dlbc_config;
 
-            const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, &graph_dlbc_config, NULL};
+            QnnHtpGraph_CustomConfig_t opt_config;
+            opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+            opt_config.optimizationOption.floatValue = 1; // 1 / 3
+            QnnGraph_Config_t graph_opt_config;
+            graph_opt_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_opt_config.customConfig = &opt_config;
+
+            QnnHtpGraph_CustomConfig_t vtcm_config;
+            vtcm_config.option       = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
+            vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb;
+            QnnGraph_Config_t graph_vtcm_config;
+            graph_vtcm_config.option       = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+            graph_vtcm_config.customConfig = &vtcm_config;
+
+            const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config,
+                                                         &graph_dlbc_config,
+                                                         &graph_vtcm_config,
+                                                         &graph_opt_config,
+                                                         NULL};
             error = qnn_raw_interface.graphCreate(
                 instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig,
                 &graph_handle);
@@ -2113,27 +2124,33 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
             uint8_t * qnn_buffer_2 = nullptr;
             qnn_instance * instance = ctx->instance;
 
-            qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
+            qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                ggml_nbytes(src0), 4));
             if (nullptr == qnn_buffer_0) {
                 QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+                goto failure;
             } else {
                 QNN_LOG_INFO("alloc rpcmem successfully\n");
             }
             instance->register_rpcmem(qnn_buffer_0, tensor_0);
             memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
 
-            qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src1), 4));
+            qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                ggml_nbytes(src1), 4));
             if (nullptr == qnn_buffer_1) {
                 QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+                goto failure;
             } else {
                 QNN_LOG_INFO("alloc rpcmem successfully\n");
             }
             instance->register_rpcmem(qnn_buffer_1, tensor_1);
             memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
 
-            qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(dst), 4));
+            qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(
+                ggml_nbytes(dst), 4));
             if (nullptr == qnn_buffer_2) {
                 QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
+                goto failure;
             } else {
                 QNN_LOG_INFO("alloc rpcmem successfully\n");
             }
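For the NPU path in the hunks above and below, tensor data must live in ION/RPC shared memory rather than in client buffers. A condensed sketch of the round trip these hunks implement, using only names that appear in this file (error handling omitted):

    // sketch: zero-copy I/O for the HTP/NPU backend via rpcmem
    uint8_t * in = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
    instance->register_rpcmem(in, tensor_0);       // binds the buffer to the tensor's memHandle
    memcpy(in, src0->data, ggml_nbytes(src0));     // stage input into shared memory
    // ... graphExecute(...) runs on the DSP against the registered handles ...
    uint8_t * out = static_cast<uint8_t *>(
        instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_2)->memHandle));
    memcpy(dst->data, out, ggml_nbytes(dst));      // copy the result back to the ggml tensor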
@@ -2144,23 +2161,33 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
             instance->register_rpcmem(qnn_buffer_2, tensor_2);
         }
 
         Qnn_Tensor_t tensor_inputs[]  = {*tensor_0, *tensor_1};
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
         Qnn_OpConfig_t op_config = {
             (Qnn_OpConfigVersion_t) 1,
-            .v1 = {"ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW,
-                   QNN_OP_ELEMENT_WISE_ADD, 0, qnn_params,
-                   2, tensor_inputs, 1,
-                   tensor_outputs}};
+            .v1 = {"ggml_op_add",
+                   QNN_OP_PACKAGE_NAME_QTI_AISW,
+                   QNN_OP_ELEMENT_WISE_ADD,
+                   0, qnn_params,
+                   2, tensor_inputs,
+                   1, tensor_outputs}
+        };
         error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
-        error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr);
+        error = qnn_raw_interface.graphFinalize(graph_handle,
+                                                nullptr, nullptr);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
         }
-        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2,
+        error = qnn_raw_interface.graphExecute(graph_handle,
+                                               tensor_inputs, 2,
                                                tensor_outputs, 1,
                                                nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
@@ -2221,9 +2248,15 @@ static void ggml_qnn_add(ggml_backend_qnn_context * ctx, const ggml_tensor * src
 
         Qnn_Tensor_t tensor_inputs[]  = {*tensor_0, *tensor_1};
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs,2,
+        error = qnn_raw_interface.graphExecute(graph_handle,
+                                               tensor_inputs, 2,
                                                tensor_outputs, 1,
                                                nullptr, nullptr);
+        if (ctx->device == QNN_BACKEND_NPU) {
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
+            }
+        }
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
@@ -2299,6 +2332,8 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
     tensor_1 = (Qnn_Tensor_t *) src1->extra;
     tensor_2 = (Qnn_Tensor_t *) dst->extra;
     instance = ctx->instance;
+    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
+
     qnn_perf perf("ggml_qnn_mul_mat");
     perf.start();
 
@@ -2307,7 +2342,6 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
     tensor_2 = (Qnn_Tensor_t *) dst->extra;
     instance = ctx->instance;
 
-    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
     QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
     QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;
@@ -2338,6 +2372,11 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
     uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions;
     uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions;
     uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions;
 
+    //TODO: for scenarios with quantized data in src0:
+    //      pass-1: dequantize src0 to FP32
+    //      pass-2: dq-src0 * src1
+    //      the performance gain is worthwhile although pass-1 adds some overhead
+
     if (!graph_initialized) {
         graph_name = graph_name + "_" + std::to_string(ctx->threads) +
                      "_" + src0->name + "_" + src1->name;
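The dequantize-then-matmul TODO above can be sketched with ggml's type traits. This is a hedged illustration only, not part of the patch: it assumes ggml_internal_get_type_traits and its to_float member as exposed by ggml.h of this era, and the row-wise loop is an assumption about how the scratch buffer would be filled:

    #include <vector>
    #include "ggml.h"

    // sketch of pass-1: dequantize a quantized src0 into an FP32 scratch buffer;
    // pass-2 would feed this buffer to the existing QNN_OP_MAT_MUL path
    static std::vector<float> dequantize_src0_to_fp32(const ggml_tensor * src0) {
        const ggml_type_traits_t traits = ggml_internal_get_type_traits(src0->type);
        const int64_t nrows = ggml_nrows(src0);
        const int64_t ne0   = src0->ne[0];
        std::vector<float> fp32(nrows * ne0);
        for (int64_t r = 0; r < nrows; r++) {
            const char * row = (const char *) src0->data + r * src0->nb[1];
            traits.to_float(row, fp32.data() + r * ne0, ne0); // one row at a time
        }
        return fp32;
    }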
&vtcm_config; + + const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, + &graph_dlbc_config, + &graph_vtcm_config, + &graph_opt_config, + NULL}; error = qnn_raw_interface.graphCreate( instance->get_qnn_context_handle(), graph_name.c_str(), p_graphconfig, &graph_handle); @@ -2428,27 +2481,33 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, uint8_t * qnn_buffer_2 = nullptr; qnn_instance * instance = ctx->instance; - qnn_buffer_0 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src0), 4)); + qnn_buffer_0 = static_cast(instance->alloc_rpcmem( + ggml_nbytes(src0), 4)); if (nullptr == qnn_buffer_0) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + goto failure; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); } instance->register_rpcmem(qnn_buffer_0, tensor_0); memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - qnn_buffer_1 = static_cast(instance->alloc_rpcmem(ggml_nbytes(src1), 4)); + qnn_buffer_1 = static_cast(instance->alloc_rpcmem( + ggml_nbytes(src1), 4)); if (nullptr == qnn_buffer_1) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + goto failure; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); } instance->register_rpcmem(qnn_buffer_1, tensor_1); memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - qnn_buffer_2 = static_cast(instance->alloc_rpcmem(ggml_nbytes(dst), 4)); + qnn_buffer_2 = static_cast(instance->alloc_rpcmem( + ggml_nbytes(dst), 4)); if (nullptr == qnn_buffer_2) { QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + goto failure; } else { QNN_LOG_INFO("alloc rpcmem successfully\n"); } @@ -2457,25 +2516,35 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1}; Qnn_Tensor_t tensor_outputs[] = {*tensor_2}; - Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1, - .v1 = {"ggml_op_mul_mat", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, 0, qnn_params, 2, - tensor_inputs, 1, tensor_outputs}}; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, + .v1 = {"ggml_op_mul_mat", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, + 0, qnn_params, + 2, tensor_inputs, + 1, tensor_outputs} + }; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; } - error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + error = qnn_raw_interface.graphFinalize(graph_handle, + nullptr, nullptr); if (QNN_SUCCESS != error) { QNN_LOG_INFO("error = %d\n", error); goto failure; } error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, + tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); + if (ctx->device == QNN_BACKEND_NPU) { + if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) { + QNN_LOG_WARN("NPU crashed. SSR detected. 
@@ -2537,300 +2606,14 @@ static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx,
         Qnn_Tensor_t tensor_inputs[]  = {*tensor_0, *tensor_1};
         Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
         error = qnn_raw_interface.graphExecute(graph_handle,
-                                               tensor_inputs, 2,
+                                               tensor_inputs, 2,
                                                tensor_outputs, 1,
                                                nullptr, nullptr);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        if (ctx->device == QNN_BACKEND_NPU) {
-            uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_2)->memHandle));
-            if (nullptr != qnn_buffer_2)
-                memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
-        }
-    }
-
-failure:
-    if (QNN_SUCCESS != error) {
-        QNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(*tensor_0));
-        QNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(*tensor_1));
-        QNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(*tensor_2));
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                      " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      src0->name, src0->type, ggml_type_name(src0->type),
-                      src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0],
-                      src0->nb[1], src0->nb[2]);
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                      " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      src1->name, src1->type, ggml_type_name(src1->type),
-                      src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0],
-                      src1->nb[1], src1->nb[2]);
-        QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64
-                      " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
-                      dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0],
-                      dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]);
-    }
-
-    QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions;
-    QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions;
-    QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions;
-    perf.info();
-}
-
-// common function for GGML OPs using QNN API
-static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx,
-                               const enum ggml_op ggmlop,
-                               const ggml_tensor * src0, const ggml_tensor * src1,
-                               ggml_tensor * dst) {
-    Qnn_ErrorHandle_t error        = QNN_SUCCESS;
-    bool graph_initialized         = false;
-    qnn_instance * instance        = nullptr;
-    std::string qnn_graph_name     = "ggml_qnn_graph";
-    std::string qnn_op_config_name = "ggml_qnn_op_config";
-    const char * qnn_op_name       = nullptr;
-    Qnn_GraphHandle_t graph_handle = nullptr;
-    Qnn_Tensor_t * tensor_0        = nullptr;
-    Qnn_Tensor_t * tensor_1        = nullptr;
-    Qnn_Tensor_t * tensor_2        = nullptr;
-    Qnn_Param_t qnn_params[]       = {};
-    Qnn_DataType_t src0_qnn_type   = QNN_DATATYPE_FLOAT_32;
-    Qnn_DataType_t src1_qnn_type   = QNN_DATATYPE_FLOAT_32;
-    Qnn_DataType_t dst_qnn_type    = QNN_DATATYPE_FLOAT_32;
-
-    CHECK_PARAMS(ctx, src0, src1, dst);
-    tensor_0 = (Qnn_Tensor_t *) src0->extra;
-    tensor_1 = (Qnn_Tensor_t *) src1->extra;
-    tensor_2 = (Qnn_Tensor_t *) dst->extra;
-    instance = ctx->instance;
-    qnn_perf perf(ggml_op_name(ggmlop));
-    perf.start();
-
-    qnn_op_name = qnn_opname_from_ggmlop(ggmlop);
-    if (nullptr == qnn_op_name) {
-        QNN_LOG_WARN("ggml op %d(%s) not supported currently", ggmlop, ggml_op_name(ggmlop));
-        return;
-    }
-
-    tensor_0 = (Qnn_Tensor_t *) src0->extra;
-    tensor_1 = (Qnn_Tensor_t *) src1->extra;
-    tensor_2 = (Qnn_Tensor_t *) dst->extra;
-    instance = ctx->instance;
-    QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
-
-    src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type);
-    src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type);
-    dst_qnn_type  = qnn_datatype_from_ggml_datatype(dst->type);
-
-    QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE;
-    QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE;
-    QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ;
-
-    uint32_t dimensions_input_0[] = {
-        (uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2],
-        (uint32_t) src0->ne[3]};
-    uint32_t dimensions_input_1[] = {
-        (uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2],
-        (uint32_t) src1->ne[3]};
-    uint32_t dimensions_output[] = {
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
-        (uint32_t) dst->ne[3]};
-
-    std::string map_entry = std::string(ggml_op_name(ggmlop));
-    if (instance->_qnn_graph_map.find(map_entry) !=
-        instance->_qnn_graph_map.end()) {
-        graph_initialized = true;
-        auto & graph_item = instance->_qnn_graph_map[map_entry];
-        graph_handle = std::get<0>(graph_item);
-    }
-
-    uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions;
-    uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions;
-    uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions;
-
-    if (!graph_initialized) {
-        qnn_graph_name = qnn_graph_name + "_" + ggml_op_name(ggmlop) +
-                         std::to_string(ctx->threads) + src0->name + "_" +
-                         src1->name;
-        qnn_op_config_name = qnn_op_config_name + "_" + ggml_op_name(ggmlop) +
-                             std::to_string(ctx->threads) + src0->name + "_" +
-                             src1->name;
-        QNN_LOG_DEBUG("qnn graph name %s", qnn_graph_name.c_str());
-        QNN_LOG_DEBUG("qnn op_config name %s", qnn_op_config_name.c_str());
-        error = qnn_raw_interface.graphCreate(
-            instance->get_qnn_context_handle(), qnn_graph_name.c_str(), nullptr,
-            &graph_handle);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("can't create qnn graph handle with ggml op %s, graph "
-                         "name %s, error = %d\n",
-                         ggml_op_name(ggmlop), qnn_graph_name.c_str(), error);
-            goto failure;
-        }
-
-        if (ctx->device == QNN_BACKEND_NPU) {
-            QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
-            QNN_VER_PTR(*tensor_0)->clientBuf= {.data=nullptr, .dataSize=0};
-
-            QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
-            QNN_VER_PTR(*tensor_1)->clientBuf= {.data=nullptr, .dataSize=0};
-
-            QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
-            QNN_VER_PTR(*tensor_2)->clientBuf= {.data=nullptr, .dataSize=0};
-        }
-
-        error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-
-        QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
-        QNN_VER_PTR(*tensor_0)->rank       = qnn_get_ggml_tensor_rank(src0);
-        QNN_VER_PTR(*tensor_0)->dataType   = src0_qnn_type;
-        QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
-        QNN_VER_PTR(*tensor_1)->rank       = qnn_get_ggml_tensor_rank(src1);
-        QNN_VER_PTR(*tensor_1)->dataType   = src1_qnn_type;
-        QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output;
-        QNN_VER_PTR(*tensor_2)->rank       = qnn_get_ggml_tensor_rank(dst);
-        QNN_VER_PTR(*tensor_2)->dataType   = dst_qnn_type;
-
-        if (ctx->device != QNN_BACKEND_NPU) {
-            QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
-                                                 qnn_get_ggml_tensor_data_size(src0)};
-            QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
-                                                 qnn_get_ggml_tensor_data_size(src1)};
-            QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
-                                                 qnn_get_ggml_tensor_data_size(dst)};
-        } else {
-            uint8_t * qnn_buffer_0 = nullptr;
-            uint8_t * qnn_buffer_1 = nullptr;
-            uint8_t * qnn_buffer_2 = nullptr;
-            qnn_instance * instance = ctx->instance;
-
-            qnn_buffer_0 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src0), 4));
-            if (nullptr == qnn_buffer_0) {
-                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
-            } else {
-                QNN_LOG_INFO("alloc rpcmem successfully\n");
+            if (QNN_COMMON_ERROR_SYSTEM_COMMUNICATION == error) {
+                QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n");
             }
-            instance->register_rpcmem(qnn_buffer_0, tensor_0);
-            memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
-
-            qnn_buffer_1 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(src1), 4));
-            if (nullptr == qnn_buffer_1) {
-                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
-            } else {
-                QNN_LOG_INFO("alloc rpcmem successfully\n");
-            }
-            instance->register_rpcmem(qnn_buffer_1, tensor_1);
-            memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
-
-            qnn_buffer_2 = static_cast<uint8_t *>(instance->alloc_rpcmem(ggml_nbytes(dst), 4));
-            if (nullptr == qnn_buffer_2) {
-                QNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno));
-            } else {
-                QNN_LOG_INFO("alloc rpcmem successfully\n");
-            }
-            instance->register_rpcmem(qnn_buffer_2, tensor_2);
-        }
-
-        Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
-        Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        Qnn_OpConfig_t op_config = {(Qnn_OpConfigVersion_t) 1,
-                                    .v1 = {qnn_op_config_name.c_str(),
-                                           QNN_OP_PACKAGE_NAME_QTI_AISW,
-                                           qnn_op_name, 0, qnn_params, 2,
-                                           tensor_inputs, 1, tensor_outputs}};
-        error = qnn_raw_interface.graphAddNode(graph_handle, op_config);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-        error = qnn_raw_interface.graphExecute(graph_handle,
-                                               tensor_inputs, 2,
-                                               tensor_outputs, 1,
-                                               nullptr, nullptr);
-        if (QNN_SUCCESS != error) {
-            QNN_LOG_INFO("error = %d\n", error);
-            goto failure;
-        }
-
-        if (ctx->device == QNN_BACKEND_NPU) {
-            uint8_t * qnn_buffer_2 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_2)->memHandle));
-            memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst));
-        }
-
-        auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2);
-        instance->_qnn_graph_map[map_entry] = graph_item;
-    } else {
-        auto & graph_item = instance->_qnn_graph_map[map_entry];
-        graph_handle = std::get<0>(graph_item);
-        tensor_0     = std::get<1>(graph_item);
-        tensor_1     = std::get<2>(graph_item);
-        tensor_2     = std::get<3>(graph_item);
-
-        uint32_t dimensions_input_0[] = {
-            (uint32_t) src0->ne[0], (uint32_t) src0->ne[1],
-            (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]};
-        uint32_t dimensions_input_1[] = {
-            (uint32_t) src1->ne[0], (uint32_t) src1->ne[1],
-            (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]};
-        uint32_t dimensions_output[] = {
-            (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],
-            (uint32_t) dst->ne[3]};
-        QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0;
-        QNN_VER_PTR(*tensor_0)->rank       = qnn_get_ggml_tensor_rank(src0);
-        QNN_VER_PTR(*tensor_0)->dataType   = src0_qnn_type;
-        QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1;
-        QNN_VER_PTR(*tensor_1)->rank       = qnn_get_ggml_tensor_rank(src1);
-        QNN_VER_PTR(*tensor_1)->dataType   = src1_qnn_type;
-        QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output;
-        QNN_VER_PTR(*tensor_2)->rank       = qnn_get_ggml_tensor_rank(dst);
-        QNN_VER_PTR(*tensor_2)->dataType   = dst_qnn_type;
-
-        if (ctx->device != QNN_BACKEND_NPU) {
-            QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data,
-                                                 qnn_get_ggml_tensor_data_size(src0)};
-            QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data,
-                                                 qnn_get_ggml_tensor_data_size(src1)};
-            QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data,
-                                                 qnn_get_ggml_tensor_data_size(dst)};
-        } else {
-            uint8_t * qnn_buffer_0 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_0)->memHandle));
-            if (nullptr != qnn_buffer_0)
-                memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0));
-
-            uint8_t * qnn_buffer_1 = static_cast<uint8_t *>(ctx->instance->get_rpcmem_from_memhandle(
-                QNN_VER_PTR(*tensor_1)->memHandle));
-            if (nullptr != qnn_buffer_1)
-                memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1));
-        }
-
-        Qnn_Tensor_t tensor_inputs[] = {*tensor_0, *tensor_1};
-        Qnn_Tensor_t tensor_outputs[] = {*tensor_2};
-        error = qnn_raw_interface.graphExecute(graph_handle,
-                                               tensor_inputs, 2,
-                                               tensor_outputs, 1,
-                                               nullptr, nullptr);
         if (QNN_SUCCESS != error) {
             QNN_LOG_INFO("error = %d\n", error);
             goto failure;
@@ -2863,8 +2646,6 @@ static void ggml_qnn_hanlde_op(ggml_backend_qnn_context * ctx,
                       " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
                       dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0],
                       dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]);
-        QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2],
-                      src0->ne[3]);
     }
 
     QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions;
@@ -3038,21 +2819,14 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx,
                               struct ggml_compute_params * params,
                               struct ggml_tensor * tensor) {
     ggml_qnn_func_t func = nullptr;
-    ggml_qnn_func_common_t func_common = nullptr;
 
     switch (tensor->op) {
     case GGML_OP_ADD:
         func = ggml_qnn_add;
         break;
-
-    case GGML_OP_MUL:
-        func_common = ggml_qnn_hanlde_op;
-        break;
-
     case GGML_OP_MUL_MAT:
         func = ggml_qnn_mul_mat;
         break;
-
     case GGML_OP_REPEAT:
         func = ggml_qnn_repeat;
         break;
@@ -3062,15 +2836,12 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx,
     case GGML_OP_DUP:
         func = ggml_qnn_dup;
         break;
-
     case GGML_OP_ACC:
         func = ggml_qnn_acc;
         break;
-
     case GGML_OP_DIV:
         func = ggml_qnn_div;
         break;
-
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(tensor)) {
         case GGML_UNARY_OP_GELU:
@@ -3169,10 +2940,9 @@ bool ggml_qnn_compute_forward(ggml_backend_qnn_context * ctx,
         return false;
     }
 
-    if (nullptr != func) func(ctx, tensor->src[0], tensor->src[1], tensor);
-
-    if (nullptr != func_common)
-        func_common(ctx, tensor->op, tensor->src[0], tensor->src[1], tensor);
+    if (nullptr != func) {
+        func(ctx, tensor->src[0], tensor->src[1], tensor);
+    }
 
     return true;
 }
@@ -3221,41 +2991,28 @@ GGML_CALL static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t
     }
 
     Qnn_Tensor_t qnn_tensor = QNN_TENSOR_INIT;
-    if (ctx->device != QNN_BACKEND_GPU) {
-        qnn_tensor = {
-            .version = QNN_TENSOR_VERSION_1,
-            {.v1 = {.id = 0,
-                    .name = tensor_name,
-                    .type = qnn_tensor_type,
-                    .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
-                    .dataType = qnn_data_type,
-                    .quantizeParams =
-                        {QNN_DEFINITION_UNDEFINED,
-                         QNN_QUANTIZATION_ENCODING_UNDEFINED,
-                         {.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
-                                                  .offset = 0}}},
-                    .rank = qnn_get_ggml_tensor_rank(tensor),
-                    .dimensions = dimensions,
-                    .memType = QNN_TENSORMEMTYPE_RAW,
-                    {.clientBuf = {.data = nullptr, .dataSize = 0}}}}};
-    } else {
-        qnn_tensor = {
-            .version = QNN_TENSOR_VERSION_1,
-            {.v1 = {.id = 0,
-                    .name = tensor_name,
-                    .type = qnn_tensor_type,
-                    .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
-                    .dataType = qnn_data_type,
-                    .quantizeParams =
-                        {QNN_DEFINITION_UNDEFINED,
-                         QNN_QUANTIZATION_ENCODING_UNDEFINED,
-                         {.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
-                                                  .offset = 0}}},
-                    .rank = qnn_get_ggml_tensor_rank(tensor),
-                    .dimensions = dimensions,
-                    .memType = QNN_TENSORMEMTYPE_MEMHANDLE,
-                    {.clientBuf = {.data = nullptr, .dataSize = 0}}}}};
-    }
+    Qnn_TensorMemType_t qnn_mem_type = QNN_TENSORMEMTYPE_RAW;
+    if (ctx->device == QNN_BACKEND_GPU) {
+        qnn_mem_type = QNN_TENSORMEMTYPE_MEMHANDLE;
+    }
+
+    qnn_tensor = {
+        .version = QNN_TENSOR_VERSION_1,
+        {.v1 = {.id = 0,
+                .name = tensor_name,
+                .type = qnn_tensor_type,
+                .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
+                .dataType = qnn_data_type,
+                .quantizeParams =
+                    {QNN_DEFINITION_UNDEFINED,
+                     QNN_QUANTIZATION_ENCODING_UNDEFINED,
+                     {.scaleOffsetEncoding = {.scale = 0.0000000000000000f,
+                                              .offset = 0}}},
+                .rank = qnn_get_ggml_tensor_rank(tensor),
+                .dimensions = dimensions,
+                .memType = qnn_mem_type,
+                {.clientBuf = {.data = nullptr, .dataSize = 0}}}}};
 
     Qnn_Tensor_t * p_qnn_tensor =
         (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t));
     if (nullptr == p_qnn_tensor) {

diff --git a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh
index 4c21be5a41fa2..e12b987b8d69d 100755
--- a/tests/ggml-qnn/ggml-qnn-ut-build-run.sh
+++ b/tests/ggml-qnn/ggml-qnn-ut-build-run.sh
@@ -12,8 +12,8 @@ ANDROID_PLATFORM=android-34
 GGML_QNN_UT=ggml-qnn-ut
 REMOTE_PATH=/data/local/tmp/
 
-BUILDTYPE=Debug
 BUILDTYPE=Release
+BUILDTYPE=Debug
 
 
 function dump_vars()
@@ -100,7 +100,7 @@ function update_qnn_libs()
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/
 
-    #the QNN NPU(aka HTP/DSP) backend only verified on Xiaomi14(Qualcomm SM8650-AB Snapdragon 8 Gen 3) successfully
+    #the QNN NPU(aka HTP) backend has only been verified on Android phones equipped with the Qualcomm Snapdragon 8 Gen 3
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/
     adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/
@@ -142,14 +142,9 @@ function run_ggml_qnn_ut()
 
     case "$ggmlop" in
         GGML_OP_ADD)
-            echo "adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend"
             adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_ADD -b $qnnbackend
         ;;
 
-        GGML_OP_MUL)
-            adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL -b $qnnbackend
-        ;;
-
         GGML_OP_MUL_MAT)
             adb shell ${REMOTE_PATH}/${GGML_QNN_UT} -t GGML_OP_MUL_MAT -b $qnnbackend
         ;;
@@ -169,7 +164,6 @@ function show_usage()
     echo "  $0 build            (build Android command line UT program)"
     echo "  $0 updateqnnlibs    (upload the latest QNN libs to Android phone)"
     echo "  $0 GGML_OP_ADD      0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)"
-    echo "  $0 GGML_OP_MUL      0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)"
     echo "  $0 GGML_OP_MUL_MAT  0 (QNN_CPU) / 1(QNN_GPU) / 2(QNN_NPU) / 3(ggml)"
     echo -e "\n\n\n"
 }
[options]\n" \ "\n" \ "Options:\n" \ - " -t GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MULMAT\n" \ + " -t GGML_OP_ADD / GGML_OP_MULMAT\n" \ " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(ggml)\n" \ " ?/h print usage infomation\n\n" ); @@ -418,13 +418,9 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { QNN_LOG_DEBUG("sizex: %d\n", sizex); QNN_LOG_DEBUG("sizey: %d\n", sizey); - if (n_ggml_op_type == GGML_OP_MUL) { - src0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); - src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); - } else { - src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); - } + src0 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); + src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); + ggml_set_input(src0); ggml_set_input(src1); @@ -432,9 +428,6 @@ static int qnn_op_ut(int num_threads, int n_backend_type, int n_ggml_op_type) { case GGML_OP_ADD: dst = ggml_add(ctx, src0, src1); break; - case GGML_OP_MUL: - dst = ggml_mul(ctx, src0, src1); - break; case GGML_OP_MUL_MAT: dst = ggml_mul_mat(ctx, src0, src1); break; @@ -518,8 +511,6 @@ int main(int argc, char * argv[]) { n_ggml_op_type = GGML_OP_ADD; } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) { n_ggml_op_type = GGML_OP_MUL_MAT; - } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL", 11)) { - n_ggml_op_type = GGML_OP_MUL; } else { show_usage(); return 1;