diff --git a/ARM.CMSIS-NN.pdsc b/ARM.CMSIS-NN.pdsc index 573ee1a0..998a176c 100644 --- a/ARM.CMSIS-NN.pdsc +++ b/ARM.CMSIS-NN.pdsc @@ -99,6 +99,7 @@ + diff --git a/Include/arm_nnfunctions.h b/Include/arm_nnfunctions.h index dbe13bf6..7541a1a8 100644 --- a/Include/arm_nnfunctions.h +++ b/Include/arm_nnfunctions.h @@ -21,8 +21,8 @@ * Title: arm_nnfunctions.h * Description: Public header file for CMSIS NN Library * - * $Date: 13 January 2023 - * $Revision: V.11.3.0 + * $Date: 5 September 2023 + * $Revision: V.12.0.0 * * Target : Arm(R) M-Profile Architecture * -------------------------------------------------------------------- */ @@ -1032,7 +1032,10 @@ int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dim * C_OUT : Output depth * H & W : Not used. * @param[in, out] output_data Output data pointer. Data type: int8 - * @return The function returns ARM_CMSIS_NN_SUCCESS + * + * @return The function returns either + * ARM_CMSIS_NN_ARG_ERROR if argument constraints fail, or + * ARM_CMSIS_NN_SUCCESS on successful completion. * * @details * - Supported framework: TensorFlow Lite @@ -1049,8 +1052,24 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx, const cmsis_nn_dims *output_dims, int8_t *output_data); +/** + * @brief Calculate vector sums that may be required by arm_fully_connected_s8(). + * @param[in, out] vector_sum_buf Buffer for vector sums + * @param[in] vector_cols Number of vector columns + * @param[in] vector_rows Number of vector rows + * @param[in] vector_data Vector or weights data + * @return The function returns + * ARM_CMSIS_NN_SUCCESS - Successful operation + * ARM_CMSIS_NN_NO_IMPL_ERROR - If not compiled for an Arm(R) Helium Architecture (MVE) target. + */ +arm_cmsis_nn_status arm_vector_sum_s8(int32_t *vector_sum_buf, + const int32_t vector_cols, + const int32_t vector_rows, + const int8_t *vector_data); + /** * @brief Get size of additional buffer required by arm_fully_connected_s8(). + * See also arm_vector_sum_s8, which is required if buffer size is > 0. * @param[in] filter_dims dimension of filter * @return The function returns required buffer size in bytes * * @@ -1851,6 +1870,15 @@ void arm_concatenation_s8_w(const int8_t *input, /** * @brief s8 SVDF function with 8 bit state tensor and 8 bit time weights * + * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function + * definition file to see if an additional buffer is required. + * Optional function arm_svdf_s8_get_buffer_size() provides the buffer + * size if an additional buffer is required. + * The caller is expected to clear the buffer, if applicable, for security reasons. * @param[in] input_ctx Temporary scratch buffer * The caller is expected to clear the buffer, if applicable, for security reasons. * @param[in] output_ctx Temporary output scratch buffer @@ -1873,12 +1901,15 @@ void arm_concatenation_s8_w(const int8_t *input, * @param[in] output_dims Output tensor dimensions * @param[out] output_data Pointer to the output tensor * - * @return The function returns ARM_CMSIS_NN_SUCCESS + * @return The function returns either + * ARM_CMSIS_NN_ARG_ERROR if argument constraints fail, or + * ARM_CMSIS_NN_SUCCESS on successful completion. * * @details * 1.
Supported framework: TensorFlow Lite micro */ -arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, +arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *ctx, + const cmsis_nn_context *input_ctx, const cmsis_nn_context *output_ctx, const cmsis_nn_svdf_params *svdf_params, const cmsis_nn_per_tensor_quant_params *input_quant_params, @@ -2012,6 +2043,34 @@ arm_cmsis_nn_status arm_lstm_unidirectional_s16_s8(cmsis_nn_lstm_context *scratc int16_t *cell_state, int8_t *output_data); +/** + * @brief Get size of additional buffer required by arm_svdf_s8(). + * @param[in] filter_dims dimension of filter + * @return The function returns required buffer size in bytes + * + */ +int32_t arm_svdf_s8_get_buffer_size(const cmsis_nn_dims *filter_dims); + +/** + * @brief Get size of additional buffer required by arm_svdf_s8() for processors with DSP extension. + * Refer to arm_svdf_s8_get_buffer_size() for function argument details. + * + * @note Intended for compilation on Host. If compiling for an Arm target, use + * arm_svdf_s8_get_buffer_size(). + * + */ +int32_t arm_svdf_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims); + +/** + * @brief Get size of additional buffer required by arm_svdf_s8() for Arm(R) Helium Architecture case. + * Refer to arm_svdf_s8_get_buffer_size() for function argument details. + * + * @note Intended for compilation on Host. If compiling for an Arm target, use + * arm_svdf_s8_get_buffer_size(). + * + */ +int32_t arm_svdf_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims); + #ifdef __cplusplus } #endif diff --git a/Include/arm_nnsupportfunctions.h b/Include/arm_nnsupportfunctions.h index 09f5bc84..04feada6 100644 --- a/Include/arm_nnsupportfunctions.h +++ b/Include/arm_nnsupportfunctions.h @@ -394,6 +394,7 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const int8_t *lhs, */ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, const int8_t *rhs, + const int32_t *kernel_sum, const int32_t *bias, int8_t *dst, const int32_t lhs_offset, diff --git a/README.md b/README.md index 97d0afb1..e713f1f6 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,6 @@ processors here are Cortex-M4 or a Cortex-M33 configured with optional DSP exten Processors with Arm Helium Technology use the Arm M-profile Vector Extension(MVE) instructions for optimization. Examples are Cortex-M55 or Cortex-M85 configured with MVE. - | Operator | C
<br> int8 | C <br> int16 | DSP <br> int8 | DSP <br> int16 | MVE <br> int8 | MVE <br>
int16 | | --------------- | ----------- | ---------- | ----------- | ------------ | ----------- | ------------ | | Conv2D | Yes | Yes | Yes | Yes | Yes | Yes | diff --git a/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c b/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c index 07618742..34b2abff 100644 --- a/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c +++ b/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c @@ -21,8 +21,8 @@ * Title: arm_fully_connected_get_buffer_sizes_s8.c * Description: Collection of get buffer size functions for fully connected s8 layer function. * - * $Date: 31 January 2023 - * $Revision: V.1.0.0 + * $Date: 15 August 2023 + * $Revision: V.1.1.0 * * Target : Arm(R) M-Profile Architecture * @@ -39,20 +39,24 @@ * @{ */ -int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims) +int32_t arm_fully_connected_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims) { (void)filter_dims; return 0; } -int32_t arm_fully_connected_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims) +int32_t arm_fully_connected_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims) { - return arm_fully_connected_s8_get_buffer_size(filter_dims); + return filter_dims->c * sizeof(int32_t); } -int32_t arm_fully_connected_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims) +int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims) { - return arm_fully_connected_s8_get_buffer_size(filter_dims); +#if defined(ARM_MATH_MVEI) + return arm_fully_connected_s8_get_buffer_size_mve(filter_dims); +#else + return arm_fully_connected_s8_get_buffer_size_dsp(filter_dims); +#endif } /** diff --git a/Source/FullyConnectedFunctions/arm_fully_connected_s8.c b/Source/FullyConnectedFunctions/arm_fully_connected_s8.c index 55550c01..77dbe1ff 100644 --- a/Source/FullyConnectedFunctions/arm_fully_connected_s8.c +++ b/Source/FullyConnectedFunctions/arm_fully_connected_s8.c @@ -60,15 +60,24 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx, int8_t *output) { (void)bias_dims; - (void)ctx; (void)fc_params->filter_offset; int32_t batch_cnt = input_dims->n; +#if defined(ARM_MATH_MVEI) + if (ctx->buf == NULL) + { + return (ARM_CMSIS_NN_ARG_ERROR); + } +#endif + + const int32_t *kernel_sum = ctx->buf; + while (batch_cnt) { arm_nn_vec_mat_mult_t_s8(input, kernel, + kernel_sum, bias, output, fc_params->input_offset, @@ -80,6 +89,7 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx, fc_params->activation.min, fc_params->activation.max, 1L); + input += filter_dims->n; output += output_dims->c; batch_cnt--; diff --git a/Source/FullyConnectedFunctions/arm_vector_sum_s8.c b/Source/FullyConnectedFunctions/arm_vector_sum_s8.c new file mode 100644 index 00000000..0120ba1d --- /dev/null +++ b/Source/FullyConnectedFunctions/arm_vector_sum_s8.c @@ -0,0 +1,144 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_vector_sum_s8 + * Description: Generic function for calculating vector sums + * + * $Date: 5 September 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup Public + */ + +/** + * @addtogroup FC + * @{ + */ + +/* + * S8 vector sum function in preparation for e.g. kernel sums in fully connected and matrix multiplication layer functions + * + * Refer header file for details. + * + */ +arm_cmsis_nn_status arm_vector_sum_s8(int32_t *vector_sum_buf, + const int32_t vector_cols, + const int32_t vector_rows, + const int8_t *vector_data) +{ +#if defined(ARM_MATH_MVEI) + const int32_t row_loop_cnt = vector_rows / 4; + + for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) + { + const int32_t col_loop_cnt = (vector_cols + 15) / 16; + + const int8_t *vector_0 = vector_data; + const int8_t *vector_1 = vector_data + vector_cols; + const int8_t *vector_2 = vector_data + 2 * vector_cols; + const int8_t *vector_3 = vector_data + 3 * vector_cols; + + int32_t vector_sum_0 = 0; + int32_t vector_sum_1 = 0; + int32_t vector_sum_2 = 0; + int32_t vector_sum_3 = 0; + + uint32_t col_cnt = (uint32_t)vector_cols; + + for (int i = 0; i < col_loop_cnt; i++) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + + const int8x16_t ker_0 = vldrbq_z_s8(vector_0, p); + vector_sum_0 = vaddvaq_s8(vector_sum_0, ker_0); + + const int8x16_t ker_1 = vldrbq_z_s8(vector_1, p); + vector_sum_1 = vaddvaq_s8(vector_sum_1, ker_1); + + const int8x16_t ker_2 = vldrbq_z_s8(vector_2, p); + vector_sum_2 = vaddvaq_s8(vector_sum_2, ker_2); + + const int8x16_t ker_3 = vldrbq_z_s8(vector_3, p); + vector_sum_3 = vaddvaq_s8(vector_sum_3, ker_3); + + vector_0 += 16; + vector_1 += 16; + vector_2 += 16; + vector_3 += 16; + } + vector_data += 4 * vector_cols; + + vector_sum_buf[0] = vector_sum_0; + vector_sum_buf[1] = vector_sum_1; + vector_sum_buf[2] = vector_sum_2; + vector_sum_buf[3] = vector_sum_3; + vector_sum_buf += 4; + } + + const int32_t loop_cnt = vector_rows % 4; + + for (int i_row_loop_cnt = 0; i_row_loop_cnt < loop_cnt; i_row_loop_cnt++) + { + const int32_t col_loop_cnt = (vector_cols + 15) / 16; + + const int8_t *vector_0 = vector_data; + + int32_t vector_sum_0 = 0; + + uint32_t col_cnt = (uint32_t)vector_cols; + + for (int i = 0; i < col_loop_cnt; i++) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + + const int8x16_t ker_0 = vldrbq_z_s8(vector_0, p); + vector_sum_0 = vaddvaq_s8(vector_sum_0, ker_0); + + vector_0 += 16; + } + vector_data += vector_cols; + + vector_sum_buf[i_row_loop_cnt] = vector_sum_0; + } + + return (ARM_CMSIS_NN_SUCCESS); +#else + (void)vector_sum_buf; + (void)vector_rows; + (void)vector_cols; + (void)vector_data; + + return (ARM_CMSIS_NN_NO_IMPL_ERROR); +#endif +} + +/** + * @} end of FC group + */ diff --git a/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c b/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c index 8568676c..1287d00a 100644 --- a/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c +++ b/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c @@ -57,6 +57,7 @@ #endif arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, const
int8_t *rhs, + const int32_t *kernel_sum, const int32_t *bias, int8_t *dst, const int32_t lhs_offset, @@ -70,7 +71,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, const int32_t address_offset) { #if defined(ARM_MATH_MVEI) - const int32_t row_loop_cnt = rhs_rows / 3; + const int32_t row_loop_cnt = rhs_rows / 4; const uint32x4_t address_offset_array = {0, address_offset, address_offset * 2, address_offset * 3}; for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) @@ -78,6 +79,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, int32_t acc_0 = 0; int32_t acc_1 = 0; int32_t acc_2 = 0; + int32_t acc_3 = 0; const int32_t col_loop_cnt = (rhs_cols + 15) / 16; @@ -85,15 +87,14 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, const int8_t *rhs_0 = rhs; const int8_t *rhs_1 = rhs + rhs_cols; const int8_t *rhs_2 = rhs + 2 * rhs_cols; + const int8_t *rhs_3 = rhs + 3 * rhs_cols; - int32_t rhs_sum_0 = 0; - int32_t rhs_sum_1 = 0; - int32_t rhs_sum_2 = 0; if (bias) { acc_0 = *bias++; acc_1 = *bias++; acc_2 = *bias++; + acc_3 = *bias++; } uint32_t col_cnt = (uint32_t)rhs_cols; @@ -106,53 +107,48 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, const int8x16_t input = vldrbq_z_s8(lhs_vec, p); const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p); - rhs_sum_0 = vaddvaq_s8(rhs_sum_0, ker_0); acc_0 = vmladavaq_s8(acc_0, ker_0, input); const int8x16_t ker_1 = vldrbq_z_s8(rhs_1, p); - rhs_sum_1 = vaddvaq_s8(rhs_sum_1, ker_1); acc_1 = vmladavaq_s8(acc_1, ker_1, input); const int8x16_t ker_2 = vldrbq_z_s8(rhs_2, p); - rhs_sum_2 = vaddvaq_s8(rhs_sum_2, ker_2); acc_2 = vmladavaq_s8(acc_2, ker_2, input); + const int8x16_t ker_3 = vldrbq_z_s8(rhs_3, p); + acc_3 = vmladavaq_s8(acc_3, ker_3, input); + lhs_vec += 16; rhs_0 += 16; rhs_1 += 16; rhs_2 += 16; + rhs_3 += 16; } - rhs += 3 * rhs_cols; + rhs += 4 * rhs_cols; - int32x4_t acc = {acc_0, acc_1, acc_2, 0}; - const int32x4_t rhs_sum = {rhs_sum_0, rhs_sum_1, rhs_sum_2, 0}; + int32x4_t acc = {acc_0, acc_1, acc_2, acc_3}; + + const int32x4_t rhs_sum = {kernel_sum[0], kernel_sum[1], kernel_sum[2], kernel_sum[3]}; acc += vdupq_n_s32(lhs_offset) * rhs_sum; + kernel_sum += 4; acc = arm_requantize_mve(acc, dst_multiplier, dst_shift); acc = vaddq_s32(acc, vdupq_n_s32(dst_offset)); acc = vmaxq_s32(acc, vdupq_n_s32(activation_min)); acc = vminq_s32(acc, vdupq_n_s32(activation_max)); - const mve_pred16_t p = vctp32q(3); - if (address_offset > 1L) - { - vstrbq_scatter_offset_p_s32(dst, address_offset_array, acc, p); - } - else - { - vstrbq_p_s32(dst, acc, p); - } - dst += 3 * address_offset; + vstrbq_scatter_offset_s32(dst, address_offset_array, acc); + + dst += 4 * address_offset; } - const int loop_cnt = rhs_rows % 3; + const int loop_cnt = rhs_rows % 4; for (int i_row_loop_cnt = 0; i_row_loop_cnt < loop_cnt; i_row_loop_cnt++) { int32_t acc_0 = 0; const int32_t col_loop_cnt = (rhs_cols + 15) / 16; const int8_t *lhs_vec = lhs; const int8_t *rhs_0 = rhs; - int32_t rhs_sum_0 = 0; uint32_t col_cnt = (uint32_t)rhs_cols; for (int i = 0; i < col_loop_cnt; i++) @@ -162,7 +158,6 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, const int8x16_t input = vldrbq_z_s8(lhs_vec, p); const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p); - rhs_sum_0 = vaddvaq_s8(rhs_sum_0, ker_0); acc_0 = vmladavaq_s8(acc_0, ker_0, input); lhs_vec += 16; @@ -175,7 +170,8 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, acc_0 += *bias; bias++; } - const int32_t offsets = rhs_sum_0 * lhs_offset; + 
const int32_t rhs_sum = kernel_sum[i_row_loop_cnt]; + const int32_t offsets = rhs_sum * lhs_offset; acc_0 += offsets; acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); acc_0 += dst_offset; @@ -187,6 +183,8 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, } #elif defined(ARM_MATH_DSP) + (void)kernel_sum; + const int32_t row_loop_cnt = rhs_rows / 2; const int16_t lhs_offset_s16 = (int16_t)lhs_offset; const uint32_t lhs_offset_s16x2 = PKHBT(lhs_offset_s16, lhs_offset_s16, 16); @@ -302,6 +300,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, } #else + (void)kernel_sum; const int32_t row_loop_cnt = rhs_rows / 3; diff --git a/Source/SVDFunctions/arm_svdf_get_buffer_sizes_s8.c b/Source/SVDFunctions/arm_svdf_get_buffer_sizes_s8.c new file mode 100644 index 00000000..44b43757 --- /dev/null +++ b/Source/SVDFunctions/arm_svdf_get_buffer_sizes_s8.c @@ -0,0 +1,64 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_svdf_get_buffer_sizes_s8.c + * Description: Collection of get buffer size functions for svdf s8 layer function. 
+ * + * $Date: 5 September 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup SVDF + */ + +/** + * @addtogroup GetBufferSizeSVDF + * @{ + */ + +int32_t arm_svdf_s8_get_buffer_size_dsp(const cmsis_nn_dims *weights_feature_dims) +{ + (void)weights_feature_dims; + return 0; +} + +int32_t arm_svdf_s8_get_buffer_size_mve(const cmsis_nn_dims *weights_feature_dims) +{ + return weights_feature_dims->n * sizeof(int32_t); +} + +int32_t arm_svdf_s8_get_buffer_size(const cmsis_nn_dims *weights_feature_dims) +{ +#if defined(ARM_MATH_MVEI) + return arm_svdf_s8_get_buffer_size_mve(weights_feature_dims); +#else + return arm_svdf_s8_get_buffer_size_dsp(weights_feature_dims); +#endif +} + +/** + * @} end of GetBufferSizeSVDF group + */ diff --git a/Source/SVDFunctions/arm_svdf_s8.c b/Source/SVDFunctions/arm_svdf_s8.c index faab5a49..a97ce3ae 100644 --- a/Source/SVDFunctions/arm_svdf_s8.c +++ b/Source/SVDFunctions/arm_svdf_s8.c @@ -21,8 +21,8 @@ * Title: arm_svdf_s8.c * Description: S8 basic SVDF layer function * - * $Date: 5 January 2023 - * $Revision: V.5.1.0 + * $Date: 5 September 2023 + * $Revision: V.6.0.0 * * Target : Arm(R) M-Profile Architecture * @@ -47,7 +47,8 @@ * */ -arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, +arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *ctx, + const cmsis_nn_context *input_ctx, const cmsis_nn_context *output_ctx, const cmsis_nn_svdf_params *svdf_params, const cmsis_nn_per_tensor_quant_params *input_quant_params, @@ -69,6 +70,13 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, (void)state_dims; (void)output_dims; +#if defined(ARM_MATH_MVEI) + if (ctx->buf == NULL) + { + return (ARM_CMSIS_NN_ARG_ERROR); + } +#endif + const int32_t multiplier_in = input_quant_params->multiplier; const int32_t shift_in = input_quant_params->shift; const int32_t multiplier_out = output_quant_params->multiplier; @@ -99,6 +107,8 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, } int32_t *buffer_b = (int32_t *)output_ctx->buf; + int32_t *kernel_sum_data = (int32_t *)ctx->buf; + // Left shift state memmove((int8_t *)state_data, (int8_t *)state_data + 1, @@ -108,11 +118,11 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, for (int i_batch = 0; i_batch < input_batches; i_batch++) { int8_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1); - const int8_t *weight = weights_feature_data; const int8_t *input = input_data + i_batch * input_height; arm_cmsis_nn_status res = arm_nn_vec_mat_mult_t_s8(input, - weight, + weights_feature_data, + kernel_sum_data, NULL, res_ptr, -zp_in, diff --git a/Tests/UnitTest/TestCases/TestData/ds_cnn_s/layer_12_fully_connected_kernel_sums_data.h b/Tests/UnitTest/TestCases/TestData/ds_cnn_s/layer_12_fully_connected_kernel_sums_data.h new file mode 100644 index 00000000..917f9f58 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/ds_cnn_s/layer_12_fully_connected_kernel_sums_data.h @@ -0,0 +1,24 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +// This is the output of arm_vector_sum_s8() from the ds_cnn_s model. +int32_t ds_cnn_s_layer_12_fully_connected_kernel_sums[12] = + {-2931, 80, -521, -390, -576, -255, -464, -470, -486, -502, -490, -234}; diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/biases_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/biases_data.h new file mode 100644 index 00000000..ce036577 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/biases_data.h @@ -0,0 +1,6 @@ +// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0). +// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None. +#pragma once +#include + +const int32_t svdf_int8_2_biases[13] = {-108, -78, -29, -5, 25, -113, 122, -68, -32, -57, -59, -14, 13}; diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/config_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/config_data.h new file mode 100644 index 00000000..b9491339 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/config_data.h @@ -0,0 +1,19 @@ +// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0). +// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None. +#pragma once +#define SVDF_INT8_2_MULTIPLIER_IN 1717987072 +#define SVDF_INT8_2_MULTIPLIER_OUT 1099511552 +#define SVDF_INT8_2_SHIFT_1 -3 +#define SVDF_INT8_2_SHIFT_2 -11 +#define SVDF_INT8_2_IN_ACTIVATION_MIN -128 +#define SVDF_INT8_2_IN_ACTIVATION_MAX 127 +#define SVDF_INT8_2_RANK 2 +#define SVDF_INT8_2_FEATURE_BATCHES 26 +#define SVDF_INT8_2_TIME_BATCHES 3 +#define SVDF_INT8_2_INPUT_SIZE 40 +#define SVDF_INT8_2_DST_SIZE 26 +#define SVDF_INT8_2_OUT_ACTIVATION_MIN -128 +#define SVDF_INT8_2_OUT_ACTIVATION_MAX 127 +#define SVDF_INT8_2_INPUT_BATCHES 2 +#define SVDF_INT8_2_INPUT_OFFSET -12 +#define SVDF_INT8_2_OUTPUT_OFFSET 0 diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/input_sequence_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/input_sequence_data.h new file mode 100644 index 00000000..92b96632 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/input_sequence_data.h @@ -0,0 +1,18 @@ +// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0). +// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None. 
+#pragma once +#include + +const int8_t svdf_int8_2_input_sequence[240] = { + -5, -114, -7, -41, 22, -57, 100, -105, -75, -113, 55, -61, -87, -46, -6, 28, -105, 87, 95, -72, + -85, -49, -71, 67, -29, -99, 22, -47, -78, -92, -114, -85, -10, -9, 26, 62, -6, -106, -96, -117, + -90, 16, -90, -15, 63, -3, 63, 24, 119, -25, -19, 38, 119, 12, -25, -32, 125, 3, -127, 40, + -85, 27, 80, -73, -9, -14, -95, -64, 15, -99, 88, 39, 32, -7, -18, 38, -93, -81, -65, 108, + -33, -31, -3, -104, 103, 108, 116, -33, 122, -128, 90, -28, 116, 93, -107, 122, 118, -10, -124, -43, + -1, 12, -48, 117, -54, -42, 57, 106, -48, -12, 90, 121, -52, 17, -73, -127, 34, -72, 72, -97, + 2, 75, -82, 45, -3, -42, 82, -71, -109, -75, -101, 70, -128, -128, -79, -96, 61, 59, -4, -95, + -92, 77, -7, 67, 99, 110, 49, -102, -17, -103, -13, -6, -118, -23, -33, -68, -25, -50, 68, -68, + 57, 120, 74, 50, 97, 36, -114, -120, -73, -70, -23, -10, 81, 22, 74, 43, -112, 35, 122, -65, + -1, 95, 69, -27, 84, 99, -67, -71, -119, 89, 43, -107, -74, 105, -48, -25, 86, 109, -92, -29, + 80, 44, -61, 15, 119, 109, -28, -67, 84, -76, 65, 95, 60, -110, -97, 32, -52, -127, -46, 78, + 42, 116, -113, -26, 96, 27, 93, -70, -41, -11, 34, 53, -39, 126, -65, -28, -83, 4, 14, -98}; diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/output_ref_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/output_ref_data.h new file mode 100644 index 00000000..87d60d88 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/output_ref_data.h @@ -0,0 +1,7 @@ +// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0). +// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None. +#pragma once +#include + +const int8_t svdf_int8_2_output_ref[26] = {5, -9, 0, 7, 2, -3, 5, 0, 2, -10, 1, -5, -1, + -5, -9, 0, 1, 3, -2, 9, -3, 5, 4, 4, 7, 9}; diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/state_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/state_data.h new file mode 100644 index 00000000..8653560e --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/state_data.h @@ -0,0 +1,11 @@ +// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0). +// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None. +#pragma once +#include + +const int8_t svdf_int8_2_state[156] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/test_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/test_data.h new file mode 100644 index 00000000..0d467e1d --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/test_data.h @@ -0,0 +1,9 @@ +// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0). +// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None. 
+#include "biases_data.h" +#include "config_data.h" +#include "input_sequence_data.h" +#include "output_ref_data.h" +#include "state_data.h" +#include "weights_feature_data.h" +#include "weights_time_data.h" diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/weights_feature_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/weights_feature_data.h new file mode 100644 index 00000000..b91c1b1f --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/weights_feature_data.h @@ -0,0 +1,61 @@ +// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0). +// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None. +#pragma once +#include + +const int8_t svdf_int8_2_weights_feature[1040] = { + -94, -100, -91, -31, 60, -96, -93, 84, 16, -64, -79, -109, 48, 65, 118, 117, -110, -19, 29, + 83, 21, 27, -85, -40, -16, 31, 52, 79, 91, 14, -62, 100, 30, 6, -83, -44, -75, -44, + -45, -108, 7, 52, 15, -101, -30, -106, -62, 20, -51, 102, 74, -75, 54, -112, -1, -99, 122, + -113, -33, -29, 84, 102, -125, -31, -66, -34, -61, -80, 3, 66, -84, -112, -116, 19, 88, -113, + -37, -3, -82, 78, 22, 105, -97, 117, -79, 99, 78, -61, 68, 106, 99, 12, 20, 68, 16, + -26, -2, 104, 108, 115, -59, -78, -32, 78, -51, -13, 51, -81, -7, 74, 55, -13, -71, 48, + -14, -37, 43, -27, -115, -75, -22, 92, 99, 11, -31, -23, 24, 47, 41, -29, -71, -78, -92, + 0, 13, 36, -32, -47, 15, 37, -37, -76, 68, -105, 28, 91, 107, 123, -40, 123, 96, -43, + 13, -113, 55, -124, -31, -25, -118, 92, 66, -13, 43, 65, -108, 93, 47, 71, -57, -85, 3, + -95, 100, 18, 60, -92, 43, 79, -94, 106, -26, -66, -72, -74, -8, 47, -57, -49, 13, -69, + 28, 62, 90, 18, -52, 6, -70, 22, 109, -114, 18, -32, -108, 16, -31, 101, -103, 106, 122, + 115, -94, 35, -31, -105, -26, -107, 58, -19, 99, 124, 27, -16, -10, 48, 18, 65, -30, 56, + 119, -40, -107, -27, -83, 107, 117, 56, -30, -19, 68, 117, 29, -95, -95, 76, 65, 19, 105, + -51, 87, -62, 91, -84, -113, 106, -116, 57, 67, 71, -23, -79, -61, -56, -100, -86, -74, 114, + -50, -110, 107, -70, -47, -65, 56, 90, -86, 8, 1, -39, 41, 22, 24, -87, -104, 43, -17, + -102, -114, 87, -117, -106, -101, 4, 78, -64, 115, 88, 70, -66, -42, -46, 80, -45, -78, -43, + 113, -31, 58, -89, 49, -83, -111, -126, 126, 100, -87, -13, 122, -94, -90, -91, -13, 5, 105, + -75, 25, -12, -3, 101, -72, -55, 54, 112, 49, 0, 92, -114, -75, -16, 20, -24, 29, -34, + 114, 101, -69, -92, 68, -25, 95, 117, 87, 71, 106, 42, -108, -118, 21, 112, -10, 43, -71, + -65, 112, 61, 40, 2, 76, 21, 32, 7, 26, -101, 70, 83, 29, 44, -89, 44, 58, 104, + 54, 111, 17, 42, 6, 40, 63, -61, 41, 103, 30, -52, 0, -118, -3, 0, -74, 62, 3, + 91, 90, 46, 88, 62, -97, 73, 55, -116, -104, 57, 114, -104, 71, -80, 61, -120, -18, 48, + -22, 24, 14, 110, 86, 60, -60, -80, -110, 101, -22, -38, -3, -30, 66, 28, -64, 101, -79, + 31, 39, -82, -58, 111, 33, 73, -21, 123, -104, -35, -111, -70, 24, 118, 48, -60, -11, -70, + 18, 27, 106, 56, 82, 44, -43, -13, 103, 84, -114, 56, -38, -4, 15, -90, -17, -23, -58, + -21, -112, 13, -118, -5, 32, -81, -78, 55, -26, 96, -66, -63, -94, -120, 66, 99, 15, -126, + 97, 49, 38, -22, 97, -36, -111, -106, 86, -75, 87, 89, -107, -56, -36, 126, 29, 40, 35, + 12, -122, -35, -42, 21, -99, -27, 5, 45, 56, -38, 37, -88, -96, -116, 109, 90, 101, 43, + 38, -123, 102, -109, -41, -22, -113, 90, 43, -12, -126, 27, 82, -76, -29, -80, 10, 21, 53, + 87, -3, 30, 99, -55, 32, -72, -5, -116, 2, -70, 122, 51, 50, 45, -14, -85, 89, 13, + 62, 108, 42, 22, -126, 117, -28, -127, -3, -26, 100, -63, -65, 17, 41, 
6, 100, -93, 9, + 115, 12, 20, 31, 14, -9, 81, -81, -87, 37, -96, -72, 15, 52, 82, -121, -1, -26, -120, + -48, 121, -21, -107, -97, 44, 56, -48, -52, -105, 101, 53, 56, -11, 77, 124, 102, 67, -19, + 120, -67, -85, -2, 76, 24, 9, -124, -54, -62, -73, 110, -96, -64, 112, 10, 44, 78, -59, + -82, 111, 35, -39, 19, -25, -16, -28, -39, -108, 93, -31, 98, -88, 33, 82, 62, 112, -121, + -109, 104, 66, 93, -46, 11, 52, 31, -58, -70, -93, 99, -83, -56, 118, -109, 5, 25, 111, + -30, -100, -69, -22, -26, 97, -78, 76, 21, 24, 83, -77, 80, -42, 111, -10, -113, 104, 112, + -77, 92, -82, 106, 125, 17, -81, -1, -86, -102, 1, 63, -46, -105, 84, 55, 22, -53, -18, + 110, 15, 35, 5, -40, 81, 71, 100, 93, -3, 81, -53, 66, -57, 56, -40, -49, -42, 20, + -100, 43, 93, -13, 89, -121, -101, 113, 20, 74, -86, -77, 93, -109, 6, 15, 91, -75, -66, + 31, 28, 62, 41, -95, -119, -4, -104, 34, 2, -26, -118, 22, -62, -102, 19, -73, 29, -126, + -79, -96, 52, 97, 83, -11, -23, 61, 60, -77, 7, 71, -36, 106, -93, -1, -126, -64, 35, + -124, 37, 90, -59, 6, 2, -2, 2, 7, 11, -83, -117, -67, 95, -25, 43, 105, -20, -9, + -38, 84, 108, -55, -115, -46, -68, 112, -93, 109, -61, -32, -62, 41, -37, 28, -13, 8, -51, + 114, 43, 39, 39, -37, 17, 53, 126, 41, 79, 99, -80, -96, -15, 64, 85, -124, 94, -6, + 19, -54, -83, -66, 78, 54, 16, -108, 69, -65, -74, -6, 120, -121, -38, -85, -12, 5, 20, + -91, 33, -63, 90, -84, -21, 111, 110, 98, -49, -125, -74, 48, -52, -98, 102, 64, -87, -39, + 16, 75, 82, 40, -39, -96, 8, -92, 101, 82, 38, -46, -83, -76, 99, -79, 28, 100, 76, + 123, 81, -61, 39, 94, -87, 121, 112, 59, -13, 48, 21, -17, -17, -81, 91, -52, 57, -117, + -100, 99, 92, 95, 8, -83, -68, -62, -21, 100, -4, -23, -42, 36, -94, 58, 5, 35, -70, + 45, 126, 112, 75, 110, -73, 84, 70, -108, -83, -36, -11, -110, -79, -34, -18, 11, -84, 53, + 84, 10, 49, -17, -62, -72, 103, 122, -66, 102, -7, 71, 5, -78, -49, 91, 39, -98, -74, + 71, 74, -12, 17, 1, -98, -28, 11, 12, -91, -69, 15, 3, -104, 109, 12, 48, -64, -56, + -5, -33, -74, -33, -8, -63, 88, -126, -61, 70, -98, -12, 15, 19, 29, -107, 107, -8, -24, + 55, 26, 119, -43, 87, 4, 48, -11, -126, -40, 107, 122, -92, -27}; diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/weights_time_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/weights_time_data.h new file mode 100644 index 00000000..e3eff344 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/weights_time_data.h @@ -0,0 +1,10 @@ +// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0). +// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None. 
+#pragma once +#include + +const int8_t svdf_int8_2_weights_time[78] = { + 98, 57, 10, 25, 18, -46, 8, -55, -128, 57, 49, -13, 3, -5, 101, -128, -13, 45, 67, 13, + -49, 111, 73, 48, -113, -17, -83, -49, 71, 4, -74, 86, 54, -29, 6, -20, 94, 124, 113, -69, + 34, 21, 110, 71, 68, 8, 80, 38, -94, 82, -20, 4, 10, -124, 86, 110, -33, -3, 12, -99, + -116, -107, 74, 45, -44, 51, 114, 78, -116, 82, -48, -100, -107, 84, 110, -118, 32, 105}; diff --git a/Tests/UnitTest/TestCases/test_arm_ds_cnn_l_s8/test_arm_ds_cnn_l_s8.c b/Tests/UnitTest/TestCases/test_arm_ds_cnn_l_s8/test_arm_ds_cnn_l_s8.c index 9d26e64c..1e6eafd9 100644 --- a/Tests/UnitTest/TestCases/test_arm_ds_cnn_l_s8/test_arm_ds_cnn_l_s8.c +++ b/Tests/UnitTest/TestCases/test_arm_ds_cnn_l_s8/test_arm_ds_cnn_l_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -469,6 +469,10 @@ void ds_cnn_l_s8_inference(void) bias_dims.c = in_out_dim_1.c; +#if defined(ARM_MATH_MVEI) + arm_vector_sum_s8(ctx.buf, conv_filter_dims.n, in_out_dim_1.c, ds_cnn_l_layer_14_fully_connected_weights); +#endif + status |= arm_fully_connected_s8(&ctx, &fc_params, &per_tensor_quant_params, diff --git a/Tests/UnitTest/TestCases/test_arm_ds_cnn_s_s8/test_arm_ds_cnn_s_s8.c b/Tests/UnitTest/TestCases/test_arm_ds_cnn_s_s8/test_arm_ds_cnn_s_s8.c index 21fb972d..5862df7b 100644 --- a/Tests/UnitTest/TestCases/test_arm_ds_cnn_s_s8/test_arm_ds_cnn_s_s8.c +++ b/Tests/UnitTest/TestCases/test_arm_ds_cnn_s_s8/test_arm_ds_cnn_s_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -19,6 +19,7 @@ #include "arm_nnfunctions.h" #include "unity.h" +#include "../TestData/ds_cnn_s/layer_12_fully_connected_kernel_sums_data.h" #include "../TestData/ds_cnn_s/test_data.h" #include "../Utils/validate.h" @@ -107,6 +108,7 @@ void ds_cnn_s_s8_inference(void) /* Test for a complete int8 DS_CNN_S keyword spotting network from https://github.com/ARM-software/ML-zoo & * Tag: 22.02 */ cmsis_nn_context ctx; + cmsis_nn_context ctx_kernel_sum; const arm_cmsis_nn_status expected = ARM_CMSIS_NN_SUCCESS; ctx.size = ds_cnn_s_s8_get_buffer_size(); @@ -408,7 +410,13 @@ void ds_cnn_s_s8_inference(void) bias_dims.c = in_out_dim_1.c; - status |= arm_fully_connected_s8(&ctx, +#if defined(ARM_MATH_MVEI) + ctx_kernel_sum.buf = ds_cnn_s_layer_12_fully_connected_kernel_sums; +#else + ctx_kernel_sum = ctx; +#endif + + status |= arm_fully_connected_s8(&ctx_kernel_sum, &fc_params, &per_tensor_quant_params, &in_out_dim_0, diff --git a/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/test_arm_fully_connected_s8.c b/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/test_arm_fully_connected_s8.c index 75393c34..00575a14 100644 --- a/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/test_arm_fully_connected_s8.c +++ b/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/test_arm_fully_connected_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -64,10 +64,15 @@ void fully_connected_arm_fully_connected_s8(void) quant_params.multiplier = FULLY_CONNECTED_OUTPUT_MULTIPLIER; quant_params.shift = 
FULLY_CONNECTED_OUTPUT_SHIFT; - int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); + const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); ctx.buf = malloc(buf_size); ctx.size = buf_size; +#if defined(ARM_MATH_MVEI) + int32_t *buf = ctx.buf; + TEST_ASSERT_EQUAL(expected, arm_vector_sum_s8(buf, filter_dims.n, output_dims.c, kernel_data)); +#endif + arm_cmsis_nn_status result = arm_fully_connected_s8(&ctx, &fc_params, &quant_params, @@ -122,9 +127,15 @@ void fully_connected_mve_0_arm_fully_connected_s8(void) quant_params.multiplier = FULLY_CONNECTED_MVE_0_OUTPUT_MULTIPLIER; quant_params.shift = FULLY_CONNECTED_MVE_0_OUTPUT_SHIFT; - int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); + const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); ctx.buf = malloc(buf_size); ctx.size = buf_size; + +#if defined(ARM_MATH_MVEI) + int32_t *buf = ctx.buf; + TEST_ASSERT_EQUAL(expected, arm_vector_sum_s8(buf, filter_dims.n, output_dims.c, kernel_data)); +#endif + arm_cmsis_nn_status result = arm_fully_connected_s8(&ctx, &fc_params, &quant_params, @@ -178,9 +189,15 @@ void fully_connected_mve_1_arm_fully_connected_s8(void) quant_params.multiplier = FULLY_CONNECTED_MVE_1_OUTPUT_MULTIPLIER; quant_params.shift = FULLY_CONNECTED_MVE_1_OUTPUT_SHIFT; - int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); + const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); ctx.buf = malloc(buf_size); ctx.size = buf_size; + +#if defined(ARM_MATH_MVEI) + int32_t *buf = ctx.buf; + TEST_ASSERT_EQUAL(expected, arm_vector_sum_s8(buf, filter_dims.n, output_dims.c, kernel_data)); +#endif + arm_cmsis_nn_status result = arm_fully_connected_s8(&ctx, &fc_params, &quant_params, @@ -245,9 +262,15 @@ void fully_connected_null_bias_0_arm_fully_connected_s8(void) } TEST_ASSERT_EQUAL(expected, ip_check); - int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); + const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); ctx.buf = malloc(buf_size); ctx.size = buf_size; + +#if defined(ARM_MATH_MVEI) + int32_t *buf = ctx.buf; + TEST_ASSERT_EQUAL(expected, arm_vector_sum_s8(buf, filter_dims.n, output_dims.c, kernel_data)); +#endif + arm_cmsis_nn_status result = arm_fully_connected_s8(&ctx, &fc_params, &quant_params, @@ -301,9 +324,15 @@ void fully_connected_out_activation_arm_fully_connected_s8(void) quant_params.multiplier = FULLY_CONNECTED_OUT_ACTIVATION_OUTPUT_MULTIPLIER; quant_params.shift = FULLY_CONNECTED_OUT_ACTIVATION_OUTPUT_SHIFT; - int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); + const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); ctx.buf = malloc(buf_size); ctx.size = buf_size; + +#if defined(ARM_MATH_MVEI) + int32_t *buf = ctx.buf; + TEST_ASSERT_EQUAL(expected, arm_vector_sum_s8(buf, filter_dims.n, output_dims.c, kernel_data)); +#endif + arm_cmsis_nn_status result = arm_fully_connected_s8(&ctx, &fc_params, &quant_params, diff --git a/Tests/UnitTest/TestCases/test_arm_svdf_s8/Unity/unity_test_arm_svdf_s8.c b/Tests/UnitTest/TestCases/test_arm_svdf_s8/Unity/unity_test_arm_svdf_s8.c index ff493a4f..6db62bf9 100644 --- a/Tests/UnitTest/TestCases/test_arm_svdf_s8/Unity/unity_test_arm_svdf_s8.c +++ b/Tests/UnitTest/TestCases/test_arm_svdf_s8/Unity/unity_test_arm_svdf_s8.c @@ -45,3 +45,4 @@ void setUp(void) void tearDown(void) {} void test_svdf_int8_arm_s8(void) { svdf_int8_arm_svdf_s8(); } +void 
test_svdf_int8_2_arm_s8(void) { svdf_int8_2_arm_svdf_s8(); } diff --git a/Tests/UnitTest/TestCases/test_arm_svdf_s8/test_arm_svdf_s8.c b/Tests/UnitTest/TestCases/test_arm_svdf_s8/test_arm_svdf_s8.c index b71d5a5d..43f7d26e 100644 --- a/Tests/UnitTest/TestCases/test_arm_svdf_s8/test_arm_svdf_s8.c +++ b/Tests/UnitTest/TestCases/test_arm_svdf_s8/test_arm_svdf_s8.c @@ -20,6 +20,7 @@ #include #include "../TestData/svdf_int8/test_data.h" +#include "../TestData/svdf_int8_2/test_data.h" #include "../Utils/validate.h" #define REPEAT_NUM (1) @@ -68,6 +69,16 @@ void svdf_int8_arm_svdf_s8(void) const int scratch_size = SVDF_INT8_INPUT_BATCHES * SVDF_INT8_FEATURE_BATCHES * sizeof(int32_t); const int scratch_size_out = SVDF_INT8_INPUT_BATCHES * number_units * sizeof(int32_t); + cmsis_nn_context ctx; + const int32_t buf_size = arm_svdf_s8_get_buffer_size(&weights_feature_dims); + ctx.buf = malloc(buf_size); + ctx.size = buf_size; + +#if defined(ARM_MATH_MVEI) + int32_t *kernel_sum_buf = ctx.buf; + arm_vector_sum_s8(kernel_sum_buf, input_dims.h, weights_feature_dims.n, weights_feature_data); +#endif + // + SVDF_INT8_TIME_BATCHES additional bytes to make sure it is not overwritten const int state_data_size = sizeof(svdf_int8_state) + SVDF_INT8_TIME_BATCHES; const int8_t initial_data = 66; @@ -86,7 +97,8 @@ void svdf_int8_arm_svdf_s8(void) for (int j = 0; j < number_inputs; j++) { memcpy(input_data, svdf_int8_input_sequence + j * input_round_size, input_round_size); - arm_cmsis_nn_status result = arm_svdf_s8(&input_ctx, + arm_cmsis_nn_status result = arm_svdf_s8(&ctx, + &input_ctx, &output_ctx, &svdf_int8_params, &input_quant_params, @@ -109,6 +121,13 @@ void svdf_int8_arm_svdf_s8(void) TEST_ASSERT_TRUE(validate(output_data, output_ref, output_ref_size)); } + if (ctx.buf) + { + // The caller is responsible to clear the scratch buffers for security reasons if applicable. 
+ memset(ctx.buf, 0, buf_size); + free(ctx.buf); + } + // Make sure state data is not written outside boundary for (int i = sizeof(svdf_int8_state); i < state_data_size; i++) { @@ -120,3 +139,108 @@ void svdf_int8_arm_svdf_s8(void) free(input_ctx.buf); free(output_ctx.buf); } + +void svdf_int8_2_arm_svdf_s8(void) +{ + const int32_t output_ref_size = SVDF_INT8_2_DST_SIZE; + const int8_t *output_ref = svdf_int8_2_output_ref; + const arm_cmsis_nn_status expected = ARM_CMSIS_NN_SUCCESS; + cmsis_nn_context input_ctx; + cmsis_nn_context output_ctx; + cmsis_nn_svdf_params svdf_int8_2_params; + cmsis_nn_dims input_dims; + cmsis_nn_dims weights_feature_dims; + cmsis_nn_dims weights_time_dims; + cmsis_nn_dims state_dims; + cmsis_nn_dims output_dims; + cmsis_nn_dims bias_dims; + cmsis_nn_per_tensor_quant_params input_quant_params; + cmsis_nn_per_tensor_quant_params output_quant_params; + int8_t output_data[SVDF_INT8_2_DST_SIZE] = {1}; + const int8_t *weights_feature_data = svdf_int8_2_weights_feature; + const int8_t *weights_time_data = svdf_int8_2_weights_time; + + input_dims.n = SVDF_INT8_2_INPUT_BATCHES; + input_dims.h = SVDF_INT8_2_INPUT_SIZE; + weights_feature_dims.n = SVDF_INT8_2_FEATURE_BATCHES; + weights_time_dims.h = SVDF_INT8_2_TIME_BATCHES; + + input_quant_params.multiplier = SVDF_INT8_2_MULTIPLIER_IN; + input_quant_params.shift = SVDF_INT8_2_SHIFT_1; + output_quant_params.multiplier = SVDF_INT8_2_MULTIPLIER_OUT; + output_quant_params.shift = SVDF_INT8_2_SHIFT_2; + + svdf_int8_2_params.input_activation.min = SVDF_INT8_2_IN_ACTIVATION_MIN; + svdf_int8_2_params.input_activation.max = SVDF_INT8_2_IN_ACTIVATION_MAX; + svdf_int8_2_params.output_activation.min = SVDF_INT8_2_OUT_ACTIVATION_MIN; + svdf_int8_2_params.output_activation.max = SVDF_INT8_2_OUT_ACTIVATION_MAX; + svdf_int8_2_params.input_offset = SVDF_INT8_2_INPUT_OFFSET; + svdf_int8_2_params.output_offset = SVDF_INT8_2_OUTPUT_OFFSET; + svdf_int8_2_params.rank = SVDF_INT8_2_RANK; + + const int input_round_size = SVDF_INT8_2_INPUT_BATCHES * SVDF_INT8_2_INPUT_SIZE; + const int number_inputs = sizeof(svdf_int8_2_input_sequence) / input_round_size; + const int32_t number_units = SVDF_INT8_2_FEATURE_BATCHES / SVDF_INT8_2_RANK; + const int scratch_size = SVDF_INT8_2_INPUT_BATCHES * SVDF_INT8_2_FEATURE_BATCHES * sizeof(int32_t); + const int scratch_size_out = SVDF_INT8_2_INPUT_BATCHES * number_units * sizeof(int32_t); + + cmsis_nn_context ctx; + const int32_t buf_size = arm_svdf_s8_get_buffer_size(&weights_feature_dims); + ctx.buf = malloc(buf_size); + ctx.size = buf_size; + +#if defined(ARM_MATH_MVEI) + int32_t *kernel_sum_buf = ctx.buf; + arm_vector_sum_s8(kernel_sum_buf, input_dims.h, weights_feature_dims.n, weights_feature_data); +#endif + + const int state_data_size = sizeof(svdf_int8_2_state); + + input_ctx.buf = malloc(scratch_size); + output_ctx.buf = malloc(scratch_size_out); + + int8_t *input_data = malloc(input_round_size); + int8_t *state_data = malloc(state_data_size); + + for (int i = 0; i < REPEAT_NUM; i++) + { + memcpy(state_data, svdf_int8_2_state, sizeof(svdf_int8_2_state)); + for (int j = 0; j < number_inputs; j++) + { + memcpy(input_data, svdf_int8_2_input_sequence + j * input_round_size, input_round_size); + arm_cmsis_nn_status result = arm_svdf_s8(&ctx, + &input_ctx, + &output_ctx, + &svdf_int8_2_params, + &input_quant_params, + &output_quant_params, + &input_dims, + input_data, + &state_dims, + state_data, + &weights_feature_dims, + weights_feature_data, + &weights_time_dims, + weights_time_data, + &bias_dims, + 
svdf_int8_2_biases, + &output_dims, + output_data); + TEST_ASSERT_EQUAL(expected, result); + } + + TEST_ASSERT_TRUE(validate(output_data, output_ref, output_ref_size)); + } + + if (ctx.buf) + { + // The caller is responsible to clear the scratch buffers for security reasons if applicable. + memset(ctx.buf, 0, buf_size); + free(ctx.buf); + } + + free(state_data); + free(input_data); + free(input_ctx.buf); + free(output_ctx.buf); +} diff --git a/Tests/UnitTest/generate_test_data.py b/Tests/UnitTest/generate_test_data.py index cbc154e4..fe3f0775 100755 --- a/Tests/UnitTest/generate_test_data.py +++ b/Tests/UnitTest/generate_test_data.py @@ -1724,6 +1724,22 @@ def load_testdata_sets(regenerate_input, regenerate_weights, regenerate_biases, generate_bias=False, int8_time_weights=True, interpreter=interpreter) + dataset = 'svdf_int8_2' + testdata_sets[dataset] = SVDFSettings(dataset, + type_of_test, + regenerate_weights, + regenerate_input, + regenerate_biases, + schema_file, + batches=2, + number_inputs=3, + rank=2, + memory_size=3, + input_size=40, + number_units=13, + input_zp=-12, + int8_time_weights=True, + interpreter=interpreter) type_of_test = 'add' dataset = 'add'
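Taken together, the fully-connected changes above introduce a new calling convention on MVE builds: arm_fully_connected_s8_get_buffer_size() now returns one int32_t per output channel, and that buffer must be filled with kernel (weight-row) sums via the new arm_vector_sum_s8() before arm_fully_connected_s8() is called (a NULL ctx->buf is rejected with ARM_CMSIS_NN_ARG_ERROR on MVE). The sketch below mirrors the unit-test usage in this patch; it is an illustrative example only, the wrapper name run_fc_s8() is invented here, and the parameter and dimension structs are assumed to be configured by the caller exactly as in the existing tests.

```c
#include <stdlib.h>
#include <string.h>

#include "arm_nnfunctions.h"

/* Hedged sketch (not part of the patch): run one s8 fully-connected layer with
 * the kernel-sum scratch buffer introduced in this change set. The wrapper name
 * is hypothetical; all parameter and dimension structs are assumed to be filled
 * in by the caller, as in the unit tests above. */
arm_cmsis_nn_status run_fc_s8(const cmsis_nn_fc_params *fc_params,
                              const cmsis_nn_per_tensor_quant_params *quant_params,
                              const cmsis_nn_dims *input_dims,
                              const int8_t *input_data,
                              const cmsis_nn_dims *filter_dims,
                              const int8_t *kernel_data,
                              const cmsis_nn_dims *bias_dims,
                              const int32_t *bias_data,
                              const cmsis_nn_dims *output_dims,
                              int8_t *output_data)
{
    cmsis_nn_context ctx;
    const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(filter_dims);

    /* 0 bytes on scalar/DSP builds, one int32_t per output channel on MVE builds. */
    ctx.buf = (buf_size > 0) ? malloc(buf_size) : NULL;
    ctx.size = buf_size;
    if (buf_size > 0 && ctx.buf == NULL)
    {
        return ARM_CMSIS_NN_ARG_ERROR;
    }

#if defined(ARM_MATH_MVEI)
    /* Fill the buffer with per-row sums of the weight matrix before the layer
     * call; arm_fully_connected_s8() rejects a NULL buffer on MVE targets. */
    const arm_cmsis_nn_status sum_status =
        arm_vector_sum_s8(ctx.buf, filter_dims->n, output_dims->c, kernel_data);
    if (sum_status != ARM_CMSIS_NN_SUCCESS)
    {
        free(ctx.buf);
        return sum_status;
    }
#endif

    const arm_cmsis_nn_status result = arm_fully_connected_s8(&ctx,
                                                              fc_params,
                                                              quant_params,
                                                              input_dims,
                                                              input_data,
                                                              filter_dims,
                                                              kernel_data,
                                                              bias_dims,
                                                              bias_data,
                                                              output_dims,
                                                              output_data);

    if (ctx.buf)
    {
        /* Clear before freeing, as the API documentation recommends for
         * security-sensitive deployments. */
        memset(ctx.buf, 0, (size_t)buf_size);
        free(ctx.buf);
    }
    return result;
}
```

arm_svdf_s8() follows the same pattern with its new leading ctx argument: size the buffer with arm_svdf_s8_get_buffer_size(&weights_feature_dims) and, on MVE targets, fill it with arm_vector_sum_s8(ctx.buf, input_dims.h, weights_feature_dims.n, weights_feature_data) before the call, exactly as the svdf_int8 test cases in this patch do.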