diff --git a/ARM.CMSIS-NN.pdsc b/ARM.CMSIS-NN.pdsc
index 573ee1a0..998a176c 100644
--- a/ARM.CMSIS-NN.pdsc
+++ b/ARM.CMSIS-NN.pdsc
@@ -99,6 +99,7 @@
+
diff --git a/Include/arm_nnfunctions.h b/Include/arm_nnfunctions.h
index dbe13bf6..7541a1a8 100644
--- a/Include/arm_nnfunctions.h
+++ b/Include/arm_nnfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
- * $Date: 13 January 2023
- * $Revision: V.11.3.0
+ * $Date: 5 September 2023
+ * $Revision: V.12.0.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
@@ -1032,7 +1032,10 @@ int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dim
* C_OUT : Output depth
* H & W : Not used.
* @param[in, out] output_data Output data pointer. Data type: int8
- * @return The function returns ARM_CMSIS_NN_SUCCESS
+ *
+ * @return The function returns either
+ * ARM_CMSIS_NN_ARG_ERROR if argument constraints fail, or
+ * ARM_CMSIS_NN_SUCCESS on successful completion.
*
* @details
* - Supported framework: TensorFlow Lite
@@ -1049,8 +1052,24 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
const cmsis_nn_dims *output_dims,
int8_t *output_data);
+/**
+ * @brief Calculate vector sums that may be required by arm_fully_connected_s8().
+ * @param[in, out] vector_sum_buf Buffer for vector sums
+ * @param[in] vector_cols Number of vector columns
+ * @param[in] vector_rows Number of vector rows
+ * @param[in] vector_data Vector or weights data
+ * @return The function returns
+ * ARM_CMSIS_NN_SUCCESS - Successful operation, or
+ * ARM_CMSIS_NN_NO_IMPL_ERROR - If no implementation is available (non Arm(R) Helium Architecture build).
+ */
+arm_cmsis_nn_status arm_vector_sum_s8(int32_t *vector_sum_buf,
+ const int32_t vector_cols,
+ const int32_t vector_rows,
+ const int8_t *vector_data);
+
/**
* @brief Get size of additional buffer required by arm_fully_connected_s8().
+ * See also arm_vector_sum_s8, which is required if buffer size is > 0.
* @param[in] filter_dims dimension of filter
* @return The function returns required buffer size in bytes
*
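For reference, a minimal caller-side sketch of the new kernel-sum flow for arm_fully_connected_s8(). It mirrors the updated unit tests further down; the helper name is illustrative and error handling is omitted.

```c
#include <stdlib.h>

#include "arm_nnfunctions.h"

/* Illustrative helper (not part of the patch): runs an s8 fully connected layer and
 * prepares the kernel-sum scratch buffer that the MVE path now expects in ctx->buf. */
static arm_cmsis_nn_status run_fully_connected_s8(const cmsis_nn_fc_params *fc_params,
                                                  const cmsis_nn_per_tensor_quant_params *quant_params,
                                                  const cmsis_nn_dims *input_dims,
                                                  const int8_t *input_data,
                                                  const cmsis_nn_dims *filter_dims,
                                                  const int8_t *kernel_data,
                                                  const cmsis_nn_dims *bias_dims,
                                                  const int32_t *bias_data,
                                                  const cmsis_nn_dims *output_dims,
                                                  int8_t *output_data)
{
    cmsis_nn_context ctx;
    const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(filter_dims);
    ctx.buf = buf_size > 0 ? malloc(buf_size) : NULL;
    ctx.size = buf_size;

#if defined(ARM_MATH_MVEI)
    /* The weights are constant, so the per-row sums only need to be computed once
     * (they could also be cached across inferences). */
    arm_vector_sum_s8((int32_t *)ctx.buf, filter_dims->n, output_dims->c, kernel_data);
#endif

    const arm_cmsis_nn_status status = arm_fully_connected_s8(&ctx,
                                                              fc_params,
                                                              quant_params,
                                                              input_dims,
                                                              input_data,
                                                              filter_dims,
                                                              kernel_data,
                                                              bias_dims,
                                                              bias_data,
                                                              output_dims,
                                                              output_data);
    free(ctx.buf);
    return status;
}
```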
@@ -1851,6 +1870,15 @@ void arm_concatenation_s8_w(const int8_t *input,
/**
* @brief s8 SVDF function with 8 bit state tensor and 8 bit time weights
*
+ * @param[in, out] ctx Function context that contains the additional buffer if required by the function.
+ * arm_svdf_s8_get_buffer_size() will return the buffer size if required.
+ * The caller is expected to clear the buffer, if applicable, for security reasons.
* @param[in] input_ctx Temporary scratch buffer
* The caller is expected to clear the buffer ,if applicable, for security reasons.
* @param[in] output_ctx Temporary output scratch buffer
@@ -1873,12 +1901,15 @@ void arm_concatenation_s8_w(const int8_t *input,
* @param[in] output_dims Output tensor dimensions
* @param[out] output_data Pointer to the output tensor
*
- * @return The function returns ARM_CMSIS_NN_SUCCESS
+ * @return The function returns either
+ * ARM_CMSIS_NN_ARG_ERROR if argument constraints fail, or
+ * ARM_CMSIS_NN_SUCCESS on successful completion.
*
* @details
* 1. Supported framework: TensorFlow Lite micro
*/
-arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx,
+arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *ctx,
+ const cmsis_nn_context *input_ctx,
const cmsis_nn_context *output_ctx,
const cmsis_nn_svdf_params *svdf_params,
const cmsis_nn_per_tensor_quant_params *input_quant_params,
@@ -2012,6 +2043,34 @@ arm_cmsis_nn_status arm_lstm_unidirectional_s16_s8(cmsis_nn_lstm_context *scratc
int16_t *cell_state,
int8_t *output_data);
+/**
+ * @brief Get size of additional buffer required by arm_svdf_s8().
+ * @param[in] filter_dims dimension of filter
+ * @return The function returns required buffer size in bytes
+ *
+ */
+int32_t arm_svdf_s8_get_buffer_size(const cmsis_nn_dims *filter_dims);
+
+/**
+ * @brief Get size of additional buffer required by arm_svdf_s8() for processors with DSP extension.
+ * Refer to arm_svdf_s8_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_svdf_s8_get_buffer_size().
+ *
+ */
+int32_t arm_svdf_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims);
+
+/**
+ * @brief Get size of additional buffer required by arm_svdf_s8() for Arm(R) Helium Architecture case.
+ * Refer to arm_svdf_s8_get_buffer_size() for function argument details.
+ *
+ * @note Intended for compilation on Host. If compiling for an Arm target, use
+ * arm_svdf_s8_get_buffer_size().
+ *
+ */
+int32_t arm_svdf_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims);
+
#ifdef __cplusplus
}
#endif
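Since arm_svdf_s8() now takes an additional leading context argument, existing callers need a small migration step. A sketch of preparing that context, assuming the same setup as the unit tests below (the helper name is illustrative):

```c
#include <stdlib.h>

#include "arm_nnfunctions.h"

/* Illustrative helper (not part of the patch): prepares the new ctx argument of
 * arm_svdf_s8(). On MVE builds it fills the buffer with feature-weight row sums;
 * on other targets the required size is 0 and ctx.buf stays NULL. */
static cmsis_nn_context prepare_svdf_ctx(const cmsis_nn_dims *input_dims,
                                         const cmsis_nn_dims *weights_feature_dims,
                                         const int8_t *weights_feature_data)
{
    cmsis_nn_context ctx;
    const int32_t buf_size = arm_svdf_s8_get_buffer_size(weights_feature_dims);
    ctx.buf = buf_size > 0 ? malloc(buf_size) : NULL;
    ctx.size = buf_size;

#if defined(ARM_MATH_MVEI)
    /* One sum per feature-weight row; the column count equals the input size (input_dims->h). */
    arm_vector_sum_s8((int32_t *)ctx.buf, input_dims->h, weights_feature_dims->n, weights_feature_data);
#else
    (void)input_dims;
    (void)weights_feature_data;
#endif
    return ctx;
}
```

The returned context is then passed as the new first argument of arm_svdf_s8(); the remaining arguments are unchanged.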
diff --git a/Include/arm_nnsupportfunctions.h b/Include/arm_nnsupportfunctions.h
index 09f5bc84..04feada6 100644
--- a/Include/arm_nnsupportfunctions.h
+++ b/Include/arm_nnsupportfunctions.h
@@ -394,6 +394,7 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const int8_t *lhs,
*/
arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs,
const int8_t *rhs,
+ const int32_t *kernel_sum,
const int32_t *bias,
int8_t *dst,
const int32_t lhs_offset,
diff --git a/README.md b/README.md
index 97d0afb1..e713f1f6 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,6 @@ processors here are Cortex-M4 or a Cortex-M33 configured with optional DSP exten
Processors with Arm Helium Technology use the Arm M-profile Vector Extension(MVE) instructions for optimization.
Examples are Cortex-M55 or Cortex-M85 configured with MVE.
-
| Operator        | C <br> int8 | C <br> int16 | DSP <br> int8 | DSP <br> int16 | MVE <br> int8 | MVE <br> int16 |
| --------------- | ----------- | ---------- | ----------- | ------------ | ----------- | ------------ |
| Conv2D | Yes | Yes | Yes | Yes | Yes | Yes |
diff --git a/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c b/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c
index 07618742..34b2abff 100644
--- a/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c
+++ b/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c
@@ -21,8 +21,8 @@
* Title: arm_fully_connected_get_buffer_sizes_s8.c
* Description: Collection of get buffer size functions for fully connected s8 layer function.
*
- * $Date: 31 January 2023
- * $Revision: V.1.0.0
+ * $Date: 15 August 2023
+ * $Revision: V.1.1.0
*
* Target : Arm(R) M-Profile Architecture
*
@@ -39,20 +39,24 @@
* @{
*/
-int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims)
+int32_t arm_fully_connected_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims)
{
(void)filter_dims;
return 0;
}
-int32_t arm_fully_connected_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims)
+int32_t arm_fully_connected_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims)
{
- return arm_fully_connected_s8_get_buffer_size(filter_dims);
+ return filter_dims->c * sizeof(int32_t);
}
-int32_t arm_fully_connected_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims)
+int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims)
{
- return arm_fully_connected_s8_get_buffer_size(filter_dims);
+#if defined(ARM_MATH_MVEI)
+ return arm_fully_connected_s8_get_buffer_size_mve(filter_dims);
+#else
+ return arm_fully_connected_s8_get_buffer_size_dsp(filter_dims);
+#endif
}
/**
diff --git a/Source/FullyConnectedFunctions/arm_fully_connected_s8.c b/Source/FullyConnectedFunctions/arm_fully_connected_s8.c
index 55550c01..77dbe1ff 100644
--- a/Source/FullyConnectedFunctions/arm_fully_connected_s8.c
+++ b/Source/FullyConnectedFunctions/arm_fully_connected_s8.c
@@ -60,15 +60,24 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
int8_t *output)
{
(void)bias_dims;
- (void)ctx;
(void)fc_params->filter_offset;
int32_t batch_cnt = input_dims->n;
+#if defined(ARM_MATH_MVEI)
+ if (ctx->buf == NULL)
+ {
+ return (ARM_CMSIS_NN_ARG_ERROR);
+ }
+#endif
+
+ const int32_t *kernel_sum = ctx->buf;
+
while (batch_cnt)
{
arm_nn_vec_mat_mult_t_s8(input,
kernel,
+ kernel_sum,
bias,
output,
fc_params->input_offset,
@@ -80,6 +89,7 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
fc_params->activation.min,
fc_params->activation.max,
1L);
+
input += filter_dims->n;
output += output_dims->c;
batch_cnt--;
diff --git a/Source/FullyConnectedFunctions/arm_vector_sum_s8.c b/Source/FullyConnectedFunctions/arm_vector_sum_s8.c
new file mode 100644
index 00000000..0120ba1d
--- /dev/null
+++ b/Source/FullyConnectedFunctions/arm_vector_sum_s8.c
@@ -0,0 +1,144 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project: CMSIS NN Library
+ * Title: arm_vector_sum_s8
+ * Description: Generic function for calculating vector sums
+ *
+ * $Date: 5 September 2023
+ * $Revision: V.1.0.0
+ *
+ * Target : Arm(R) M-Profile Architecture
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ * @ingroup Public
+ */
+
+/**
+ * @addtogroup FC
+ * @{
+ */
+
+/*
+ * S8 vector sum function in preparation for e.g. kernel sums in fully connected and matrix multiplication layer functions
+ *
+ * Refer header file for details.
+ *
+ */
+arm_cmsis_nn_status arm_vector_sum_s8(int32_t *vector_sum_buf,
+ const int32_t vector_cols,
+ const int32_t vector_rows,
+ const int8_t *vector_data)
+{
+#if defined(ARM_MATH_MVEI)
+ const int32_t row_loop_cnt = vector_rows / 4;
+
+ for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++)
+ {
+ const int32_t col_loop_cnt = (vector_cols + 15) / 16;
+
+ const int8_t *vector_0 = vector_data;
+ const int8_t *vector_1 = vector_data + vector_cols;
+ const int8_t *vector_2 = vector_data + 2 * vector_cols;
+ const int8_t *vector_3 = vector_data + 3 * vector_cols;
+
+ int32_t vector_sum_0 = 0;
+ int32_t vector_sum_1 = 0;
+ int32_t vector_sum_2 = 0;
+ int32_t vector_sum_3 = 0;
+
+ uint32_t col_cnt = (uint32_t)vector_cols;
+
+ for (int i = 0; i < col_loop_cnt; i++)
+ {
+ mve_pred16_t p = vctp8q(col_cnt);
+ col_cnt -= 16;
+
+ const int8x16_t ker_0 = vldrbq_z_s8(vector_0, p);
+ vector_sum_0 = vaddvaq_s8(vector_sum_0, ker_0);
+
+ const int8x16_t ker_1 = vldrbq_z_s8(vector_1, p);
+ vector_sum_1 = vaddvaq_s8(vector_sum_1, ker_1);
+
+ const int8x16_t ker_2 = vldrbq_z_s8(vector_2, p);
+ vector_sum_2 = vaddvaq_s8(vector_sum_2, ker_2);
+
+ const int8x16_t ker_3 = vldrbq_z_s8(vector_3, p);
+ vector_sum_3 = vaddvaq_s8(vector_sum_3, ker_3);
+
+ vector_0 += 16;
+ vector_1 += 16;
+ vector_2 += 16;
+ vector_3 += 16;
+ }
+ vector_data += 4 * vector_cols;
+
+ vector_sum_buf[0] = vector_sum_0;
+ vector_sum_buf[1] = vector_sum_1;
+ vector_sum_buf[2] = vector_sum_2;
+ vector_sum_buf[3] = vector_sum_3;
+ vector_sum_buf += 4;
+ }
+
+ const int32_t loop_cnt = vector_rows % 4;
+
+ for (int i_row_loop_cnt = 0; i_row_loop_cnt < loop_cnt; i_row_loop_cnt++)
+ {
+ const int32_t col_loop_cnt = (vector_cols + 15) / 16;
+
+ const int8_t *vector_0 = vector_data;
+
+ int32_t vector_sum_0 = 0;
+
+ uint32_t col_cnt = (uint32_t)vector_cols;
+
+ for (int i = 0; i < col_loop_cnt; i++)
+ {
+ mve_pred16_t p = vctp8q(col_cnt);
+ col_cnt -= 16;
+
+ const int8x16_t ker_0 = vldrbq_z_s8(vector_0, p);
+ vector_sum_0 = vaddvaq_s8(vector_sum_0, ker_0);
+
+ vector_0 += 16;
+ }
+ vector_data += vector_cols;
+
+ vector_sum_buf[i_row_loop_cnt] = vector_sum_0;
+ }
+
+ return (ARM_CMSIS_NN_SUCCESS);
+#else
+ (void)vector_sum_buf;
+ (void)vector_rows;
+ (void)vector_cols;
+ (void)vector_data;
+
+ return (ARM_CMSIS_NN_NO_IMPL_ERROR);
+#endif
+}
+
+/**
+ * @} end of FC group
+ */
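For readability, the MVE intrinsics above compute the same result as this plain-C reference (a sketch, not part of the patch): one int32 sum per row of a rows-by-cols int8 matrix.

```c
#include <stdint.h>

/* Scalar reference of arm_vector_sum_s8(): sums each row of an int8 matrix into int32. */
static void vector_sum_s8_ref(int32_t *vector_sum_buf,
                              const int32_t vector_cols,
                              const int32_t vector_rows,
                              const int8_t *vector_data)
{
    for (int32_t row = 0; row < vector_rows; row++)
    {
        int32_t sum = 0;
        for (int32_t col = 0; col < vector_cols; col++)
        {
            sum += vector_data[row * vector_cols + col];
        }
        vector_sum_buf[row] = sum;
    }
}
```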
diff --git a/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c b/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
index 8568676c..1287d00a 100644
--- a/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
+++ b/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
@@ -57,6 +57,7 @@
#endif
arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs,
const int8_t *rhs,
+ const int32_t *kernel_sum,
const int32_t *bias,
int8_t *dst,
const int32_t lhs_offset,
@@ -70,7 +71,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs,
const int32_t address_offset)
{
#if defined(ARM_MATH_MVEI)
- const int32_t row_loop_cnt = rhs_rows / 3;
+ const int32_t row_loop_cnt = rhs_rows / 4;
const uint32x4_t address_offset_array = {0, address_offset, address_offset * 2, address_offset * 3};
for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++)
@@ -78,6 +79,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs,
int32_t acc_0 = 0;
int32_t acc_1 = 0;
int32_t acc_2 = 0;
+ int32_t acc_3 = 0;
const int32_t col_loop_cnt = (rhs_cols + 15) / 16;
@@ -85,15 +87,14 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs,
const int8_t *rhs_0 = rhs;
const int8_t *rhs_1 = rhs + rhs_cols;
const int8_t *rhs_2 = rhs + 2 * rhs_cols;
+ const int8_t *rhs_3 = rhs + 3 * rhs_cols;
- int32_t rhs_sum_0 = 0;
- int32_t rhs_sum_1 = 0;
- int32_t rhs_sum_2 = 0;
if (bias)
{
acc_0 = *bias++;
acc_1 = *bias++;
acc_2 = *bias++;
+ acc_3 = *bias++;
}
uint32_t col_cnt = (uint32_t)rhs_cols;
@@ -106,53 +107,48 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs,
const int8x16_t input = vldrbq_z_s8(lhs_vec, p);
const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p);
- rhs_sum_0 = vaddvaq_s8(rhs_sum_0, ker_0);
acc_0 = vmladavaq_s8(acc_0, ker_0, input);
const int8x16_t ker_1 = vldrbq_z_s8(rhs_1, p);
- rhs_sum_1 = vaddvaq_s8(rhs_sum_1, ker_1);
acc_1 = vmladavaq_s8(acc_1, ker_1, input);
const int8x16_t ker_2 = vldrbq_z_s8(rhs_2, p);
- rhs_sum_2 = vaddvaq_s8(rhs_sum_2, ker_2);
acc_2 = vmladavaq_s8(acc_2, ker_2, input);
+ const int8x16_t ker_3 = vldrbq_z_s8(rhs_3, p);
+ acc_3 = vmladavaq_s8(acc_3, ker_3, input);
+
lhs_vec += 16;
rhs_0 += 16;
rhs_1 += 16;
rhs_2 += 16;
+ rhs_3 += 16;
}
- rhs += 3 * rhs_cols;
+ rhs += 4 * rhs_cols;
- int32x4_t acc = {acc_0, acc_1, acc_2, 0};
- const int32x4_t rhs_sum = {rhs_sum_0, rhs_sum_1, rhs_sum_2, 0};
+ int32x4_t acc = {acc_0, acc_1, acc_2, acc_3};
+
+ const int32x4_t rhs_sum = {kernel_sum[0], kernel_sum[1], kernel_sum[2], kernel_sum[3]};
acc += vdupq_n_s32(lhs_offset) * rhs_sum;
+ kernel_sum += 4;
acc = arm_requantize_mve(acc, dst_multiplier, dst_shift);
acc = vaddq_s32(acc, vdupq_n_s32(dst_offset));
acc = vmaxq_s32(acc, vdupq_n_s32(activation_min));
acc = vminq_s32(acc, vdupq_n_s32(activation_max));
- const mve_pred16_t p = vctp32q(3);
- if (address_offset > 1L)
- {
- vstrbq_scatter_offset_p_s32(dst, address_offset_array, acc, p);
- }
- else
- {
- vstrbq_p_s32(dst, acc, p);
- }
- dst += 3 * address_offset;
+ vstrbq_scatter_offset_s32(dst, address_offset_array, acc);
+
+ dst += 4 * address_offset;
}
- const int loop_cnt = rhs_rows % 3;
+ const int loop_cnt = rhs_rows % 4;
for (int i_row_loop_cnt = 0; i_row_loop_cnt < loop_cnt; i_row_loop_cnt++)
{
int32_t acc_0 = 0;
const int32_t col_loop_cnt = (rhs_cols + 15) / 16;
const int8_t *lhs_vec = lhs;
const int8_t *rhs_0 = rhs;
- int32_t rhs_sum_0 = 0;
uint32_t col_cnt = (uint32_t)rhs_cols;
for (int i = 0; i < col_loop_cnt; i++)
@@ -162,7 +158,6 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs,
const int8x16_t input = vldrbq_z_s8(lhs_vec, p);
const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p);
- rhs_sum_0 = vaddvaq_s8(rhs_sum_0, ker_0);
acc_0 = vmladavaq_s8(acc_0, ker_0, input);
lhs_vec += 16;
@@ -175,7 +170,8 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs,
acc_0 += *bias;
bias++;
}
- const int32_t offsets = rhs_sum_0 * lhs_offset;
+ const int32_t rhs_sum = kernel_sum[i_row_loop_cnt];
+ const int32_t offsets = rhs_sum * lhs_offset;
acc_0 += offsets;
acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift);
acc_0 += dst_offset;
@@ -187,6 +183,8 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs,
}
#elif defined(ARM_MATH_DSP)
+ (void)kernel_sum;
+
const int32_t row_loop_cnt = rhs_rows / 2;
const int16_t lhs_offset_s16 = (int16_t)lhs_offset;
const uint32_t lhs_offset_s16x2 = PKHBT(lhs_offset_s16, lhs_offset_s16, 16);
@@ -302,6 +300,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs,
}
#else
+ (void)kernel_sum;
const int32_t row_loop_cnt = rhs_rows / 3;
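The rhs_sum_* accumulators can be dropped from the MVE inner loop because the lhs_offset contribution factors out: for each output row, sum_c (lhs[c] + lhs_offset) * rhs[r][c] equals sum_c lhs[c] * rhs[r][c] + lhs_offset * sum_c rhs[r][c], and the per-row weight sums are now precomputed into kernel_sum by arm_vector_sum_s8(). A small self-contained check of that identity (values are arbitrary, for illustration only):

```c
#include <assert.h>
#include <stdint.h>

/* Checks the offset identity the MVE path relies on:
 * sum_c (lhs[c] + lhs_offset) * rhs[c] == sum_c lhs[c] * rhs[c] + lhs_offset * sum_c rhs[c] */
int main(void)
{
    const int8_t lhs[4] = {3, -7, 12, 100};
    const int8_t rhs[4] = {-5, 9, 27, -128};
    const int32_t lhs_offset = 19;

    int32_t lhs_dot_rhs = 0;
    int32_t rhs_sum = 0;
    int32_t direct = 0;
    for (int c = 0; c < 4; c++)
    {
        lhs_dot_rhs += lhs[c] * rhs[c];
        rhs_sum += rhs[c];
        direct += (lhs[c] + lhs_offset) * rhs[c];
    }
    assert(direct == lhs_dot_rhs + lhs_offset * rhs_sum);
    return 0;
}
```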
diff --git a/Source/SVDFunctions/arm_svdf_get_buffer_sizes_s8.c b/Source/SVDFunctions/arm_svdf_get_buffer_sizes_s8.c
new file mode 100644
index 00000000..44b43757
--- /dev/null
+++ b/Source/SVDFunctions/arm_svdf_get_buffer_sizes_s8.c
@@ -0,0 +1,64 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project: CMSIS NN Library
+ * Title: arm_svdf_get_buffer_sizes_s8.c
+ * Description: Collection of get buffer size functions for svdf s8 layer function.
+ *
+ * $Date: 5 September 2023
+ * $Revision: V.1.0.0
+ *
+ * Target : Arm(R) M-Profile Architecture
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+
+/**
+ * @ingroup SVDF
+ */
+
+/**
+ * @addtogroup GetBufferSizeSVDF
+ * @{
+ */
+
+int32_t arm_svdf_s8_get_buffer_size_dsp(const cmsis_nn_dims *weights_feature_dims)
+{
+ (void)weights_feature_dims;
+ return 0;
+}
+
+int32_t arm_svdf_s8_get_buffer_size_mve(const cmsis_nn_dims *weights_feature_dims)
+{
+ return weights_feature_dims->n * sizeof(int32_t);
+}
+
+int32_t arm_svdf_s8_get_buffer_size(const cmsis_nn_dims *weights_feature_dims)
+{
+#if defined(ARM_MATH_MVEI)
+ return arm_svdf_s8_get_buffer_size_mve(weights_feature_dims);
+#else
+ return arm_svdf_s8_get_buffer_size_dsp(weights_feature_dims);
+#endif
+}
+
+/**
+ * @} end of GetBufferSizeSVDF group
+ */
diff --git a/Source/SVDFunctions/arm_svdf_s8.c b/Source/SVDFunctions/arm_svdf_s8.c
index faab5a49..a97ce3ae 100644
--- a/Source/SVDFunctions/arm_svdf_s8.c
+++ b/Source/SVDFunctions/arm_svdf_s8.c
@@ -21,8 +21,8 @@
* Title: arm_svdf_s8.c
* Description: S8 basic SVDF layer function
*
- * $Date: 5 January 2023
- * $Revision: V.5.1.0
+ * $Date: 5 September 2023
+ * $Revision: V.6.0.0
*
* Target : Arm(R) M-Profile Architecture
*
@@ -47,7 +47,8 @@
*
*/
-arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx,
+arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *ctx,
+ const cmsis_nn_context *input_ctx,
const cmsis_nn_context *output_ctx,
const cmsis_nn_svdf_params *svdf_params,
const cmsis_nn_per_tensor_quant_params *input_quant_params,
@@ -69,6 +70,13 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx,
(void)state_dims;
(void)output_dims;
+#if defined(ARM_MATH_MVEI)
+ if (ctx->buf == NULL)
+ {
+ return (ARM_CMSIS_NN_ARG_ERROR);
+ }
+#endif
+
const int32_t multiplier_in = input_quant_params->multiplier;
const int32_t shift_in = input_quant_params->shift;
const int32_t multiplier_out = output_quant_params->multiplier;
@@ -99,6 +107,8 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx,
}
int32_t *buffer_b = (int32_t *)output_ctx->buf;
+ int32_t *kernel_sum_data = (int32_t *)ctx->buf;
+
// Left shift state
memmove((int8_t *)state_data,
(int8_t *)state_data + 1,
@@ -108,11 +118,11 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx,
for (int i_batch = 0; i_batch < input_batches; i_batch++)
{
int8_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1);
- const int8_t *weight = weights_feature_data;
const int8_t *input = input_data + i_batch * input_height;
arm_cmsis_nn_status res = arm_nn_vec_mat_mult_t_s8(input,
- weight,
+ weights_feature_data,
+ kernel_sum_data,
NULL,
res_ptr,
-zp_in,
diff --git a/Tests/UnitTest/TestCases/TestData/ds_cnn_s/layer_12_fully_connected_kernel_sums_data.h b/Tests/UnitTest/TestCases/TestData/ds_cnn_s/layer_12_fully_connected_kernel_sums_data.h
new file mode 100644
index 00000000..917f9f58
--- /dev/null
+++ b/Tests/UnitTest/TestCases/TestData/ds_cnn_s/layer_12_fully_connected_kernel_sums_data.h
@@ -0,0 +1,24 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <stdint.h>
+
+// This is the output of arm_vector_sum_s8() from the ds_cnn_s model.
+int32_t ds_cnn_s_layer_12_fully_connected_kernel_sums[12] =
+ {-2931, 80, -521, -390, -576, -255, -464, -470, -486, -502, -490, -234};
diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/biases_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/biases_data.h
new file mode 100644
index 00000000..ce036577
--- /dev/null
+++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/biases_data.h
@@ -0,0 +1,6 @@
+// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0).
+// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None.
+#pragma once
+#include <stdint.h>
+
+const int32_t svdf_int8_2_biases[13] = {-108, -78, -29, -5, 25, -113, 122, -68, -32, -57, -59, -14, 13};
diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/config_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/config_data.h
new file mode 100644
index 00000000..b9491339
--- /dev/null
+++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/config_data.h
@@ -0,0 +1,19 @@
+// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0).
+// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None.
+#pragma once
+#define SVDF_INT8_2_MULTIPLIER_IN 1717987072
+#define SVDF_INT8_2_MULTIPLIER_OUT 1099511552
+#define SVDF_INT8_2_SHIFT_1 -3
+#define SVDF_INT8_2_SHIFT_2 -11
+#define SVDF_INT8_2_IN_ACTIVATION_MIN -128
+#define SVDF_INT8_2_IN_ACTIVATION_MAX 127
+#define SVDF_INT8_2_RANK 2
+#define SVDF_INT8_2_FEATURE_BATCHES 26
+#define SVDF_INT8_2_TIME_BATCHES 3
+#define SVDF_INT8_2_INPUT_SIZE 40
+#define SVDF_INT8_2_DST_SIZE 26
+#define SVDF_INT8_2_OUT_ACTIVATION_MIN -128
+#define SVDF_INT8_2_OUT_ACTIVATION_MAX 127
+#define SVDF_INT8_2_INPUT_BATCHES 2
+#define SVDF_INT8_2_INPUT_OFFSET -12
+#define SVDF_INT8_2_OUTPUT_OFFSET 0
diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/input_sequence_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/input_sequence_data.h
new file mode 100644
index 00000000..92b96632
--- /dev/null
+++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/input_sequence_data.h
@@ -0,0 +1,18 @@
+// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0).
+// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None.
+#pragma once
+#include <stdint.h>
+
+const int8_t svdf_int8_2_input_sequence[240] = {
+ -5, -114, -7, -41, 22, -57, 100, -105, -75, -113, 55, -61, -87, -46, -6, 28, -105, 87, 95, -72,
+ -85, -49, -71, 67, -29, -99, 22, -47, -78, -92, -114, -85, -10, -9, 26, 62, -6, -106, -96, -117,
+ -90, 16, -90, -15, 63, -3, 63, 24, 119, -25, -19, 38, 119, 12, -25, -32, 125, 3, -127, 40,
+ -85, 27, 80, -73, -9, -14, -95, -64, 15, -99, 88, 39, 32, -7, -18, 38, -93, -81, -65, 108,
+ -33, -31, -3, -104, 103, 108, 116, -33, 122, -128, 90, -28, 116, 93, -107, 122, 118, -10, -124, -43,
+ -1, 12, -48, 117, -54, -42, 57, 106, -48, -12, 90, 121, -52, 17, -73, -127, 34, -72, 72, -97,
+ 2, 75, -82, 45, -3, -42, 82, -71, -109, -75, -101, 70, -128, -128, -79, -96, 61, 59, -4, -95,
+ -92, 77, -7, 67, 99, 110, 49, -102, -17, -103, -13, -6, -118, -23, -33, -68, -25, -50, 68, -68,
+ 57, 120, 74, 50, 97, 36, -114, -120, -73, -70, -23, -10, 81, 22, 74, 43, -112, 35, 122, -65,
+ -1, 95, 69, -27, 84, 99, -67, -71, -119, 89, 43, -107, -74, 105, -48, -25, 86, 109, -92, -29,
+ 80, 44, -61, 15, 119, 109, -28, -67, 84, -76, 65, 95, 60, -110, -97, 32, -52, -127, -46, 78,
+ 42, 116, -113, -26, 96, 27, 93, -70, -41, -11, 34, 53, -39, 126, -65, -28, -83, 4, 14, -98};
diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/output_ref_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/output_ref_data.h
new file mode 100644
index 00000000..87d60d88
--- /dev/null
+++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/output_ref_data.h
@@ -0,0 +1,7 @@
+// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0).
+// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None.
+#pragma once
+#include <stdint.h>
+
+const int8_t svdf_int8_2_output_ref[26] = {5, -9, 0, 7, 2, -3, 5, 0, 2, -10, 1, -5, -1,
+ -5, -9, 0, 1, 3, -2, 9, -3, 5, 4, 4, 7, 9};
diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/state_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/state_data.h
new file mode 100644
index 00000000..8653560e
--- /dev/null
+++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/state_data.h
@@ -0,0 +1,11 @@
+// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0).
+// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None.
+#pragma once
+#include <stdint.h>
+
+const int8_t svdf_int8_2_state[156] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/test_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/test_data.h
new file mode 100644
index 00000000..0d467e1d
--- /dev/null
+++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/test_data.h
@@ -0,0 +1,9 @@
+// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0).
+// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None.
+#include "biases_data.h"
+#include "config_data.h"
+#include "input_sequence_data.h"
+#include "output_ref_data.h"
+#include "state_data.h"
+#include "weights_feature_data.h"
+#include "weights_time_data.h"
diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/weights_feature_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/weights_feature_data.h
new file mode 100644
index 00000000..b91c1b1f
--- /dev/null
+++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/weights_feature_data.h
@@ -0,0 +1,61 @@
+// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0).
+// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None.
+#pragma once
+#include <stdint.h>
+
+const int8_t svdf_int8_2_weights_feature[1040] = {
+ -94, -100, -91, -31, 60, -96, -93, 84, 16, -64, -79, -109, 48, 65, 118, 117, -110, -19, 29,
+ 83, 21, 27, -85, -40, -16, 31, 52, 79, 91, 14, -62, 100, 30, 6, -83, -44, -75, -44,
+ -45, -108, 7, 52, 15, -101, -30, -106, -62, 20, -51, 102, 74, -75, 54, -112, -1, -99, 122,
+ -113, -33, -29, 84, 102, -125, -31, -66, -34, -61, -80, 3, 66, -84, -112, -116, 19, 88, -113,
+ -37, -3, -82, 78, 22, 105, -97, 117, -79, 99, 78, -61, 68, 106, 99, 12, 20, 68, 16,
+ -26, -2, 104, 108, 115, -59, -78, -32, 78, -51, -13, 51, -81, -7, 74, 55, -13, -71, 48,
+ -14, -37, 43, -27, -115, -75, -22, 92, 99, 11, -31, -23, 24, 47, 41, -29, -71, -78, -92,
+ 0, 13, 36, -32, -47, 15, 37, -37, -76, 68, -105, 28, 91, 107, 123, -40, 123, 96, -43,
+ 13, -113, 55, -124, -31, -25, -118, 92, 66, -13, 43, 65, -108, 93, 47, 71, -57, -85, 3,
+ -95, 100, 18, 60, -92, 43, 79, -94, 106, -26, -66, -72, -74, -8, 47, -57, -49, 13, -69,
+ 28, 62, 90, 18, -52, 6, -70, 22, 109, -114, 18, -32, -108, 16, -31, 101, -103, 106, 122,
+ 115, -94, 35, -31, -105, -26, -107, 58, -19, 99, 124, 27, -16, -10, 48, 18, 65, -30, 56,
+ 119, -40, -107, -27, -83, 107, 117, 56, -30, -19, 68, 117, 29, -95, -95, 76, 65, 19, 105,
+ -51, 87, -62, 91, -84, -113, 106, -116, 57, 67, 71, -23, -79, -61, -56, -100, -86, -74, 114,
+ -50, -110, 107, -70, -47, -65, 56, 90, -86, 8, 1, -39, 41, 22, 24, -87, -104, 43, -17,
+ -102, -114, 87, -117, -106, -101, 4, 78, -64, 115, 88, 70, -66, -42, -46, 80, -45, -78, -43,
+ 113, -31, 58, -89, 49, -83, -111, -126, 126, 100, -87, -13, 122, -94, -90, -91, -13, 5, 105,
+ -75, 25, -12, -3, 101, -72, -55, 54, 112, 49, 0, 92, -114, -75, -16, 20, -24, 29, -34,
+ 114, 101, -69, -92, 68, -25, 95, 117, 87, 71, 106, 42, -108, -118, 21, 112, -10, 43, -71,
+ -65, 112, 61, 40, 2, 76, 21, 32, 7, 26, -101, 70, 83, 29, 44, -89, 44, 58, 104,
+ 54, 111, 17, 42, 6, 40, 63, -61, 41, 103, 30, -52, 0, -118, -3, 0, -74, 62, 3,
+ 91, 90, 46, 88, 62, -97, 73, 55, -116, -104, 57, 114, -104, 71, -80, 61, -120, -18, 48,
+ -22, 24, 14, 110, 86, 60, -60, -80, -110, 101, -22, -38, -3, -30, 66, 28, -64, 101, -79,
+ 31, 39, -82, -58, 111, 33, 73, -21, 123, -104, -35, -111, -70, 24, 118, 48, -60, -11, -70,
+ 18, 27, 106, 56, 82, 44, -43, -13, 103, 84, -114, 56, -38, -4, 15, -90, -17, -23, -58,
+ -21, -112, 13, -118, -5, 32, -81, -78, 55, -26, 96, -66, -63, -94, -120, 66, 99, 15, -126,
+ 97, 49, 38, -22, 97, -36, -111, -106, 86, -75, 87, 89, -107, -56, -36, 126, 29, 40, 35,
+ 12, -122, -35, -42, 21, -99, -27, 5, 45, 56, -38, 37, -88, -96, -116, 109, 90, 101, 43,
+ 38, -123, 102, -109, -41, -22, -113, 90, 43, -12, -126, 27, 82, -76, -29, -80, 10, 21, 53,
+ 87, -3, 30, 99, -55, 32, -72, -5, -116, 2, -70, 122, 51, 50, 45, -14, -85, 89, 13,
+ 62, 108, 42, 22, -126, 117, -28, -127, -3, -26, 100, -63, -65, 17, 41, 6, 100, -93, 9,
+ 115, 12, 20, 31, 14, -9, 81, -81, -87, 37, -96, -72, 15, 52, 82, -121, -1, -26, -120,
+ -48, 121, -21, -107, -97, 44, 56, -48, -52, -105, 101, 53, 56, -11, 77, 124, 102, 67, -19,
+ 120, -67, -85, -2, 76, 24, 9, -124, -54, -62, -73, 110, -96, -64, 112, 10, 44, 78, -59,
+ -82, 111, 35, -39, 19, -25, -16, -28, -39, -108, 93, -31, 98, -88, 33, 82, 62, 112, -121,
+ -109, 104, 66, 93, -46, 11, 52, 31, -58, -70, -93, 99, -83, -56, 118, -109, 5, 25, 111,
+ -30, -100, -69, -22, -26, 97, -78, 76, 21, 24, 83, -77, 80, -42, 111, -10, -113, 104, 112,
+ -77, 92, -82, 106, 125, 17, -81, -1, -86, -102, 1, 63, -46, -105, 84, 55, 22, -53, -18,
+ 110, 15, 35, 5, -40, 81, 71, 100, 93, -3, 81, -53, 66, -57, 56, -40, -49, -42, 20,
+ -100, 43, 93, -13, 89, -121, -101, 113, 20, 74, -86, -77, 93, -109, 6, 15, 91, -75, -66,
+ 31, 28, 62, 41, -95, -119, -4, -104, 34, 2, -26, -118, 22, -62, -102, 19, -73, 29, -126,
+ -79, -96, 52, 97, 83, -11, -23, 61, 60, -77, 7, 71, -36, 106, -93, -1, -126, -64, 35,
+ -124, 37, 90, -59, 6, 2, -2, 2, 7, 11, -83, -117, -67, 95, -25, 43, 105, -20, -9,
+ -38, 84, 108, -55, -115, -46, -68, 112, -93, 109, -61, -32, -62, 41, -37, 28, -13, 8, -51,
+ 114, 43, 39, 39, -37, 17, 53, 126, 41, 79, 99, -80, -96, -15, 64, 85, -124, 94, -6,
+ 19, -54, -83, -66, 78, 54, 16, -108, 69, -65, -74, -6, 120, -121, -38, -85, -12, 5, 20,
+ -91, 33, -63, 90, -84, -21, 111, 110, 98, -49, -125, -74, 48, -52, -98, 102, 64, -87, -39,
+ 16, 75, 82, 40, -39, -96, 8, -92, 101, 82, 38, -46, -83, -76, 99, -79, 28, 100, 76,
+ 123, 81, -61, 39, 94, -87, 121, 112, 59, -13, 48, 21, -17, -17, -81, 91, -52, 57, -117,
+ -100, 99, 92, 95, 8, -83, -68, -62, -21, 100, -4, -23, -42, 36, -94, 58, 5, 35, -70,
+ 45, 126, 112, 75, 110, -73, 84, 70, -108, -83, -36, -11, -110, -79, -34, -18, 11, -84, 53,
+ 84, 10, 49, -17, -62, -72, 103, 122, -66, 102, -7, 71, 5, -78, -49, 91, 39, -98, -74,
+ 71, 74, -12, 17, 1, -98, -28, 11, 12, -91, -69, 15, 3, -104, 109, 12, 48, -64, -56,
+ -5, -33, -74, -33, -8, -63, 88, -126, -61, 70, -98, -12, 15, 19, 29, -107, 107, -8, -24,
+ 55, 26, 119, -43, 87, 4, 48, -11, -126, -40, 107, 122, -92, -27};
diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/weights_time_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/weights_time_data.h
new file mode 100644
index 00000000..e3eff344
--- /dev/null
+++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/weights_time_data.h
@@ -0,0 +1,10 @@
+// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0).
+// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None.
+#pragma once
+#include <stdint.h>
+
+const int8_t svdf_int8_2_weights_time[78] = {
+ 98, 57, 10, 25, 18, -46, 8, -55, -128, 57, 49, -13, 3, -5, 101, -128, -13, 45, 67, 13,
+ -49, 111, 73, 48, -113, -17, -83, -49, 71, 4, -74, 86, 54, -29, 6, -20, 94, 124, 113, -69,
+ 34, 21, 110, 71, 68, 8, 80, 38, -94, 82, -20, 4, 10, -124, 86, 110, -33, -3, 12, -99,
+ -116, -107, 74, 45, -44, 51, 114, 78, -116, 82, -48, -100, -107, 84, 110, -118, 32, 105};
diff --git a/Tests/UnitTest/TestCases/test_arm_ds_cnn_l_s8/test_arm_ds_cnn_l_s8.c b/Tests/UnitTest/TestCases/test_arm_ds_cnn_l_s8/test_arm_ds_cnn_l_s8.c
index 9d26e64c..1e6eafd9 100644
--- a/Tests/UnitTest/TestCases/test_arm_ds_cnn_l_s8/test_arm_ds_cnn_l_s8.c
+++ b/Tests/UnitTest/TestCases/test_arm_ds_cnn_l_s8/test_arm_ds_cnn_l_s8.c
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates
+ * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -469,6 +469,10 @@ void ds_cnn_l_s8_inference(void)
bias_dims.c = in_out_dim_1.c;
+#if defined(ARM_MATH_MVEI)
+ arm_vector_sum_s8(ctx.buf, conv_filter_dims.n, in_out_dim_1.c, ds_cnn_l_layer_14_fully_connected_weights);
+#endif
+
status |= arm_fully_connected_s8(&ctx,
&fc_params,
&per_tensor_quant_params,
diff --git a/Tests/UnitTest/TestCases/test_arm_ds_cnn_s_s8/test_arm_ds_cnn_s_s8.c b/Tests/UnitTest/TestCases/test_arm_ds_cnn_s_s8/test_arm_ds_cnn_s_s8.c
index 21fb972d..5862df7b 100644
--- a/Tests/UnitTest/TestCases/test_arm_ds_cnn_s_s8/test_arm_ds_cnn_s_s8.c
+++ b/Tests/UnitTest/TestCases/test_arm_ds_cnn_s_s8/test_arm_ds_cnn_s_s8.c
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates
+ * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -19,6 +19,7 @@
#include "arm_nnfunctions.h"
#include "unity.h"
+#include "../TestData/ds_cnn_s/layer_12_fully_connected_kernel_sums_data.h"
#include "../TestData/ds_cnn_s/test_data.h"
#include "../Utils/validate.h"
@@ -107,6 +108,7 @@ void ds_cnn_s_s8_inference(void)
/* Test for a complete int8 DS_CNN_S keyword spotting network from https://github.com/ARM-software/ML-zoo &
* Tag: 22.02 */
cmsis_nn_context ctx;
+ cmsis_nn_context ctx_kernel_sum;
const arm_cmsis_nn_status expected = ARM_CMSIS_NN_SUCCESS;
ctx.size = ds_cnn_s_s8_get_buffer_size();
@@ -408,7 +410,13 @@ void ds_cnn_s_s8_inference(void)
bias_dims.c = in_out_dim_1.c;
- status |= arm_fully_connected_s8(&ctx,
+#if defined(ARM_MATH_MVEI)
+ ctx_kernel_sum.buf = ds_cnn_s_layer_12_fully_connected_kernel_sums;
+#else
+ ctx_kernel_sum = ctx;
+#endif
+
+ status |= arm_fully_connected_s8(&ctx_kernel_sum,
&fc_params,
&per_tensor_quant_params,
&in_out_dim_0,
diff --git a/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/test_arm_fully_connected_s8.c b/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/test_arm_fully_connected_s8.c
index 75393c34..00575a14 100644
--- a/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/test_arm_fully_connected_s8.c
+++ b/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/test_arm_fully_connected_s8.c
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -64,10 +64,15 @@ void fully_connected_arm_fully_connected_s8(void)
quant_params.multiplier = FULLY_CONNECTED_OUTPUT_MULTIPLIER;
quant_params.shift = FULLY_CONNECTED_OUTPUT_SHIFT;
- int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
+ const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
ctx.buf = malloc(buf_size);
ctx.size = buf_size;
+#if defined(ARM_MATH_MVEI)
+ int32_t *buf = ctx.buf;
+ TEST_ASSERT_EQUAL(expected, arm_vector_sum_s8(buf, filter_dims.n, output_dims.c, kernel_data));
+#endif
+
arm_cmsis_nn_status result = arm_fully_connected_s8(&ctx,
&fc_params,
&quant_params,
@@ -122,9 +127,15 @@ void fully_connected_mve_0_arm_fully_connected_s8(void)
quant_params.multiplier = FULLY_CONNECTED_MVE_0_OUTPUT_MULTIPLIER;
quant_params.shift = FULLY_CONNECTED_MVE_0_OUTPUT_SHIFT;
- int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
+ const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
ctx.buf = malloc(buf_size);
ctx.size = buf_size;
+
+#if defined(ARM_MATH_MVEI)
+ int32_t *buf = ctx.buf;
+ TEST_ASSERT_EQUAL(expected, arm_vector_sum_s8(buf, filter_dims.n, output_dims.c, kernel_data));
+#endif
+
arm_cmsis_nn_status result = arm_fully_connected_s8(&ctx,
&fc_params,
&quant_params,
@@ -178,9 +189,15 @@ void fully_connected_mve_1_arm_fully_connected_s8(void)
quant_params.multiplier = FULLY_CONNECTED_MVE_1_OUTPUT_MULTIPLIER;
quant_params.shift = FULLY_CONNECTED_MVE_1_OUTPUT_SHIFT;
- int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
+ const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
ctx.buf = malloc(buf_size);
ctx.size = buf_size;
+
+#if defined(ARM_MATH_MVEI)
+ int32_t *buf = ctx.buf;
+ TEST_ASSERT_EQUAL(expected, arm_vector_sum_s8(buf, filter_dims.n, output_dims.c, kernel_data));
+#endif
+
arm_cmsis_nn_status result = arm_fully_connected_s8(&ctx,
&fc_params,
&quant_params,
@@ -245,9 +262,15 @@ void fully_connected_null_bias_0_arm_fully_connected_s8(void)
}
TEST_ASSERT_EQUAL(expected, ip_check);
- int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
+ const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
ctx.buf = malloc(buf_size);
ctx.size = buf_size;
+
+#if defined(ARM_MATH_MVEI)
+ int32_t *buf = ctx.buf;
+ TEST_ASSERT_EQUAL(expected, arm_vector_sum_s8(buf, filter_dims.n, output_dims.c, kernel_data));
+#endif
+
arm_cmsis_nn_status result = arm_fully_connected_s8(&ctx,
&fc_params,
&quant_params,
@@ -301,9 +324,15 @@ void fully_connected_out_activation_arm_fully_connected_s8(void)
quant_params.multiplier = FULLY_CONNECTED_OUT_ACTIVATION_OUTPUT_MULTIPLIER;
quant_params.shift = FULLY_CONNECTED_OUT_ACTIVATION_OUTPUT_SHIFT;
- int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
+ const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
ctx.buf = malloc(buf_size);
ctx.size = buf_size;
+
+#if defined(ARM_MATH_MVEI)
+ int32_t *buf = ctx.buf;
+ TEST_ASSERT_EQUAL(expected, arm_vector_sum_s8(buf, filter_dims.n, output_dims.c, kernel_data));
+#endif
+
arm_cmsis_nn_status result = arm_fully_connected_s8(&ctx,
&fc_params,
&quant_params,
diff --git a/Tests/UnitTest/TestCases/test_arm_svdf_s8/Unity/unity_test_arm_svdf_s8.c b/Tests/UnitTest/TestCases/test_arm_svdf_s8/Unity/unity_test_arm_svdf_s8.c
index ff493a4f..6db62bf9 100644
--- a/Tests/UnitTest/TestCases/test_arm_svdf_s8/Unity/unity_test_arm_svdf_s8.c
+++ b/Tests/UnitTest/TestCases/test_arm_svdf_s8/Unity/unity_test_arm_svdf_s8.c
@@ -45,3 +45,4 @@ void setUp(void)
void tearDown(void) {}
void test_svdf_int8_arm_s8(void) { svdf_int8_arm_svdf_s8(); }
+void test_svdf_int8_2_arm_s8(void) { svdf_int8_2_arm_svdf_s8(); }
diff --git a/Tests/UnitTest/TestCases/test_arm_svdf_s8/test_arm_svdf_s8.c b/Tests/UnitTest/TestCases/test_arm_svdf_s8/test_arm_svdf_s8.c
index b71d5a5d..43f7d26e 100644
--- a/Tests/UnitTest/TestCases/test_arm_svdf_s8/test_arm_svdf_s8.c
+++ b/Tests/UnitTest/TestCases/test_arm_svdf_s8/test_arm_svdf_s8.c
@@ -20,6 +20,7 @@
#include <stdlib.h>
#include "../TestData/svdf_int8/test_data.h"
+#include "../TestData/svdf_int8_2/test_data.h"
#include "../Utils/validate.h"
#define REPEAT_NUM (1)
@@ -68,6 +69,16 @@ void svdf_int8_arm_svdf_s8(void)
const int scratch_size = SVDF_INT8_INPUT_BATCHES * SVDF_INT8_FEATURE_BATCHES * sizeof(int32_t);
const int scratch_size_out = SVDF_INT8_INPUT_BATCHES * number_units * sizeof(int32_t);
+ cmsis_nn_context ctx;
+ const int32_t buf_size = arm_svdf_s8_get_buffer_size(&weights_feature_dims);
+ ctx.buf = malloc(buf_size);
+ ctx.size = buf_size;
+
+#if defined(ARM_MATH_MVEI)
+ int32_t *kernel_sum_buf = ctx.buf;
+ arm_vector_sum_s8(kernel_sum_buf, input_dims.h, weights_feature_dims.n, weights_feature_data);
+#endif
+
// + SVDF_INT8_TIME_BATCHES additional bytes to make sure it is not overwritten
const int state_data_size = sizeof(svdf_int8_state) + SVDF_INT8_TIME_BATCHES;
const int8_t initial_data = 66;
@@ -86,7 +97,8 @@ void svdf_int8_arm_svdf_s8(void)
for (int j = 0; j < number_inputs; j++)
{
memcpy(input_data, svdf_int8_input_sequence + j * input_round_size, input_round_size);
- arm_cmsis_nn_status result = arm_svdf_s8(&input_ctx,
+ arm_cmsis_nn_status result = arm_svdf_s8(&ctx,
+ &input_ctx,
&output_ctx,
&svdf_int8_params,
&input_quant_params,
@@ -109,6 +121,13 @@ void svdf_int8_arm_svdf_s8(void)
TEST_ASSERT_TRUE(validate(output_data, output_ref, output_ref_size));
}
+ if (ctx.buf)
+ {
+ // The caller is responsible for clearing the scratch buffers, if applicable, for security reasons.
+ memset(ctx.buf, 0, buf_size);
+ free(ctx.buf);
+ }
+
// Make sure state data is not written outside boundary
for (int i = sizeof(svdf_int8_state); i < state_data_size; i++)
{
@@ -120,3 +139,108 @@ void svdf_int8_arm_svdf_s8(void)
free(input_ctx.buf);
free(output_ctx.buf);
}
+
+void svdf_int8_2_arm_svdf_s8(void)
+{
+ const int32_t output_ref_size = SVDF_INT8_2_DST_SIZE;
+ const int8_t *output_ref = svdf_int8_2_output_ref;
+ const arm_cmsis_nn_status expected = ARM_CMSIS_NN_SUCCESS;
+ cmsis_nn_context input_ctx;
+ cmsis_nn_context output_ctx;
+ cmsis_nn_svdf_params svdf_int8_2_params;
+ cmsis_nn_dims input_dims;
+ cmsis_nn_dims weights_feature_dims;
+ cmsis_nn_dims weights_time_dims;
+ cmsis_nn_dims state_dims;
+ cmsis_nn_dims output_dims;
+ cmsis_nn_dims bias_dims;
+ cmsis_nn_per_tensor_quant_params input_quant_params;
+ cmsis_nn_per_tensor_quant_params output_quant_params;
+ int8_t output_data[SVDF_INT8_2_DST_SIZE] = {1};
+ const int8_t *weights_feature_data = svdf_int8_2_weights_feature;
+ const int8_t *weights_time_data = svdf_int8_2_weights_time;
+
+ input_dims.n = SVDF_INT8_2_INPUT_BATCHES;
+ input_dims.h = SVDF_INT8_2_INPUT_SIZE;
+ weights_feature_dims.n = SVDF_INT8_2_FEATURE_BATCHES;
+ weights_time_dims.h = SVDF_INT8_2_TIME_BATCHES;
+
+ input_quant_params.multiplier = SVDF_INT8_2_MULTIPLIER_IN;
+ input_quant_params.shift = SVDF_INT8_2_SHIFT_1;
+ output_quant_params.multiplier = SVDF_INT8_2_MULTIPLIER_OUT;
+ output_quant_params.shift = SVDF_INT8_2_SHIFT_2;
+
+ svdf_int8_2_params.input_activation.min = SVDF_INT8_2_IN_ACTIVATION_MIN;
+ svdf_int8_2_params.input_activation.max = SVDF_INT8_2_IN_ACTIVATION_MAX;
+ svdf_int8_2_params.output_activation.min = SVDF_INT8_2_OUT_ACTIVATION_MIN;
+ svdf_int8_2_params.output_activation.max = SVDF_INT8_2_OUT_ACTIVATION_MAX;
+ svdf_int8_2_params.input_offset = SVDF_INT8_2_INPUT_OFFSET;
+ svdf_int8_2_params.output_offset = SVDF_INT8_2_OUTPUT_OFFSET;
+ svdf_int8_2_params.rank = SVDF_INT8_2_RANK;
+
+ const int input_round_size = SVDF_INT8_2_INPUT_BATCHES * SVDF_INT8_2_INPUT_SIZE;
+ const int number_inputs = sizeof(svdf_int8_2_input_sequence) / input_round_size;
+ const int32_t number_units = SVDF_INT8_2_FEATURE_BATCHES / SVDF_INT8_2_RANK;
+ const int scratch_size = SVDF_INT8_2_INPUT_BATCHES * SVDF_INT8_2_FEATURE_BATCHES * sizeof(int32_t);
+ const int scratch_size_out = SVDF_INT8_2_INPUT_BATCHES * number_units * sizeof(int32_t);
+
+ cmsis_nn_context ctx;
+ const int32_t buf_size = arm_svdf_s8_get_buffer_size(&weights_feature_dims);
+ ctx.buf = malloc(buf_size);
+ ctx.size = buf_size;
+
+#if defined(ARM_MATH_MVEI)
+ int32_t *kernel_sum_buf = ctx.buf;
+ arm_vector_sum_s8(kernel_sum_buf, input_dims.h, weights_feature_dims.n, weights_feature_data);
+#endif
+
+ const int state_data_size = sizeof(svdf_int8_2_state);
+
+ input_ctx.buf = malloc(scratch_size);
+ output_ctx.buf = malloc(scratch_size_out);
+
+ int8_t *input_data = malloc(input_round_size);
+ int8_t *state_data = malloc(state_data_size);
+
+ for (int i = 0; i < REPEAT_NUM; i++)
+ {
+ memcpy(state_data, svdf_int8_2_state, sizeof(svdf_int8_2_state));
+ for (int j = 0; j < number_inputs; j++)
+ {
+ memcpy(input_data, svdf_int8_2_input_sequence + j * input_round_size, input_round_size);
+ arm_cmsis_nn_status result = arm_svdf_s8(&ctx,
+ &input_ctx,
+ &output_ctx,
+ &svdf_int8_2_params,
+ &input_quant_params,
+ &output_quant_params,
+ &input_dims,
+ input_data,
+ &state_dims,
+ state_data,
+ &weights_feature_dims,
+ weights_feature_data,
+ &weights_time_dims,
+ weights_time_data,
+ &bias_dims,
+ svdf_int8_2_biases,
+ &output_dims,
+ output_data);
+ TEST_ASSERT_EQUAL(expected, result);
+ }
+
+ TEST_ASSERT_TRUE(validate(output_data, output_ref, output_ref_size));
+ }
+
+ if (ctx.buf)
+ {
+ // The caller is responsible for clearing the scratch buffers, if applicable, for security reasons.
+ memset(ctx.buf, 0, buf_size);
+ free(ctx.buf);
+ }
+
+ free(state_data);
+ free(input_data);
+ free(input_ctx.buf);
+ free(output_ctx.buf);
+}
diff --git a/Tests/UnitTest/generate_test_data.py b/Tests/UnitTest/generate_test_data.py
index cbc154e4..fe3f0775 100755
--- a/Tests/UnitTest/generate_test_data.py
+++ b/Tests/UnitTest/generate_test_data.py
@@ -1724,6 +1724,22 @@ def load_testdata_sets(regenerate_input, regenerate_weights, regenerate_biases,
generate_bias=False,
int8_time_weights=True,
interpreter=interpreter)
+ dataset = 'svdf_int8_2'
+ testdata_sets[dataset] = SVDFSettings(dataset,
+ type_of_test,
+ regenerate_weights,
+ regenerate_input,
+ regenerate_biases,
+ schema_file,
+ batches=2,
+ number_inputs=3,
+ rank=2,
+ memory_size=3,
+ input_size=40,
+ number_units=13,
+ input_zp=-12,
+ int8_time_weights=True,
+ interpreter=interpreter)
type_of_test = 'add'
dataset = 'add'