Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MVE: Move kernel sums from core loop for FC and SVDF #69

Merged
merged 2 commits into from
Sep 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ARM.CMSIS-NN.pdsc
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_s16.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s16.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_vector_sum_s8.c"/>
<file category="source" name="Source/LSTMFunctions/arm_lstm_unidirectional_s8_s16.c"/>
<file category="source" name="Source/SoftmaxFunctions/arm_softmax_s8.c"/>
<file category="source" name="Source/SoftmaxFunctions/arm_nn_softmax_common_s8.c"/>
Expand Down
69 changes: 64 additions & 5 deletions Include/arm_nnfunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
* $Date: 13 January 2023
* $Revision: V.11.3.0
* $Date: 5 September 2023
* $Revision: V.12.0.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -1032,7 +1032,10 @@ int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dim
* C_OUT : Output depth
* H & W : Not used.
* @param[in, out] output_data Output data pointer. Data type: int8
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
* @return The function returns either
* <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
* <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
*
* @details
* - Supported framework: TensorFlow Lite
Expand All @@ -1049,8 +1052,24 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
const cmsis_nn_dims *output_dims,
int8_t *output_data);

/**
* @brief Calculate vector sums that may be required by arm_fully_connected_s8().
* @param[in, out] vector_sum_buf Buffer for vector sums
* @param[in] vector_cols Number of vector columns
* @param[in] vector_rows Number of vector rows
* @param[in] vector_data Vector or weights data
* @return The function returns
* <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
* <code>ARM_CMSIS_NN_NO_IMPL_ERROR</code> - If not for Arm(R) Helium Architecture case.
*/
arm_cmsis_nn_status arm_vector_sum_s8(int32_t *vector_sum_buf,
const int32_t vector_cols,
const int32_t vector_rows,
const int8_t *vector_data);

/**
* @brief Get size of additional buffer required by arm_fully_connected_s8().
* See also arm_vector_sum_s8, which is required if buffer size is > 0.
* @param[in] filter_dims dimension of filter
* @return The function returns required buffer size in bytes
*
Expand Down Expand Up @@ -1851,6 +1870,15 @@ void arm_concatenation_s8_w(const int8_t *input,
/**
* @brief s8 SVDF function with 8 bit state tensor and 8 bit time weights
*
* @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
* definition file to see if an additional buffer is required.
* Optional function arm_svdf_s8_get_buffer_size() provides the buffer
* size if an additional buffer is required.
* The caller is expected to clear the buffer, if applicable, for security reasons.
* @param[in] input_ctx Temporary scratch buffer
* The caller is expected to clear the buffer ,if applicable, for security reasons.
* @param[in] output_ctx Temporary output scratch buffer
Expand All @@ -1873,12 +1901,15 @@ void arm_concatenation_s8_w(const int8_t *input,
* @param[in] output_dims Output tensor dimensions
* @param[out] output_data Pointer to the output tensor
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
* @return The function returns either
* <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail. or,
* <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
*
* @details
* 1. Supported framework: TensorFlow Lite micro
*/
arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx,
arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *ctx,
const cmsis_nn_context *input_ctx,
const cmsis_nn_context *output_ctx,
const cmsis_nn_svdf_params *svdf_params,
const cmsis_nn_per_tensor_quant_params *input_quant_params,
Expand Down Expand Up @@ -2012,6 +2043,34 @@ arm_cmsis_nn_status arm_lstm_unidirectional_s16_s8(cmsis_nn_lstm_context *scratc
int16_t *cell_state,
int8_t *output_data);

/**
* @brief Get size of additional buffer required by arm_svdf_s8().
* @param[in] filter_dims dimension of filter
* @return The function returns required buffer size in bytes
*
*/
int32_t arm_svdf_s8_get_buffer_size(const cmsis_nn_dims *filter_dims);

/**
* @brief Get size of additional buffer required by arm_svdf_s8() for processors with DSP extension.
* Refer to arm_svdf_s8_get_buffer_size() for function argument details.
*
* @note Intended for compilation on Host. If compiling for an Arm target, use
* arm_svdf_s8_get_buffer_size().
*
*/
int32_t arm_svdf_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims);

/**
* @brief Get size of additional buffer required by arm_svdf_s8() for Arm(R) Helium Architecture case.
* Refer to arm_svdf_s8_get_buffer_size() for function argument details.
*
* @note Intended for compilation on Host. If compiling for an Arm target, use
* arm_svdf_s8_get_buffer_size().
*
*/
int32_t arm_svdf_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims);

#ifdef __cplusplus
}
#endif
Expand Down
1 change: 1 addition & 0 deletions Include/arm_nnsupportfunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,7 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const int8_t *lhs,
*/
arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs,
const int8_t *rhs,
const int32_t *kernel_sum,
const int32_t *bias,
int8_t *dst,
const int32_t lhs_offset,
Expand Down
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ processors here are Cortex-M4 or a Cortex-M33 configured with optional DSP exten
Processors with Arm Helium Technology use the Arm M-profile Vector Extension(MVE) instructions for optimization.
Examples are Cortex-M55 or Cortex-M85 configured with MVE.


| Operator | C <br> int8 | C<br>int16 | DSP<br>int8 | DSP<br>int16 | MVE<br>int8 | MVE<br>int16 |
| --------------- | ----------- | ---------- | ----------- | ------------ | ----------- | ------------ |
| Conv2D | Yes | Yes | Yes | Yes | Yes | Yes |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_fully_connected_get_buffer_sizes_s8.c
* Description: Collection of get buffer size functions for fully connected s8 layer function.
*
* $Date: 31 January 2023
* $Revision: V.1.0.0
* $Date: 15 August 2023
* $Revision: V.1.1.0
*
* Target : Arm(R) M-Profile Architecture
*
Expand All @@ -39,20 +39,24 @@
* @{
*/

int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims)
int32_t arm_fully_connected_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims)
{
(void)filter_dims;
return 0;
}

int32_t arm_fully_connected_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims)
int32_t arm_fully_connected_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims)
{
return arm_fully_connected_s8_get_buffer_size(filter_dims);
return filter_dims->c * sizeof(int32_t);
}

int32_t arm_fully_connected_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims)
int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims)
{
return arm_fully_connected_s8_get_buffer_size(filter_dims);
#if defined(ARM_MATH_MVEI)
return arm_fully_connected_s8_get_buffer_size_mve(filter_dims);
#else
return arm_fully_connected_s8_get_buffer_size_dsp(filter_dims);
#endif
}

/**
Expand Down
12 changes: 11 additions & 1 deletion Source/FullyConnectedFunctions/arm_fully_connected_s8.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,24 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
int8_t *output)
{
(void)bias_dims;
(void)ctx;
(void)fc_params->filter_offset;

int32_t batch_cnt = input_dims->n;

#if defined(ARM_MATH_MVEI)
if (ctx->buf == NULL)
{
return (ARM_CMSIS_NN_ARG_ERROR);
}
#endif

const int32_t *kernel_sum = ctx->buf;

while (batch_cnt)
{
arm_nn_vec_mat_mult_t_s8(input,
kernel,
kernel_sum,
bias,
output,
fc_params->input_offset,
Expand All @@ -80,6 +89,7 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
fc_params->activation.min,
fc_params->activation.max,
1L);

input += filter_dims->n;
output += output_dims->c;
batch_cnt--;
Expand Down
144 changes: 144 additions & 0 deletions Source/FullyConnectedFunctions/arm_vector_sum_s8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
/*
* SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates <[email protected]>
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_vector_sum_s8
* Description: Generic function for calculating vector sums
*
* $Date: 5 September 2023
* $Revision: V.1.0.0
*
* Target : Arm(R) M-Profile Architecture
*
* -------------------------------------------------------------------- */

#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"

/**
* @ingroup Public
*/

/**
* @addtogroup FC
* @{
*/

/*
* S8 vector sum function in preparation for e.g. kernel sums in fully connected and matrix multiplication layer function
*
* Refer to the header file for details.
*
*/
arm_cmsis_nn_status arm_vector_sum_s8(int32_t *vector_sum_buf,
                                      const int32_t vector_cols,
                                      const int32_t vector_rows,
                                      const int8_t *vector_data)
{
    /*
     * Writes one int32 sum per row of vector_data into vector_sum_buf, where each
     * row holds vector_cols int8 elements and rows are stored back to back.
     *
     * Returns ARM_CMSIS_NN_SUCCESS when built with ARM_MATH_MVEI. Without it no
     * work is done and ARM_CMSIS_NN_NO_IMPL_ERROR is returned.
     */
#if defined(ARM_MATH_MVEI)
    /* Number of 16-byte chunks needed to walk one row; the final chunk may be partial. */
    const int32_t chunks_per_row = (vector_cols + 15) / 16;
    const int32_t quad_groups = vector_rows / 4;
    const int32_t leftover_rows = vector_rows % 4;

    /* Main pass: four consecutive rows are accumulated side by side. */
    for (int32_t group = 0; group < quad_groups; group++)
    {
        const int8_t *row_0 = vector_data;
        const int8_t *row_1 = row_0 + vector_cols;
        const int8_t *row_2 = row_1 + vector_cols;
        const int8_t *row_3 = row_2 + vector_cols;

        int32_t acc_0 = 0;
        int32_t acc_1 = 0;
        int32_t acc_2 = 0;
        int32_t acc_3 = 0;

        uint32_t remaining = (uint32_t)vector_cols;

        for (int32_t chunk = 0; chunk < chunks_per_row; chunk++)
        {
            /* Predicate enables only the lanes that still belong to the row. */
            const mve_pred16_t active = vctp8q(remaining);
            remaining -= 16;

            acc_0 = vaddvaq_s8(acc_0, vldrbq_z_s8(row_0, active));
            acc_1 = vaddvaq_s8(acc_1, vldrbq_z_s8(row_1, active));
            acc_2 = vaddvaq_s8(acc_2, vldrbq_z_s8(row_2, active));
            acc_3 = vaddvaq_s8(acc_3, vldrbq_z_s8(row_3, active));

            row_0 += 16;
            row_1 += 16;
            row_2 += 16;
            row_3 += 16;
        }

        vector_sum_buf[0] = acc_0;
        vector_sum_buf[1] = acc_1;
        vector_sum_buf[2] = acc_2;
        vector_sum_buf[3] = acc_3;

        vector_sum_buf += 4;
        vector_data += 4 * vector_cols;
    }

    /* Tail pass: the remaining (vector_rows % 4) rows are handled one at a time. */
    for (int32_t row = 0; row < leftover_rows; row++)
    {
        const int8_t *cursor = vector_data;
        int32_t acc = 0;
        uint32_t remaining = (uint32_t)vector_cols;

        for (int32_t chunk = 0; chunk < chunks_per_row; chunk++)
        {
            const mve_pred16_t active = vctp8q(remaining);
            remaining -= 16;

            acc = vaddvaq_s8(acc, vldrbq_z_s8(cursor, active));
            cursor += 16;
        }

        vector_data += vector_cols;

        /* vector_sum_buf already points past the rows written by the main pass. */
        vector_sum_buf[row] = acc;
    }

    return (ARM_CMSIS_NN_SUCCESS);
#else
    /* No implementation is provided outside of MVE builds; arguments are unused. */
    (void)vector_sum_buf;
    (void)vector_rows;
    (void)vector_cols;
    (void)vector_data;

    return (ARM_CMSIS_NN_NO_IMPL_ERROR);
#endif
}

/**
* @} end of FC group
*/
Loading