Add 4-bit weight support to depthwise conv (#83)
Unit tests are aligned with existing fully connected int4 unit tests.
mansnils authored Nov 9, 2023
1 parent ca47625 commit edececa
Showing 109 changed files with 4,693 additions and 1,223 deletions.
4 changes: 4 additions & 0 deletions ARM.CMSIS-NN.pdsc
@@ -45,6 +45,10 @@
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_get_buffer_sizes_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_s4_opt.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_depthwise_conv_s4.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_fast_s16.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_s8.c"/>
<file category="source" name="Source/ConvolutionFunctions/arm_convolve_s16.c"/>
180 changes: 177 additions & 3 deletions Include/arm_nnfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
* $Date: 27 October 2023
* $Revision: V.12.2.0
* $Date: 7 November 2023
* $Revision: V.12.3.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
@@ -743,6 +743,49 @@ arm_cmsis_nn_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx,
const cmsis_nn_dims *output_dims,
int8_t *output_data);

/**
* @brief Wrapper function to pick the right optimized s4 depthwise convolution function
*
* @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
* definition file to see if an additional buffer is required.
* Optional function {API}_get_buffer_size() provides the buffer
* size if required.
* The caller is expected to clear the buffer, if applicable, for security reasons.
* @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
* dw_conv_params->dilation is not used.
* Range of dw_conv_params->input_offset : [-127, 128]
* Range of dw_conv_params->output_offset : [-128, 127]
* @param[in] quant_params Per-channel quantization info.
* It contains the multiplier and shift values to be applied to each
* output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
* Batch argument N is not used and assumed to be 1.
* @param[in] input_data Input (activation) data pointer. Data type: int8
* @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
* @param[in] filter_data Filter data pointer. Data type: int8_t packed 4-bit weights, e.g. four sequential
* weights [0x1, 0x2, 0x3, 0x4] packed as [0x21, 0x43].
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* @param[in] bias_data Bias data pointer. Data type: int32
* @param[in] output_dims Output tensor dimensions. Format: [1, H, W, C_OUT]
* @param[in, out] output_data Output data pointer. Data type: int8
* @return The function returns
* <code>ARM_CMSIS_NN_SUCCESS</code> - Successful completion.
*
* @details
* - Supported framework: TensorFlow Lite
*/
arm_cmsis_nn_status arm_depthwise_conv_wrapper_s4(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const int8_t *input_data,
const cmsis_nn_dims *filter_dims,
const int8_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
int8_t *output_data);
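
For reference, the packed layout described above can be produced host-side in a few lines of C. A minimal sketch, assuming the weights are already quantized to the signed 4-bit range [-8, 7]; pack_s4_weights is a hypothetical helper, not part of CMSIS-NN:

/* Hypothetical host-side helper (not part of CMSIS-NN): pack signed 4-bit
 * weights two-per-byte, low nibble first, as the s4 kernels expect. */
#include <stdint.h>

void pack_s4_weights(const int8_t *unpacked, int8_t *packed, int32_t count)
{
    for (int32_t i = 0; i < count / 2; i++)
    {
        /* Keep only the low 4 bits of each weight; nibbles stay two's complement. */
        packed[i] = (int8_t)((unpacked[2 * i] & 0x0F) | ((unpacked[2 * i + 1] & 0x0F) << 4));
    }
    if (count & 1)
    {
        /* Odd count: the final nibble sits in the low half of a last byte. */
        packed[count / 2] = (int8_t)(unpacked[count - 1] & 0x0F);
    }
}

With this scheme the example above holds: [0x1, 0x2, 0x3, 0x4] packs to [0x21, 0x43].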

/**
* @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8()
*
@@ -787,6 +830,50 @@ int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size_mve(const cmsis_nn_dw_conv
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims);

/**
* @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s4()
*
* @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
* Range of dw_conv_params->input_offset : [-127, 128]
* Range of dw_conv_params->output_offset : [-128, 127]
* @param[in] input_dims Input (activation) tensor dimensions. Format: [H, W, C_IN]
* Batch argument N is not used and assumed to be 1.
* @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
* @param[in] output_dims Output tensor dimensions. Format: [1, H, W, C_OUT]
* @return Size of additional memory required for optimizations in bytes.
*
*/
int32_t arm_depthwise_conv_wrapper_s4_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_dims *input_dims,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims);

/**
* @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s4() for processors with DSP extension.
* Refer to arm_depthwise_conv_wrapper_s4_get_buffer_size() for function argument details.
*
* @note Intended for compilation on Host. If compiling for an Arm target, use
* arm_depthwise_conv_wrapper_s4_get_buffer_size().
*
*/
int32_t arm_depthwise_conv_wrapper_s4_get_buffer_size_dsp(const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_dims *input_dims,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims);

/**
* @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s4() for Arm(R) Helium Architecture case.
* Refer to arm_depthwise_conv_wrapper_s4_get_buffer_size() for function argument details.
*
* @note Intended for compilation on Host. If compiling for an Arm target, use
* arm_depthwise_conv_wrapper_s4_get_buffer_size().
*
*/
int32_t arm_depthwise_conv_wrapper_s4_get_buffer_size_mve(const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_dims *input_dims,
const cmsis_nn_dims *filter_dims,
const cmsis_nn_dims *output_dims);
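
Taken together, a typical call sequence for the s4 wrapper looks like the hedged sketch below (needs <stdlib.h>, <string.h> and "arm_nnfunctions.h"). The parameter structs are assumed to be filled by the application, as in the example further down, and run_dw_conv_s4 is an illustrative name, not a library function:

/* Hedged usage sketch: query the scratch-buffer size, allocate and clear it,
 * then run the s4 wrapper. Error handling is kept minimal for brevity. */
static arm_cmsis_nn_status run_dw_conv_s4(const cmsis_nn_dw_conv_params *dw_conv_params,
                                          const cmsis_nn_per_channel_quant_params *quant_params,
                                          const cmsis_nn_dims *input_dims, const int8_t *input_data,
                                          const cmsis_nn_dims *filter_dims, const int8_t *packed_filter_data,
                                          const cmsis_nn_dims *bias_dims, const int32_t *bias_data,
                                          const cmsis_nn_dims *output_dims, int8_t *output_data)
{
    cmsis_nn_context ctx = {.buf = NULL, .size = 0};
    const int32_t buf_size =
        arm_depthwise_conv_wrapper_s4_get_buffer_size(dw_conv_params, input_dims, filter_dims, output_dims);
    if (buf_size > 0)
    {
        ctx.buf = malloc((size_t)buf_size);
        if (ctx.buf == NULL)
        {
            return ARM_CMSIS_NN_ARG_ERROR; /* illustrative failure path */
        }
        ctx.size = buf_size;
        memset(ctx.buf, 0, (size_t)buf_size); /* clear the buffer, as the API docs request */
    }
    const arm_cmsis_nn_status status =
        arm_depthwise_conv_wrapper_s4(&ctx, dw_conv_params, quant_params, input_dims, input_data, filter_dims,
                                      packed_filter_data, bias_dims, bias_data, output_dims, output_data);
    free(ctx.buf);
    return status;
}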

/**
* @brief Basic s8 depthwise convolution function that doesn't have any constraints on the input dimensions.
*
@@ -828,6 +915,48 @@ arm_cmsis_nn_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
const cmsis_nn_dims *output_dims,
int8_t *output_data);

/**
* @brief Basic s4 depthwise convolution function that doesn't have any constraints on the input dimensions.
*
* @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
* definition file to see if an additional buffer is required.
* Optional function {API}_get_buffer_size() provides the buffer
* size if an additional buffer is required.
* The caller is expected to clear the buffer, if applicable, for security reasons.
* @param[in] dw_conv_params Depthwise convolution parameters (e.g. strides, dilations, pads,...)
* dw_conv_params->dilation is not used.
* Range of dw_conv_params->input_offset : [-127, 128]
* Range of dw_conv_params->output_offset : [-128, 127]
* @param[in] quant_params Per-channel quantization info.
* It contains the multiplier and shift values to be applied to each
* output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* Batch argument N is not used.
* @param[in] input Input (activation) data pointer. Data type: int8
* @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
* @param[in] kernel Filter data pointer. Data type: int8_t packed 4-bit weights, e.g. four sequential
* weights [0x1, 0x2, 0x3, 0x4] packed as [0x21, 0x43].
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* @param[in] bias Bias data pointer. Data type: int32
* @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
* @param[in, out] output Output data pointer. Data type: int8
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
* @details
* - Supported framework: TensorFlow Lite
*/
arm_cmsis_nn_status arm_depthwise_conv_s4(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const int8_t *input,
const cmsis_nn_dims *filter_dims,
const int8_t *kernel,
const cmsis_nn_dims *bias_dims,
const int32_t *bias,
const cmsis_nn_dims *output_dims,
int8_t *output);
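
As a concrete illustration of the expected dims layout, here is a hedged sketch of the argument structs for a 3x3 depthwise layer over an 8x8, 16-channel input with 'same' padding and stride 1. Every value is illustrative; in practice the offsets and activation bounds come from the quantized model:

/* Illustrative argument setup for arm_depthwise_conv_s4 (values are examples). */
cmsis_nn_dims input_dims = {.n = 1, .h = 8, .w = 8, .c = 16};
cmsis_nn_dims filter_dims = {.n = 1, .h = 3, .w = 3, .c = 16};
cmsis_nn_dims bias_dims = {.n = 1, .h = 1, .w = 1, .c = 16};
cmsis_nn_dims output_dims = {.n = 1, .h = 8, .w = 8, .c = 16}; /* 'same' padding keeps H and W */

cmsis_nn_dw_conv_params dw_conv_params = {
    .input_offset = 0,   /* from the model's input zero point, range [-127, 128] */
    .output_offset = 0,  /* from the model's output zero point, range [-128, 127] */
    .ch_mult = 1,
    .stride = {.w = 1, .h = 1},
    .padding = {.w = 1, .h = 1},
    .dilation = {.w = 1, .h = 1}, /* not used by the s4 kernels */
    .activation = {.min = -128, .max = 127},
};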

/**
* @brief Basic s16 depthwise convolution function that doesn't have any constraints on the input dimensions.
*
@@ -1064,6 +1193,40 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
const cmsis_nn_dims *output_dims,
int8_t *output_data);

/**
* @brief Optimized s4 depthwise convolution function with constraint that in_channel equals out_channel.
* Refer to arm_depthwise_conv_s4() for function argument details.
*
* @return The function returns one of the following
* <code>ARM_CMSIS_NN_ARG_ERROR</code> - input channel != output channel or
* ch_mult != 1
* <code>ARM_CMSIS_NN_SUCCESS</code> - Successful operation
*
* @note If the number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read
* for the following if MVE optimizations (Arm Helium Technology) are used.
* - Output shift
* - Output multiplier
* - Output bias
* - kernel
* @details
* - Supported framework: TensorFlow Lite
* - The following constraints on the arguments apply
*    -# Number of input channels equals number of output channels or ch_mult equals 1
* - Recommended when the number of channels is 4 or greater.
*
*/
arm_cmsis_nn_status arm_depthwise_conv_s4_opt(const cmsis_nn_context *ctx,
const cmsis_nn_dw_conv_params *dw_conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const int8_t *input_data,
const cmsis_nn_dims *filter_dims,
const int8_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
int8_t *output_data);
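
In practice the channel constraint above is what arm_depthwise_conv_wrapper_s4() checks before selecting this kernel. A hedged sketch of that style of dispatch, not necessarily the library's exact logic, with ctx, params and data pointers assumed set up as in the earlier sketches:

/* Hedged dispatch sketch (not the library's exact logic): use the optimized
 * kernel only when each input channel maps to exactly one output channel;
 * otherwise fall back to the generic s4 kernel. */
arm_cmsis_nn_status status;
if ((input_dims.c == output_dims.c) && (dw_conv_params.ch_mult == 1))
{
    status = arm_depthwise_conv_s4_opt(&ctx, &dw_conv_params, &quant_params, &input_dims, input_data,
                                       &filter_dims, filter_data, &bias_dims, bias_data, &output_dims,
                                       output_data);
}
else
{
    status = arm_depthwise_conv_s4(&ctx, &dw_conv_params, &quant_params, &input_dims, input_data,
                                   &filter_dims, filter_data, &bias_dims, bias_data, &output_dims, output_data);
}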

/**
* @brief Get the required buffer size for optimized s8 depthwise convolution
* function with constraint that in_channel equals out_channel.
@@ -1075,6 +1238,17 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
*/
int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);

/**
* @brief Get the required buffer size for optimized s4 depthwise convolution
* function with constraint that in_channel equals out_channel.
* @param[in] input_dims Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
* Batch argument N is not used.
* @param[in] filter_dims Filter tensor dimensions. Format: [1, H, W, C_OUT]
* @return The function returns required buffer size in bytes
*
*/
int32_t arm_depthwise_conv_s4_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);

/**
* @defgroup FC Fully-connected Layer Functions
*
@@ -1110,7 +1284,7 @@ int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dim
* C : output depth and equals C_OUT in output_dims
* H & W : Not used
* @param[in] filter_data Filter data pointer. Data type: int8_t packed 4-bit weights, e.g. four sequential
* weights [0x1, 0x2, 0x3, 0x4] packed as [0x21, 0x43].
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* N, H, W : Not used
* @param[in] bias_data Bias data pointer. Data type: int32
45 changes: 40 additions & 5 deletions Include/arm_nnsupportfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
* $Date: 3 November 2023
* $Revision: V.17.4.0
* $Date: 7 November 2023
* $Revision: V.17.5.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
@@ -160,8 +160,23 @@ void arm_q7_to_q15_with_offset(const int8_t *src, int16_t *dst, int32_t block_si
*
*/
void arm_s8_to_s16_unordered_with_offset(const int8_t *src, int16_t *dst, int32_t block_size, int16_t offset);

#endif

/**
* @brief Get the required buffer size for optimized s8 depthwise convolution
* function with constraint that in_channel equals out_channel.
* This is for processors with DSP extension.
* Refer to arm_depthwise_conv_s8_opt_get_buffer_size() for function argument details.
*
* @note Intended for compilation on Host. If compiling for an Arm target, use
* arm_depthwise_conv_s8_opt_get_buffer_size(). Note also that this is a support function,
* so it is not recommended to call it directly even on Host.
*
*/
int32_t arm_depthwise_conv_s8_opt_get_buffer_size_dsp(const cmsis_nn_dims *input_dims,
const cmsis_nn_dims *filter_dims);

/**
* @brief Depthwise conv on an im2col buffer where the input channel equals output channel.
* @param[in] row pointer to row
@@ -777,14 +792,34 @@ __STATIC_FORCEINLINE void arm_memset_s8(int8_t *dst, const int8_t val, uint32_t
/**
* @brief read and expand one s4 word into two s8 words.
*/
__STATIC_FORCEINLINE void read_and_pad_s4(const int8_t *source, int32_t *out1, int32_t *out2)
{
    int16_t in = arm_nn_read_s8x2(source);
    int32_t inA = (in & 0x00FF) | ((in & 0xFF00) << 8);

    *out1 = SXTB16_RORn(__sxtb16(inA << 4), 4);
    *out2 = SXTB16_RORn(__sxtb16(inA), 4);
}
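
For readers without the DSP intrinsics at hand, the following hedged portable-C reference spells out what the SXTB16-based sequence computes, assuming the even/odd lane split (out1 receives elements 0 and 2, out2 elements 1 and 3, each as a sign-extended 16-bit lane); read_and_pad_s4_ref is an illustrative name, not a library function:

/* Hedged portable reference for read_and_pad_s4 (no DSP intrinsics):
 * out1 <- elements 0 and 2, out2 <- elements 1 and 3, each nibble
 * sign-extended into a 16-bit lane of the 32-bit result. */
static inline void read_and_pad_s4_ref(const int8_t *src, int32_t *out1, int32_t *out2)
{
    const int16_t n0 = (int16_t)((int8_t)(src[0] << 4) >> 4); /* low nibble of byte 0  */
    const int16_t n1 = (int16_t)(src[0] >> 4);                /* high nibble of byte 0 */
    const int16_t n2 = (int16_t)((int8_t)(src[1] << 4) >> 4); /* low nibble of byte 1  */
    const int16_t n3 = (int16_t)(src[1] >> 4);                /* high nibble of byte 1 */

    *out1 = (int32_t)((uint16_t)n0 | ((uint32_t)(uint16_t)n2 << 16));
    *out2 = (int32_t)((uint16_t)n1 | ((uint32_t)(uint16_t)n3 << 16));
}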

/**
* @brief read and expand one s4 word into two s8 words.
* @details The s4 elements are not evenly aligned on the byte boundary, so 3 bytes need to be read instead of 2.
* In other words, the first nibble to read starts in the middle of a byte.
* byte index, s4 element
* 0, s4_x
* 0, s4_0
* 1, s4_1
* 1, s4_2
* 2, s4_3
* 2, s4_x
*/
__STATIC_FORCEINLINE void read_and_pad_s4_uneven(const int8_t *source, int32_t *out1, int32_t *out2)
{
    int32_t inA1 = (source[0] & 0xFF) | ((source[1] & 0xFF) << 16);
    int32_t inA2 = (source[1] & 0xFF) | ((source[2] & 0xFF) << 16);

    *out1 = SXTB16_RORn(__sxtb16(inA2 << 4), 4);
    *out2 = SXTB16_RORn(__sxtb16(inA1), 4);
}
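
And the equivalent hedged portable reference for the unaligned case, assuming the layout in the table above (element 0 is the high nibble of byte 0, so out1 receives elements 1 and 3 and out2 elements 0 and 2); read_and_pad_s4_uneven_ref is an illustrative name:

/* Hedged portable reference for read_and_pad_s4_uneven: the stream starts at
 * the high nibble of byte 0, so three bytes carry the four elements.
 * out1 <- elements 1 and 3 (low nibbles of bytes 1 and 2),
 * out2 <- elements 0 and 2 (high nibbles of bytes 0 and 1). */
static inline void read_and_pad_s4_uneven_ref(const int8_t *src, int32_t *out1, int32_t *out2)
{
    const int16_t e0 = (int16_t)(src[0] >> 4);                /* high nibble of byte 0 */
    const int16_t e1 = (int16_t)((int8_t)(src[1] << 4) >> 4); /* low nibble of byte 1  */
    const int16_t e2 = (int16_t)(src[1] >> 4);                /* high nibble of byte 1 */
    const int16_t e3 = (int16_t)((int8_t)(src[2] << 4) >> 4); /* low nibble of byte 2  */

    *out1 = (int32_t)((uint16_t)e1 | ((uint32_t)(uint16_t)e3 << 16));
    *out2 = (int32_t)((uint16_t)e0 | ((uint32_t)(uint16_t)e2 << 16));
}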

/**