Skip to content

Commit

Permalink
Minimum and Maximum s8 operator support (#148)
Browse files Browse the repository at this point in the history
 * Adds Minimum and Maximum functions for s8
 * Adds Refactored Unit Tests for Minimum and Maximum s8
 * Fix small issue with build_and_run_tests.sh

Change-Id: I38333a14888b59293dcafa633105ec65c2d582a1

---------

Signed-off-by: Ryan O'Shea <[email protected]>
  • Loading branch information
ArmRyan authored Oct 16, 2024
1 parent 411ff0d commit 5f8f1a9
Show file tree
Hide file tree
Showing 86 changed files with 2,802 additions and 6 deletions.
2 changes: 2 additions & 0 deletions ARM.CMSIS-NN.pdsc
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@
<file category="source" name="Source/BasicMathFunctions/arm_elementwise_add_s8.c"/>
<file category="source" name="Source/BasicMathFunctions/arm_elementwise_add_s16.c"/>
<file category="source" name="Source/BasicMathFunctions/arm_elementwise_mul_s16_s8.c"/>
<file category="source" name="Source/BasicMathFunctions/arm_minimum_s8.c"/>
<file category="source" name="Source/BasicMathFunctions/arm_maximum_s8.c"/>
<file category="source" name="Source/ActivationFunctions/arm_relu6_s8.c"/>
<file category="source" name="Source/ActivationFunctions/arm_relu_q15.c"/>
<file category="source" name="Source/ActivationFunctions/arm_relu_q7.c"/>
Expand Down
56 changes: 54 additions & 2 deletions Include/arm_nnfunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
* $Date: 5 Sep 2024
* $Revision: V.17.0.0
* $Date: 08 October 2024
* $Revision: V.17.1.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -2780,6 +2780,58 @@ arm_cmsis_nn_status arm_batch_matmul_s16(const cmsis_nn_context *ctx,
const cmsis_nn_dims *output_dims,
int16_t *output);

/**
 * @brief Elementwise binary minimum with 8-bit data.
*
* @param[in] ctx Temporary scratch buffer
* The caller is expected to clear the buffer, if applicable, for security reasons.
* @param[in] input_1_data Pointer to input1 tensor
* @param[in] input_1_dims Input1 tensor dimensions
* @param[in] input_2_data Pointer to input2 tensor
* @param[in] input_2_dims Input2 tensor dimensions
* @param[out] output_data Pointer to the output tensor
* @param[in] output_dims Output tensor dimensions
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
* @details
* 1. Supported framework: TensorFlow Lite Micro
*
*/
arm_cmsis_nn_status arm_minimum_s8(const cmsis_nn_context *ctx,
const int8_t *input_1_data,
const cmsis_nn_dims *input_1_dims,
const int8_t *input_2_data,
const cmsis_nn_dims *input_2_dims,
int8_t *output_data,
const cmsis_nn_dims *output_dims);

/**
 * @brief Elementwise binary maximum with 8-bit data.
*
* @param[in] ctx Temporary scratch buffer
* The caller is expected to clear the buffer, if applicable, for security reasons.
* @param[in] input_1_data Pointer to input1 tensor
* @param[in] input_1_dims Input1 tensor dimensions
* @param[in] input_2_data Pointer to input2 tensor
* @param[in] input_2_dims Input2 tensor dimensions
* @param[out] output_data Pointer to the output tensor
* @param[in] output_dims Output tensor dimensions
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
* @details
* 1. Supported framework: TensorFlow Lite Micro
*
*/
arm_cmsis_nn_status arm_maximum_s8(const cmsis_nn_context *ctx,
const int8_t *input_1_data,
const cmsis_nn_dims *input_1_dims,
const int8_t *input_2_data,
const cmsis_nn_dims *input_2_dims,
int8_t *output_data,
const cmsis_nn_dims *output_dims);

#ifdef __cplusplus
}
#endif
Expand Down
24 changes: 22 additions & 2 deletions Include/arm_nnsupportfunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
* $Date: 12 Jul 2024
* $Revision: V.22.3.0
* $Date: 08 October 2024
* $Revision: V.22.4.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -2073,6 +2073,26 @@ arm_cmsis_nn_status arm_elementwise_mul_acc_s16(const int16_t *input_1_vect,
const int32_t out_activation_max,
const int32_t block_size);

/**
* @brief Check if a broadcast is required between 2 cmsis_nn_dims.
* @param[in] shape_1 pointer to input tensor 1
* @param[in] shape_2 pointer to input tensor 2
* @return The function returns 1 if a broadcast is required, or 0 if not.
*
* @details Compares each dimension and returns 1 if any dimension does not match.
* This function does not check that broadcast rules are met.
*/
__STATIC_FORCEINLINE int32_t arm_check_broadcast_required(const cmsis_nn_dims *shape_1, const cmsis_nn_dims *shape_2)
{
    /* A broadcast is needed as soon as any one of the four dimensions differs.
       This only detects a mismatch; it does not validate that broadcast rules are met. */
    const int32_t dims_match = (shape_1->n == shape_2->n) && (shape_1->h == shape_2->h) &&
        (shape_1->w == shape_2->w) && (shape_1->c == shape_2->c);

    return dims_match ? 0 : 1;
}

#ifdef __cplusplus
}
#endif
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ Examples are Cortex-M55 or Cortex-M85 configured with MVE.
| Fully Connected | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| Batch Matmul | Yes | Yes | No | Yes | Yes | No | Yes | Yes | No |
| Add | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes | N/A |
| Minimum | Yes | No | N/A | No | No | N/A | Yes | No | N/A |
| Maximum | Yes | No | N/A | No | No | N/A | Yes | No | N/A |
| Mul | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes | N/A |
| MaxPooling | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes | N/A |
| AvgPooling | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes | N/A |
Expand Down
263 changes: 263 additions & 0 deletions Source/BasicMathFunctions/arm_maximum_s8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
/*
* SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <[email protected]>
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_maximum_s8
* Description: Minimum and Maximum
*
* $Date: 08 October 2024
* $Revision: V.1.0.0
*
* Target : Arm(R) M-Profile Architecture
*
* -------------------------------------------------------------------- */

#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"

/**
* @ingroup Public
*/

/**
* @addtogroup minimumMaximum
* @{
*/

/* Elementwise maximum of two equally-sized s8 buffers.
 * input_1/input_2: source buffers, flat_size elements each.
 * output: destination buffer, flat_size elements.
 * Returns ARM_CMSIS_NN_SUCCESS unconditionally.
 */
static arm_cmsis_nn_status
arm_max_no_broadcast_s8(const int8_t *input_1, const int8_t *input_2, int8_t *output, int32_t flat_size)
{
#if defined(ARM_MATH_MVEI)
    /* Process 16 lanes per iteration; the tail predicate masks off lanes
       beyond the remaining element count. */
    for (; flat_size > 0; flat_size -= 16)
    {
        const mve_pred16_t pred = vctp8q(flat_size);

        const int8x16_t v1 = vldrbq_z_s8(input_1, pred);
        const int8x16_t v2 = vldrbq_z_s8(input_2, pred);
        input_1 += 16;
        input_2 += 16;

        vstrbq_p_s8(output, vmaxq_s8(v1, v2), pred);
        output += 16;
    }
#else
    /* Plain scalar fallback. */
    for (int32_t i = 0; i < flat_size; i++)
    {
        const int8_t a = input_1[i];
        const int8_t b = input_2[i];
        output[i] = (a < b) ? b : a;
    }
#endif

    return ARM_CMSIS_NN_SUCCESS;
}

/* Elementwise maximum of a broadcast scalar against an s8 buffer.
 * input_1: points at the single scalar value (must be passed first).
 * input_2: source buffer, flat_size elements.
 * output: destination buffer, flat_size elements.
 * Returns ARM_CMSIS_NN_SUCCESS unconditionally.
 */
static arm_cmsis_nn_status
arm_max_scalar_s8(const int8_t *input_1, const int8_t *input_2, int8_t *output, int32_t flat_size)
{
#if defined(ARM_MATH_MVEI)
    /* Splat the scalar once; the loop then only streams input_2. */
    const int8x16_t broadcast_vec = vdupq_n_s8(*input_1);

    for (; flat_size > 0; flat_size -= 16)
    {
        const mve_pred16_t pred = vctp8q(flat_size);
        const int8x16_t vec = vldrbq_z_s8(input_2, pred);
        input_2 += 16;

        vstrbq_p_s8(output, vmaxq_s8(broadcast_vec, vec), pred);
        output += 16;
    }
#else
    const int8_t scalar = *input_1;
    for (int32_t i = 0; i < flat_size; i++)
    {
        const int8_t val = input_2[i];
        output[i] = (scalar < val) ? val : scalar;
    }
#endif
    return ARM_CMSIS_NN_SUCCESS;
}

/*
* s8 maximum
*
* Refer header file for details.
*
*/
arm_cmsis_nn_status arm_maximum_s8(const cmsis_nn_context *ctx,
                                   const int8_t *input_1_data,
                                   const cmsis_nn_dims *input_1_dims,
                                   const int8_t *input_2_data,
                                   const cmsis_nn_dims *input_2_dims,
                                   int8_t *output_data,
                                   const cmsis_nn_dims *output_dims)
{
    /* Elementwise maximum of two s8 tensors with broadcasting support.
       Dispatch strategy: identical shapes -> one flat pass; one operand is a
       single element -> scalar pass; otherwise walk batch/height/width and at
       each level use the widest sub-span (whole batch, row, or channel run)
       that both operands can supply without further broadcasting. */
    (void)ctx; /* Scratch buffer is unused by this operator. */
    const int32_t output_batch = output_dims->n;
    const int32_t output_height = output_dims->h;
    const int32_t output_width = output_dims->w;

    const int32_t input_1_batch = input_1_dims->n;
    const int32_t input_1_height = input_1_dims->h;
    const int32_t input_1_width = input_1_dims->w;
    const int32_t input_1_channels = input_1_dims->c;

    const int32_t input_2_batch = input_2_dims->n;
    const int32_t input_2_height = input_2_dims->h;
    const int32_t input_2_width = input_2_dims->w;
    const int32_t input_2_channels = input_2_dims->c;

    /* Total element counts; re-purposed below for per-batch and per-row counts. */
    int32_t flat_size_1 = input_1_batch * input_1_height * input_1_width * input_1_channels;
    int32_t flat_size_2 = input_2_batch * input_2_height * input_2_width * input_2_channels;

    if (arm_check_broadcast_required(input_1_dims, input_2_dims))
    {
        if (flat_size_1 == 1)
        {
            // arm_max_scalar expects the tensor with the scalar value to be provided first
            arm_max_scalar_s8(input_1_data, input_2_data, output_data, flat_size_2);
        }
        else if (flat_size_2 == 1)
        {
            // arm_max_scalar expects the tensor with the scalar value to be provided first
            arm_max_scalar_s8(input_2_data, input_1_data, output_data, flat_size_1);
        }
        else
        {
            /* Per-step pointer corrections for whichever operand is being
               broadcast along a dimension.
               width_x_diff: rewind applied after each width step so a
               width-broadcast operand re-reads the same channel run. */
            int32_t width_1_diff = input_1_width >= input_2_width ? 0 : input_1_channels;
            int32_t width_2_diff = input_2_width >= input_1_width ? 0 : input_2_channels;

            /* height_x_diff: net adjustment applied after each output row.
               NOTE(review): the non-broadcast case re-applies width_x_diff to
               compensate the final width rewind, and the broadcast case
               rewinds to the start of the operand's single row — confirm
               against the unit tests before modifying. */
            int32_t height_1_diff =
                input_1_height >= input_2_height ? width_1_diff : -input_1_width * (input_1_channels - width_1_diff);
            int32_t height_2_diff =
                input_2_height >= input_1_height ? width_2_diff : -input_2_width * (input_2_channels - width_2_diff);

            /* batch_x_diff: per-batch advance; 0 keeps re-reading a
               batch-broadcast operand for every output batch. */
            int32_t batch_1_diff =
                input_1_batch >= input_2_batch ? input_1_channels * input_1_width * input_1_height : 0;
            int32_t batch_2_diff =
                input_2_batch >= input_1_batch ? input_2_channels * input_2_width * input_2_height : 0;

            for (int32_t i_out_batch = 0; i_out_batch < output_batch; i_out_batch++)
            {
                const int8_t *input_1_ptr = input_1_data;
                const int8_t *input_2_ptr = input_2_data;
                /* Per-batch element counts (batch dimension excluded). */
                flat_size_1 = input_1_height * input_1_width * input_1_channels;
                flat_size_2 = input_2_height * input_2_width * input_2_channels;
                if (input_1_height == input_2_height && input_1_width == input_2_width &&
                    input_1_channels == input_2_channels)
                {
                    /* Only the batch dimension is broadcast: handle the whole
                       batch in a single flat pass. */
                    arm_max_no_broadcast_s8(input_1_ptr, input_2_ptr, output_data, flat_size_1);
                    output_data += flat_size_1;
                }
                else if (flat_size_1 == 1)
                {
                    // arm_max_scalar expects the tensor with the scalar value to be provided first
                    arm_max_scalar_s8(input_1_ptr, input_2_ptr, output_data, flat_size_2);
                    output_data += flat_size_2;
                }
                else if (flat_size_2 == 1)
                {
                    // arm_max_scalar expects the tensor with the scalar value to be provided first
                    arm_max_scalar_s8(input_2_ptr, input_1_ptr, output_data, flat_size_1);
                    output_data += flat_size_1;
                }
                else
                {
                    /* Row-level dispatch: counts now cover one row (height excluded). */
                    flat_size_1 = input_1_width * input_1_channels;
                    flat_size_2 = input_2_width * input_2_channels;
                    for (int32_t i_out_height = 0; i_out_height < output_height; i_out_height++)
                    {
                        if (input_1_width == input_2_width && input_1_channels == input_2_channels)
                        {
                            /* Rows match exactly: one pass per row. */
                            arm_max_no_broadcast_s8(input_1_ptr, input_2_ptr, output_data, flat_size_1);
                            output_data += flat_size_1;
                            input_1_ptr += flat_size_1;
                            input_2_ptr += flat_size_1;
                        }
                        else if (flat_size_1 == 1)
                        {
                            // arm_max_scalar expects the tensor with the scalar value to be provided first
                            arm_max_scalar_s8(input_1_ptr, input_2_ptr, output_data, flat_size_2);
                            output_data += flat_size_2;
                            ++input_1_ptr;
                            input_2_ptr += flat_size_2;
                        }
                        else if (flat_size_2 == 1)
                        {
                            // arm_max_scalar expects the tensor with the scalar value to be provided first
                            arm_max_scalar_s8(input_2_ptr, input_1_ptr, output_data, flat_size_1);
                            output_data += flat_size_1;
                            ++input_2_ptr;
                            input_1_ptr += flat_size_1;
                        }
                        else
                        {
                            /* Element-level dispatch along the width dimension. */
                            for (int32_t i_out_width = 0; i_out_width < output_width; i_out_width++)
                            {
                                if (input_1_channels == input_2_channels)
                                {
                                    arm_max_no_broadcast_s8(input_1_ptr, input_2_ptr, output_data, input_1_channels);
                                    output_data += input_1_channels;
                                    input_1_ptr += input_1_channels;
                                    input_2_ptr += input_1_channels;
                                }
                                else if (input_1_channels == 1)
                                {
                                    // arm_max_scalar expects the tensor with the scalar value to be provided first
                                    arm_max_scalar_s8(input_1_ptr, input_2_ptr, output_data, input_2_channels);
                                    output_data += input_2_channels;
                                    input_1_ptr++;
                                    input_2_ptr += input_2_channels;
                                }
                                else if (input_2_channels == 1)
                                {
                                    // arm_max_scalar expects the tensor with the scalar value to be provided first
                                    arm_max_scalar_s8(input_2_ptr, input_1_ptr, output_data, input_1_channels);
                                    output_data += input_1_channels;
                                    input_1_ptr += input_1_channels;
                                    input_2_ptr++;
                                }
                                /* NOTE(review): no else branch — channel counts
                                   that violate broadcast rules are silently
                                   skipped with no error reported. */
                                /* Rewind a width-broadcast operand so the same
                                   channel data is reused for the next column. */
                                input_1_ptr -= width_1_diff;
                                input_2_ptr -= width_2_diff;
                            }
                        }
                        input_1_ptr += height_1_diff;
                        input_2_ptr += height_2_diff;
                    }
                }
                input_1_data += batch_1_diff;
                input_2_data += batch_2_diff;
            }
        }
    }
    else
    {
        /* Shapes match exactly: single flat pass over all elements. */
        arm_max_no_broadcast_s8(input_1_data, input_2_data, output_data, flat_size_1);
    }

    return (ARM_CMSIS_NN_SUCCESS);
}

/**
* @} end of Doxygen group
*/
Loading

0 comments on commit 5f8f1a9

Please sign in to comment.