Skip to content

Commit

Permalink
Minimum and Maximum s8 operator support (#148)
Browse files Browse the repository at this point in the history
 * Adds Minimum and Maximum functions for s8
 * Adds Refactored Unit Tests for Minimum and Maximum s8
 * Fix small issue with build_and_run_tests.sh

Change-Id: I38333a14888b59293dcafa633105ec65c2d582a1

---------

Signed-off-by: Ryan O'Shea <[email protected]>
  • Loading branch information
ArmRyan authored Oct 16, 2024
1 parent 411ff0d commit 5f8f1a9
Show file tree
Hide file tree
Showing 86 changed files with 2,802 additions and 6 deletions.
2 changes: 2 additions & 0 deletions ARM.CMSIS-NN.pdsc
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@
<file category="source" name="Source/BasicMathFunctions/arm_elementwise_add_s8.c"/>
<file category="source" name="Source/BasicMathFunctions/arm_elementwise_add_s16.c"/>
<file category="source" name="Source/BasicMathFunctions/arm_elementwise_mul_s16_s8.c"/>
<file category="source" name="Source/BasicMathFunctions/arm_minimum_s8.c"/>
<file category="source" name="Source/BasicMathFunctions/arm_maximum_s8.c"/>
<file category="source" name="Source/ActivationFunctions/arm_relu6_s8.c"/>
<file category="source" name="Source/ActivationFunctions/arm_relu_q15.c"/>
<file category="source" name="Source/ActivationFunctions/arm_relu_q7.c"/>
Expand Down
56 changes: 54 additions & 2 deletions Include/arm_nnfunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
* $Date: 5 Sep 2024
* $Revision: V.17.0.0
* $Date: 08 October 2024
* $Revision: V.17.1.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -2780,6 +2780,58 @@ arm_cmsis_nn_status arm_batch_matmul_s16(const cmsis_nn_context *ctx,
const cmsis_nn_dims *output_dims,
int16_t *output);

/**
 * @brief Elementwise binary minimum with 8-bit data.
*
* @param[in] ctx Temporary scratch buffer
* The caller is expected to clear the buffer, if applicable, for security reasons.
* @param[in] input_1_data Pointer to input1 tensor
* @param[in] input_1_dims Input1 tensor dimensions
* @param[in] input_2_data Pointer to input2 tensor
* @param[in] input_2_dims Input2 tensor dimensions
* @param[out] output_data Pointer to the output tensor
* @param[in] output_dims Output tensor dimensions
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
* @details
* 1. Supported framework: TensorFlow Lite Micro
*
*/
arm_cmsis_nn_status arm_minimum_s8(const cmsis_nn_context *ctx,
const int8_t *input_1_data,
const cmsis_nn_dims *input_1_dims,
const int8_t *input_2_data,
const cmsis_nn_dims *input_2_dims,
int8_t *output_data,
const cmsis_nn_dims *output_dims);

/**
 * @brief Elementwise binary maximum with 8-bit data.
*
* @param[in] ctx Temporary scratch buffer
* The caller is expected to clear the buffer, if applicable, for security reasons.
* @param[in] input_1_data Pointer to input1 tensor
* @param[in] input_1_dims Input1 tensor dimensions
* @param[in] input_2_data Pointer to input2 tensor
* @param[in] input_2_dims Input2 tensor dimensions
* @param[out] output_data Pointer to the output tensor
* @param[in] output_dims Output tensor dimensions
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
* @details
* 1. Supported framework: TensorFlow Lite Micro
*
*/
arm_cmsis_nn_status arm_maximum_s8(const cmsis_nn_context *ctx,
const int8_t *input_1_data,
const cmsis_nn_dims *input_1_dims,
const int8_t *input_2_data,
const cmsis_nn_dims *input_2_dims,
int8_t *output_data,
const cmsis_nn_dims *output_dims);

#ifdef __cplusplus
}
#endif
Expand Down
24 changes: 22 additions & 2 deletions Include/arm_nnsupportfunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
* $Date: 12 Jul 2024
* $Revision: V.22.3.0
* $Date: 08 October 2024
* $Revision: V.22.4.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -2073,6 +2073,26 @@ arm_cmsis_nn_status arm_elementwise_mul_acc_s16(const int16_t *input_1_vect,
const int32_t out_activation_max,
const int32_t block_size);

/**
* @brief Check if a broadcast is required between 2 cmsis_nn_dims.
* @param[in] shape_1 pointer to input tensor 1
* @param[in] shape_2 pointer to input tensor 2
* @return The function returns 1 if a broadcast is required, or 0 if not.
*
* @details Compares each dimension and returns 1 if any dimension does not match.
* This function does not check that broadcast rules are met.
*/
__STATIC_FORCEINLINE int32_t arm_check_broadcast_required(const cmsis_nn_dims *shape_1, const cmsis_nn_dims *shape_2)
{
    /* A broadcast is needed as soon as any one of the four dimensions differs.
       This only detects a mismatch; it does not validate that broadcast rules are met. */
    const int32_t dims_match = (shape_1->n == shape_2->n) && (shape_1->h == shape_2->h) &&
        (shape_1->w == shape_2->w) && (shape_1->c == shape_2->c);

    return dims_match ? 0 : 1;
}

#ifdef __cplusplus
}
#endif
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ Examples are Cortex-M55 or Cortex-M85 configured with MVE.
| Fully Connected | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
| Batch Matmul | Yes | Yes | No | Yes | Yes | No | Yes | Yes | No |
| Add | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes | N/A |
| Minimum | Yes | No | N/A | No | No | N/A | Yes | No | N/A |
| Maximum | Yes | No | N/A | No | No | N/A | Yes | No | N/A |
| Mul | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes | N/A |
| MaxPooling | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes | N/A |
| AvgPooling | Yes | Yes | N/A | Yes | Yes | N/A | Yes | Yes | N/A |
Expand Down
263 changes: 263 additions & 0 deletions Source/BasicMathFunctions/arm_maximum_s8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
/*
* SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <[email protected]>
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_maximum_s8
* Description: Minimum and Maximum
*
* $Date: 08 October 2024
* $Revision: V.1.0.0
*
* Target : Arm(R) M-Profile Architecture
*
* -------------------------------------------------------------------- */

#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"

/**
* @ingroup Public
*/

/**
* @addtogroup minimumMaximum
* @{
*/

/* Elementwise maximum of two equally-sized s8 buffers.
 * input_1/input_2: source buffers, flat_size elements each.
 * output: destination buffer, flat_size elements.
 * Returns ARM_CMSIS_NN_SUCCESS unconditionally.
 */
static arm_cmsis_nn_status
arm_max_no_broadcast_s8(const int8_t *input_1, const int8_t *input_2, int8_t *output, int32_t flat_size)
{
#if defined(ARM_MATH_MVEI)
    /* Process 16 lanes per iteration; the tail predicate masks off lanes
       beyond the remaining element count. */
    for (; flat_size > 0; flat_size -= 16)
    {
        const mve_pred16_t pred = vctp8q(flat_size);

        const int8x16_t v1 = vldrbq_z_s8(input_1, pred);
        const int8x16_t v2 = vldrbq_z_s8(input_2, pred);
        input_1 += 16;
        input_2 += 16;

        vstrbq_p_s8(output, vmaxq_s8(v1, v2), pred);
        output += 16;
    }
#else
    /* Plain scalar fallback. */
    for (int32_t i = 0; i < flat_size; i++)
    {
        const int8_t a = input_1[i];
        const int8_t b = input_2[i];
        output[i] = (a < b) ? b : a;
    }
#endif

    return ARM_CMSIS_NN_SUCCESS;
}

/* Elementwise maximum of a broadcast scalar against an s8 buffer.
 * input_1: points at the single scalar value (must be passed first).
 * input_2: source buffer, flat_size elements.
 * output: destination buffer, flat_size elements.
 * Returns ARM_CMSIS_NN_SUCCESS unconditionally.
 */
static arm_cmsis_nn_status
arm_max_scalar_s8(const int8_t *input_1, const int8_t *input_2, int8_t *output, int32_t flat_size)
{
#if defined(ARM_MATH_MVEI)
    /* Splat the scalar once; the loop then only streams input_2. */
    const int8x16_t broadcast_vec = vdupq_n_s8(*input_1);

    for (; flat_size > 0; flat_size -= 16)
    {
        const mve_pred16_t pred = vctp8q(flat_size);
        const int8x16_t vec = vldrbq_z_s8(input_2, pred);
        input_2 += 16;

        vstrbq_p_s8(output, vmaxq_s8(broadcast_vec, vec), pred);
        output += 16;
    }
#else
    const int8_t scalar = *input_1;
    for (int32_t i = 0; i < flat_size; i++)
    {
        const int8_t val = input_2[i];
        output[i] = (scalar < val) ? val : scalar;
    }
#endif
    return ARM_CMSIS_NN_SUCCESS;
}

/*
* s8 maximum
*
* Refer header file for details.
*
*/
arm_cmsis_nn_status arm_maximum_s8(const cmsis_nn_context *ctx,
                                   const int8_t *input_1_data,
                                   const cmsis_nn_dims *input_1_dims,
                                   const int8_t *input_2_data,
                                   const cmsis_nn_dims *input_2_dims,
                                   int8_t *output_data,
                                   const cmsis_nn_dims *output_dims)
{
    /* Elementwise maximum of two s8 tensors with broadcasting support.
       Dispatch strategy: identical shapes -> one flat pass; one operand is a
       single element -> scalar pass; otherwise walk batch/height/width and at
       each level use the widest sub-span (whole batch, row, or channel run)
       that both operands can supply without further broadcasting. */
    (void)ctx; /* Scratch buffer is unused by this operator. */
    const int32_t output_batch = output_dims->n;
    const int32_t output_height = output_dims->h;
    const int32_t output_width = output_dims->w;

    const int32_t input_1_batch = input_1_dims->n;
    const int32_t input_1_height = input_1_dims->h;
    const int32_t input_1_width = input_1_dims->w;
    const int32_t input_1_channels = input_1_dims->c;

    const int32_t input_2_batch = input_2_dims->n;
    const int32_t input_2_height = input_2_dims->h;
    const int32_t input_2_width = input_2_dims->w;
    const int32_t input_2_channels = input_2_dims->c;

    /* Total element counts; re-purposed below for per-batch and per-row counts. */
    int32_t flat_size_1 = input_1_batch * input_1_height * input_1_width * input_1_channels;
    int32_t flat_size_2 = input_2_batch * input_2_height * input_2_width * input_2_channels;

    if (arm_check_broadcast_required(input_1_dims, input_2_dims))
    {
        if (flat_size_1 == 1)
        {
            // arm_max_scalar expects the tensor with the scalar value to be provided first
            arm_max_scalar_s8(input_1_data, input_2_data, output_data, flat_size_2);
        }
        else if (flat_size_2 == 1)
        {
            // arm_max_scalar expects the tensor with the scalar value to be provided first
            arm_max_scalar_s8(input_2_data, input_1_data, output_data, flat_size_1);
        }
        else
        {
            /* Per-step pointer corrections for whichever operand is being
               broadcast along a dimension.
               width_x_diff: rewind applied after each width step so a
               width-broadcast operand re-reads the same channel run. */
            int32_t width_1_diff = input_1_width >= input_2_width ? 0 : input_1_channels;
            int32_t width_2_diff = input_2_width >= input_1_width ? 0 : input_2_channels;

            /* height_x_diff: net adjustment applied after each output row.
               NOTE(review): the non-broadcast case re-applies width_x_diff to
               compensate the final width rewind, and the broadcast case
               rewinds to the start of the operand's single row — confirm
               against the unit tests before modifying. */
            int32_t height_1_diff =
                input_1_height >= input_2_height ? width_1_diff : -input_1_width * (input_1_channels - width_1_diff);
            int32_t height_2_diff =
                input_2_height >= input_1_height ? width_2_diff : -input_2_width * (input_2_channels - width_2_diff);

            /* batch_x_diff: per-batch advance; 0 keeps re-reading a
               batch-broadcast operand for every output batch. */
            int32_t batch_1_diff =
                input_1_batch >= input_2_batch ? input_1_channels * input_1_width * input_1_height : 0;
            int32_t batch_2_diff =
                input_2_batch >= input_1_batch ? input_2_channels * input_2_width * input_2_height : 0;

            for (int32_t i_out_batch = 0; i_out_batch < output_batch; i_out_batch++)
            {
                const int8_t *input_1_ptr = input_1_data;
                const int8_t *input_2_ptr = input_2_data;
                /* Per-batch element counts (batch dimension excluded). */
                flat_size_1 = input_1_height * input_1_width * input_1_channels;
                flat_size_2 = input_2_height * input_2_width * input_2_channels;
                if (input_1_height == input_2_height && input_1_width == input_2_width &&
                    input_1_channels == input_2_channels)
                {
                    /* Only the batch dimension is broadcast: handle the whole
                       batch in a single flat pass. */
                    arm_max_no_broadcast_s8(input_1_ptr, input_2_ptr, output_data, flat_size_1);
                    output_data += flat_size_1;
                }
                else if (flat_size_1 == 1)
                {
                    // arm_max_scalar expects the tensor with the scalar value to be provided first
                    arm_max_scalar_s8(input_1_ptr, input_2_ptr, output_data, flat_size_2);
                    output_data += flat_size_2;
                }
                else if (flat_size_2 == 1)
                {
                    // arm_max_scalar expects the tensor with the scalar value to be provided first
                    arm_max_scalar_s8(input_2_ptr, input_1_ptr, output_data, flat_size_1);
                    output_data += flat_size_1;
                }
                else
                {
                    /* Row-level dispatch: counts now cover one row (height excluded). */
                    flat_size_1 = input_1_width * input_1_channels;
                    flat_size_2 = input_2_width * input_2_channels;
                    for (int32_t i_out_height = 0; i_out_height < output_height; i_out_height++)
                    {
                        if (input_1_width == input_2_width && input_1_channels == input_2_channels)
                        {
                            /* Rows match exactly: one pass per row. */
                            arm_max_no_broadcast_s8(input_1_ptr, input_2_ptr, output_data, flat_size_1);
                            output_data += flat_size_1;
                            input_1_ptr += flat_size_1;
                            input_2_ptr += flat_size_1;
                        }
                        else if (flat_size_1 == 1)
                        {
                            // arm_max_scalar expects the tensor with the scalar value to be provided first
                            arm_max_scalar_s8(input_1_ptr, input_2_ptr, output_data, flat_size_2);
                            output_data += flat_size_2;
                            ++input_1_ptr;
                            input_2_ptr += flat_size_2;
                        }
                        else if (flat_size_2 == 1)
                        {
                            // arm_max_scalar expects the tensor with the scalar value to be provided first
                            arm_max_scalar_s8(input_2_ptr, input_1_ptr, output_data, flat_size_1);
                            output_data += flat_size_1;
                            ++input_2_ptr;
                            input_1_ptr += flat_size_1;
                        }
                        else
                        {
                            /* Element-level dispatch along the width dimension. */
                            for (int32_t i_out_width = 0; i_out_width < output_width; i_out_width++)
                            {
                                if (input_1_channels == input_2_channels)
                                {
                                    arm_max_no_broadcast_s8(input_1_ptr, input_2_ptr, output_data, input_1_channels);
                                    output_data += input_1_channels;
                                    input_1_ptr += input_1_channels;
                                    input_2_ptr += input_1_channels;
                                }
                                else if (input_1_channels == 1)
                                {
                                    // arm_max_scalar expects the tensor with the scalar value to be provided first
                                    arm_max_scalar_s8(input_1_ptr, input_2_ptr, output_data, input_2_channels);
                                    output_data += input_2_channels;
                                    input_1_ptr++;
                                    input_2_ptr += input_2_channels;
                                }
                                else if (input_2_channels == 1)
                                {
                                    // arm_max_scalar expects the tensor with the scalar value to be provided first
                                    arm_max_scalar_s8(input_2_ptr, input_1_ptr, output_data, input_1_channels);
                                    output_data += input_1_channels;
                                    input_1_ptr += input_1_channels;
                                    input_2_ptr++;
                                }
                                /* NOTE(review): no else branch — channel counts
                                   that violate broadcast rules are silently
                                   skipped with no error reported. */
                                /* Rewind a width-broadcast operand so the same
                                   channel data is reused for the next column. */
                                input_1_ptr -= width_1_diff;
                                input_2_ptr -= width_2_diff;
                            }
                        }
                        input_1_ptr += height_1_diff;
                        input_2_ptr += height_2_diff;
                    }
                }
                input_1_data += batch_1_diff;
                input_2_data += batch_2_diff;
            }
        }
    }
    else
    {
        /* Shapes match exactly: single flat pass over all elements. */
        arm_max_no_broadcast_s8(input_1_data, input_2_data, output_data, flat_size_1);
    }

    return (ARM_CMSIS_NN_SUCCESS);
}

/**
* @} end of Doxygen group
*/
Loading

0 comments on commit 5f8f1a9

Please sign in to comment.