diff --git a/ARM.CMSIS-NN.pdsc b/ARM.CMSIS-NN.pdsc index 573ee1a0..998a176c 100644 --- a/ARM.CMSIS-NN.pdsc +++ b/ARM.CMSIS-NN.pdsc @@ -99,6 +99,7 @@ + diff --git a/Include/arm_nnfunctions.h b/Include/arm_nnfunctions.h index dbe13bf6..7541a1a8 100644 --- a/Include/arm_nnfunctions.h +++ b/Include/arm_nnfunctions.h @@ -21,8 +21,8 @@ * Title: arm_nnfunctions.h * Description: Public header file for CMSIS NN Library * - * $Date: 13 January 2023 - * $Revision: V.11.3.0 + * $Date: 5 September 2023 + * $Revision: V.12.0.0 * * Target : Arm(R) M-Profile Architecture * -------------------------------------------------------------------- */ @@ -1032,7 +1032,10 @@ int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dim * C_OUT : Output depth * H & W : Not used. * @param[in, out] output_data Output data pointer. Data type: int8 - * @return The function returns ARM_CMSIS_NN_SUCCESS + * + * @return The function returns either + * ARM_CMSIS_NN_ARG_ERROR if argument constraints fail, or + * ARM_CMSIS_NN_SUCCESS on successful completion. * * @details * - Supported framework: TensorFlow Lite @@ -1049,8 +1052,24 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx, const cmsis_nn_dims *output_dims, int8_t *output_data); +/** + * @brief Calculate vector sums that may be required by arm_fully_connected_s8(). + * @param[in, out] vector_sum_buf Buffer for vector sums + * @param[in] vector_cols Number of vector columns + * @param[in] vector_rows Number of vector rows + * @param[in] vector_data Vector or weights data + * @return The function returns + * ARM_CMSIS_NN_SUCCESS - Successful operation + * ARM_CMSIS_NN_NO_IMPL_ERROR - If not compiled for an Arm(R) Helium Architecture (MVE) target. + */ +arm_cmsis_nn_status arm_vector_sum_s8(int32_t *vector_sum_buf, + const int32_t vector_cols, + const int32_t vector_rows, + const int8_t *vector_data); + /** * @brief Get size of additional buffer required by arm_fully_connected_s8(). + * See also arm_vector_sum_s8, which is required if buffer size is > 0. * @param[in] filter_dims dimension of filter * @return The function returns required buffer size in bytes * * @@ -1851,6 +1870,15 @@ void arm_concatenation_s8_w(const int8_t *input, /** * @brief s8 SVDF function with 8 bit state tensor and 8 bit time weights * + * @param[in, out] ctx Function context (e.g. temporary buffer). Check the function + * definition file to see if an additional buffer is required. + * Optional function arm_svdf_s8_get_buffer_size() provides the buffer + * size if an additional buffer is required. + * The caller is expected to clear the buffer, if applicable, for security reasons. * @param[in] input_ctx Temporary scratch buffer * The caller is expected to clear the buffer, if applicable, for security reasons. * @param[in] output_ctx Temporary output scratch buffer @@ -1873,12 +1901,15 @@ void arm_concatenation_s8_w(const int8_t *input, * @param[in] output_dims Output tensor dimensions * @param[out] output_data Pointer to the output tensor * - * @return The function returns ARM_CMSIS_NN_SUCCESS + * @return The function returns either + * ARM_CMSIS_NN_ARG_ERROR if argument constraints fail, or + * ARM_CMSIS_NN_SUCCESS on successful completion. * * @details * 1.
Supported framework: TensorFlow Lite micro */ -arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, +arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *ctx, + const cmsis_nn_context *input_ctx, const cmsis_nn_context *output_ctx, const cmsis_nn_svdf_params *svdf_params, const cmsis_nn_per_tensor_quant_params *input_quant_params, @@ -2012,6 +2043,34 @@ arm_cmsis_nn_status arm_lstm_unidirectional_s16_s8(cmsis_nn_lstm_context *scratc int16_t *cell_state, int8_t *output_data); +/** + * @brief Get size of additional buffer required by arm_svdf_s8(). + * @param[in] filter_dims dimension of filter + * @return The function returns required buffer size in bytes + * + */ +int32_t arm_svdf_s8_get_buffer_size(const cmsis_nn_dims *filter_dims); + +/** + * @brief Get size of additional buffer required by arm_svdf_s8() for processors with DSP extension. + * Refer to arm_svdf_s8_get_buffer_size() for function argument details. + * + * @note Intended for compilation on Host. If compiling for an Arm target, use + * arm_svdf_s8_get_buffer_size(). + * + */ +int32_t arm_svdf_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims); + +/** + * @brief Get size of additional buffer required by arm_svdf_s8() for Arm(R) Helium Architecture case. + * Refer to arm_svdf_s8_get_buffer_size() for function argument details. + * + * @note Intended for compilation on Host. If compiling for an Arm target, use + * arm_svdf_s8_get_buffer_size(). + * + */ +int32_t arm_svdf_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims); + #ifdef __cplusplus } #endif diff --git a/Include/arm_nnsupportfunctions.h b/Include/arm_nnsupportfunctions.h index 09f5bc84..04feada6 100644 --- a/Include/arm_nnsupportfunctions.h +++ b/Include/arm_nnsupportfunctions.h @@ -394,6 +394,7 @@ arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8(const int8_t *lhs, */ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, const int8_t *rhs, + const int32_t *kernel_sum, const int32_t *bias, int8_t *dst, const int32_t lhs_offset, diff --git a/README.md b/README.md index 97d0afb1..e713f1f6 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,6 @@ processors here are Cortex-M4 or a Cortex-M33 configured with optional DSP exten Processors with Arm Helium Technology use the Arm M-profile Vector Extension(MVE) instructions for optimization. Examples are Cortex-M55 or Cortex-M85 configured with MVE. - | Operator | C
<br> int8 | C <br> int16 | DSP <br> int8 | DSP <br> int16 | MVE <br> int8 | MVE <br>
int16 | | --------------- | ----------- | ---------- | ----------- | ------------ | ----------- | ------------ | | Conv2D | Yes | Yes | Yes | Yes | Yes | Yes | diff --git a/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c b/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c index 07618742..34b2abff 100644 --- a/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c +++ b/Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c @@ -21,8 +21,8 @@ * Title: arm_fully_connected_get_buffer_sizes_s8.c * Description: Collection of get buffer size functions for fully connected s8 layer function. * - * $Date: 31 January 2023 - * $Revision: V.1.0.0 + * $Date: 15 August 2023 + * $Revision: V.1.1.0 * * Target : Arm(R) M-Profile Architecture * @@ -39,20 +39,24 @@ * @{ */ -int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims) +int32_t arm_fully_connected_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims) { (void)filter_dims; return 0; } -int32_t arm_fully_connected_s8_get_buffer_size_dsp(const cmsis_nn_dims *filter_dims) +int32_t arm_fully_connected_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims) { - return arm_fully_connected_s8_get_buffer_size(filter_dims); + return filter_dims->c * sizeof(int32_t); } -int32_t arm_fully_connected_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_dims) +int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims) { - return arm_fully_connected_s8_get_buffer_size(filter_dims); +#if defined(ARM_MATH_MVEI) + return arm_fully_connected_s8_get_buffer_size_mve(filter_dims); +#else + return arm_fully_connected_s8_get_buffer_size_dsp(filter_dims); +#endif } /** diff --git a/Source/FullyConnectedFunctions/arm_fully_connected_s8.c b/Source/FullyConnectedFunctions/arm_fully_connected_s8.c index 55550c01..77dbe1ff 100644 --- a/Source/FullyConnectedFunctions/arm_fully_connected_s8.c +++ b/Source/FullyConnectedFunctions/arm_fully_connected_s8.c @@ -60,15 +60,24 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx, int8_t *output) { (void)bias_dims; - (void)ctx; (void)fc_params->filter_offset; int32_t batch_cnt = input_dims->n; +#if defined(ARM_MATH_MVEI) + if (ctx->buf == NULL) + { + return (ARM_CMSIS_NN_ARG_ERROR); + } +#endif + + const int32_t *kernel_sum = ctx->buf; + while (batch_cnt) { arm_nn_vec_mat_mult_t_s8(input, kernel, + kernel_sum, bias, output, fc_params->input_offset, @@ -80,6 +89,7 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx, fc_params->activation.min, fc_params->activation.max, 1L); + input += filter_dims->n; output += output_dims->c; batch_cnt--; diff --git a/Source/FullyConnectedFunctions/arm_vector_sum_s8.c b/Source/FullyConnectedFunctions/arm_vector_sum_s8.c new file mode 100644 index 00000000..0120ba1d --- /dev/null +++ b/Source/FullyConnectedFunctions/arm_vector_sum_s8.c @@ -0,0 +1,144 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_vector_sum_s8 + * Description: Generic function for calculating vector sums + * + * $Date: 5 September 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" +#include "arm_nnsupportfunctions.h" + +/** + * @ingroup Public + */ + +/** + * @addtogroup FC + * @{ + */ + +/* + * S8 vector sum function in preparation for e.g. kernel sums in fully connected and matrix multiplication layer functions + * + * Refer header file for details. + * + */ +arm_cmsis_nn_status arm_vector_sum_s8(int32_t *vector_sum_buf, + const int32_t vector_cols, + const int32_t vector_rows, + const int8_t *vector_data) +{ +#if defined(ARM_MATH_MVEI) + const int32_t row_loop_cnt = vector_rows / 4; + + for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) + { + const int32_t col_loop_cnt = (vector_cols + 15) / 16; + + const int8_t *vector_0 = vector_data; + const int8_t *vector_1 = vector_data + vector_cols; + const int8_t *vector_2 = vector_data + 2 * vector_cols; + const int8_t *vector_3 = vector_data + 3 * vector_cols; + + int32_t vector_sum_0 = 0; + int32_t vector_sum_1 = 0; + int32_t vector_sum_2 = 0; + int32_t vector_sum_3 = 0; + + uint32_t col_cnt = (uint32_t)vector_cols; + + for (int i = 0; i < col_loop_cnt; i++) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + + const int8x16_t ker_0 = vldrbq_z_s8(vector_0, p); + vector_sum_0 = vaddvaq_s8(vector_sum_0, ker_0); + + const int8x16_t ker_1 = vldrbq_z_s8(vector_1, p); + vector_sum_1 = vaddvaq_s8(vector_sum_1, ker_1); + + const int8x16_t ker_2 = vldrbq_z_s8(vector_2, p); + vector_sum_2 = vaddvaq_s8(vector_sum_2, ker_2); + + const int8x16_t ker_3 = vldrbq_z_s8(vector_3, p); + vector_sum_3 = vaddvaq_s8(vector_sum_3, ker_3); + + vector_0 += 16; + vector_1 += 16; + vector_2 += 16; + vector_3 += 16; + } + vector_data += 4 * vector_cols; + + vector_sum_buf[0] = vector_sum_0; + vector_sum_buf[1] = vector_sum_1; + vector_sum_buf[2] = vector_sum_2; + vector_sum_buf[3] = vector_sum_3; + vector_sum_buf += 4; + } + + const int32_t loop_cnt = vector_rows % 4; + + for (int i_row_loop_cnt = 0; i_row_loop_cnt < loop_cnt; i_row_loop_cnt++) + { + const int32_t col_loop_cnt = (vector_cols + 15) / 16; + + const int8_t *vector_0 = vector_data; + + int32_t vector_sum_0 = 0; + + uint32_t col_cnt = (uint32_t)vector_cols; + + for (int i = 0; i < col_loop_cnt; i++) + { + mve_pred16_t p = vctp8q(col_cnt); + col_cnt -= 16; + + const int8x16_t ker_0 = vldrbq_z_s8(vector_0, p); + vector_sum_0 = vaddvaq_s8(vector_sum_0, ker_0); + + vector_0 += 16; + } + vector_data += vector_cols; + + vector_sum_buf[i_row_loop_cnt] = vector_sum_0; + } + + return (ARM_CMSIS_NN_SUCCESS); +#else + (void)vector_sum_buf; + (void)vector_rows; + (void)vector_cols; + (void)vector_data; + + return (ARM_CMSIS_NN_NO_IMPL_ERROR); +#endif +} + +/** + * @} end of FC group + */ diff --git a/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c b/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c index 8568676c..1287d00a 100644 --- a/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c +++ b/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c @@ -57,6 +57,7 @@ #endif arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, const
int8_t *rhs, + const int32_t *kernel_sum, const int32_t *bias, int8_t *dst, const int32_t lhs_offset, @@ -70,7 +71,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, const int32_t address_offset) { #if defined(ARM_MATH_MVEI) - const int32_t row_loop_cnt = rhs_rows / 3; + const int32_t row_loop_cnt = rhs_rows / 4; const uint32x4_t address_offset_array = {0, address_offset, address_offset * 2, address_offset * 3}; for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++) @@ -78,6 +79,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, int32_t acc_0 = 0; int32_t acc_1 = 0; int32_t acc_2 = 0; + int32_t acc_3 = 0; const int32_t col_loop_cnt = (rhs_cols + 15) / 16; @@ -85,15 +87,14 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, const int8_t *rhs_0 = rhs; const int8_t *rhs_1 = rhs + rhs_cols; const int8_t *rhs_2 = rhs + 2 * rhs_cols; + const int8_t *rhs_3 = rhs + 3 * rhs_cols; - int32_t rhs_sum_0 = 0; - int32_t rhs_sum_1 = 0; - int32_t rhs_sum_2 = 0; if (bias) { acc_0 = *bias++; acc_1 = *bias++; acc_2 = *bias++; + acc_3 = *bias++; } uint32_t col_cnt = (uint32_t)rhs_cols; @@ -106,53 +107,48 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, const int8x16_t input = vldrbq_z_s8(lhs_vec, p); const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p); - rhs_sum_0 = vaddvaq_s8(rhs_sum_0, ker_0); acc_0 = vmladavaq_s8(acc_0, ker_0, input); const int8x16_t ker_1 = vldrbq_z_s8(rhs_1, p); - rhs_sum_1 = vaddvaq_s8(rhs_sum_1, ker_1); acc_1 = vmladavaq_s8(acc_1, ker_1, input); const int8x16_t ker_2 = vldrbq_z_s8(rhs_2, p); - rhs_sum_2 = vaddvaq_s8(rhs_sum_2, ker_2); acc_2 = vmladavaq_s8(acc_2, ker_2, input); + const int8x16_t ker_3 = vldrbq_z_s8(rhs_3, p); + acc_3 = vmladavaq_s8(acc_3, ker_3, input); + lhs_vec += 16; rhs_0 += 16; rhs_1 += 16; rhs_2 += 16; + rhs_3 += 16; } - rhs += 3 * rhs_cols; + rhs += 4 * rhs_cols; - int32x4_t acc = {acc_0, acc_1, acc_2, 0}; - const int32x4_t rhs_sum = {rhs_sum_0, rhs_sum_1, rhs_sum_2, 0}; + int32x4_t acc = {acc_0, acc_1, acc_2, acc_3}; + + const int32x4_t rhs_sum = {kernel_sum[0], kernel_sum[1], kernel_sum[2], kernel_sum[3]}; acc += vdupq_n_s32(lhs_offset) * rhs_sum; + kernel_sum += 4; acc = arm_requantize_mve(acc, dst_multiplier, dst_shift); acc = vaddq_s32(acc, vdupq_n_s32(dst_offset)); acc = vmaxq_s32(acc, vdupq_n_s32(activation_min)); acc = vminq_s32(acc, vdupq_n_s32(activation_max)); - const mve_pred16_t p = vctp32q(3); - if (address_offset > 1L) - { - vstrbq_scatter_offset_p_s32(dst, address_offset_array, acc, p); - } - else - { - vstrbq_p_s32(dst, acc, p); - } - dst += 3 * address_offset; + vstrbq_scatter_offset_s32(dst, address_offset_array, acc); + + dst += 4 * address_offset; } - const int loop_cnt = rhs_rows % 3; + const int loop_cnt = rhs_rows % 4; for (int i_row_loop_cnt = 0; i_row_loop_cnt < loop_cnt; i_row_loop_cnt++) { int32_t acc_0 = 0; const int32_t col_loop_cnt = (rhs_cols + 15) / 16; const int8_t *lhs_vec = lhs; const int8_t *rhs_0 = rhs; - int32_t rhs_sum_0 = 0; uint32_t col_cnt = (uint32_t)rhs_cols; for (int i = 0; i < col_loop_cnt; i++) @@ -162,7 +158,6 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, const int8x16_t input = vldrbq_z_s8(lhs_vec, p); const int8x16_t ker_0 = vldrbq_z_s8(rhs_0, p); - rhs_sum_0 = vaddvaq_s8(rhs_sum_0, ker_0); acc_0 = vmladavaq_s8(acc_0, ker_0, input); lhs_vec += 16; @@ -175,7 +170,8 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, acc_0 += *bias; bias++; } - const int32_t offsets = rhs_sum_0 * lhs_offset; + 
const int32_t rhs_sum = kernel_sum[i_row_loop_cnt]; + const int32_t offsets = rhs_sum * lhs_offset; acc_0 += offsets; acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift); acc_0 += dst_offset; @@ -187,6 +183,8 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, } #elif defined(ARM_MATH_DSP) + (void)kernel_sum; + const int32_t row_loop_cnt = rhs_rows / 2; const int16_t lhs_offset_s16 = (int16_t)lhs_offset; const uint32_t lhs_offset_s16x2 = PKHBT(lhs_offset_s16, lhs_offset_s16, 16); @@ -302,6 +300,7 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs, } #else + (void)kernel_sum; const int32_t row_loop_cnt = rhs_rows / 3; diff --git a/Source/SVDFunctions/arm_svdf_get_buffer_sizes_s8.c b/Source/SVDFunctions/arm_svdf_get_buffer_sizes_s8.c new file mode 100644 index 00000000..44b43757 --- /dev/null +++ b/Source/SVDFunctions/arm_svdf_get_buffer_sizes_s8.c @@ -0,0 +1,64 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* ---------------------------------------------------------------------- + * Project: CMSIS NN Library + * Title: arm_svdf_get_buffer_sizes_s8.c + * Description: Collection of get buffer size functions for svdf s8 layer function. 
+ * + * $Date: 5 September 2023 + * $Revision: V.1.0.0 + * + * Target : Arm(R) M-Profile Architecture + * + * -------------------------------------------------------------------- */ + +#include "arm_nnfunctions.h" + +/** + * @ingroup SVDF + */ + +/** + * @addtogroup GetBufferSizeSVDF + * @{ + */ + +int32_t arm_svdf_s8_get_buffer_size_dsp(const cmsis_nn_dims *weights_feature_dims) +{ + (void)weights_feature_dims; + return 0; +} + +int32_t arm_svdf_s8_get_buffer_size_mve(const cmsis_nn_dims *weights_feature_dims) +{ + return weights_feature_dims->n * sizeof(int32_t); +} + +int32_t arm_svdf_s8_get_buffer_size(const cmsis_nn_dims *weights_feature_dims) +{ +#if defined(ARM_MATH_MVEI) + return arm_svdf_s8_get_buffer_size_mve(weights_feature_dims); +#else + return arm_svdf_s8_get_buffer_size_dsp(weights_feature_dims); +#endif +} + +/** + * @} end of GetBufferSizeSVDF group + */ diff --git a/Source/SVDFunctions/arm_svdf_s8.c b/Source/SVDFunctions/arm_svdf_s8.c index faab5a49..a97ce3ae 100644 --- a/Source/SVDFunctions/arm_svdf_s8.c +++ b/Source/SVDFunctions/arm_svdf_s8.c @@ -21,8 +21,8 @@ * Title: arm_svdf_s8.c * Description: S8 basic SVDF layer function * - * $Date: 5 January 2023 - * $Revision: V.5.1.0 + * $Date: 5 September 2023 + * $Revision: V.6.0.0 * * Target : Arm(R) M-Profile Architecture * @@ -47,7 +47,8 @@ * */ -arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, +arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *ctx, + const cmsis_nn_context *input_ctx, const cmsis_nn_context *output_ctx, const cmsis_nn_svdf_params *svdf_params, const cmsis_nn_per_tensor_quant_params *input_quant_params, @@ -69,6 +70,13 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, (void)state_dims; (void)output_dims; +#if defined(ARM_MATH_MVEI) + if (ctx->buf == NULL) + { + return (ARM_CMSIS_NN_ARG_ERROR); + } +#endif + const int32_t multiplier_in = input_quant_params->multiplier; const int32_t shift_in = input_quant_params->shift; const int32_t multiplier_out = output_quant_params->multiplier; @@ -99,6 +107,8 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, } int32_t *buffer_b = (int32_t *)output_ctx->buf; + int32_t *kernel_sum_data = (int32_t *)ctx->buf; + // Left shift state memmove((int8_t *)state_data, (int8_t *)state_data + 1, @@ -108,11 +118,11 @@ arm_cmsis_nn_status arm_svdf_s8(const cmsis_nn_context *input_ctx, for (int i_batch = 0; i_batch < input_batches; i_batch++) { int8_t *res_ptr = state_data + (time_batches * i_batch * feature_batches) + (time_batches - 1); - const int8_t *weight = weights_feature_data; const int8_t *input = input_data + i_batch * input_height; arm_cmsis_nn_status res = arm_nn_vec_mat_mult_t_s8(input, - weight, + weights_feature_data, + kernel_sum_data, NULL, res_ptr, -zp_in, diff --git a/Tests/UnitTest/TestCases/TestData/ds_cnn_s/layer_12_fully_connected_kernel_sums_data.h b/Tests/UnitTest/TestCases/TestData/ds_cnn_s/layer_12_fully_connected_kernel_sums_data.h new file mode 100644 index 00000000..917f9f58 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/ds_cnn_s/layer_12_fully_connected_kernel_sums_data.h @@ -0,0 +1,24 @@ +/* + * SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +// This is the output of arm_vector_sum_s8() from the ds_cnn_s model. +int32_t ds_cnn_s_layer_12_fully_connected_kernel_sums[12] = + {-2931, 80, -521, -390, -576, -255, -464, -470, -486, -502, -490, -234}; diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/biases_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/biases_data.h new file mode 100644 index 00000000..ce036577 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/biases_data.h @@ -0,0 +1,6 @@ +// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0). +// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None. +#pragma once +#include + +const int32_t svdf_int8_2_biases[13] = {-108, -78, -29, -5, 25, -113, 122, -68, -32, -57, -59, -14, 13}; diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/config_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/config_data.h new file mode 100644 index 00000000..b9491339 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/config_data.h @@ -0,0 +1,19 @@ +// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0). +// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None. +#pragma once +#define SVDF_INT8_2_MULTIPLIER_IN 1717987072 +#define SVDF_INT8_2_MULTIPLIER_OUT 1099511552 +#define SVDF_INT8_2_SHIFT_1 -3 +#define SVDF_INT8_2_SHIFT_2 -11 +#define SVDF_INT8_2_IN_ACTIVATION_MIN -128 +#define SVDF_INT8_2_IN_ACTIVATION_MAX 127 +#define SVDF_INT8_2_RANK 2 +#define SVDF_INT8_2_FEATURE_BATCHES 26 +#define SVDF_INT8_2_TIME_BATCHES 3 +#define SVDF_INT8_2_INPUT_SIZE 40 +#define SVDF_INT8_2_DST_SIZE 26 +#define SVDF_INT8_2_OUT_ACTIVATION_MIN -128 +#define SVDF_INT8_2_OUT_ACTIVATION_MAX 127 +#define SVDF_INT8_2_INPUT_BATCHES 2 +#define SVDF_INT8_2_INPUT_OFFSET -12 +#define SVDF_INT8_2_OUTPUT_OFFSET 0 diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/input_sequence_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/input_sequence_data.h new file mode 100644 index 00000000..92b96632 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/input_sequence_data.h @@ -0,0 +1,18 @@ +// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0). +// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None. 
+#pragma once +#include + +const int8_t svdf_int8_2_input_sequence[240] = { + -5, -114, -7, -41, 22, -57, 100, -105, -75, -113, 55, -61, -87, -46, -6, 28, -105, 87, 95, -72, + -85, -49, -71, 67, -29, -99, 22, -47, -78, -92, -114, -85, -10, -9, 26, 62, -6, -106, -96, -117, + -90, 16, -90, -15, 63, -3, 63, 24, 119, -25, -19, 38, 119, 12, -25, -32, 125, 3, -127, 40, + -85, 27, 80, -73, -9, -14, -95, -64, 15, -99, 88, 39, 32, -7, -18, 38, -93, -81, -65, 108, + -33, -31, -3, -104, 103, 108, 116, -33, 122, -128, 90, -28, 116, 93, -107, 122, 118, -10, -124, -43, + -1, 12, -48, 117, -54, -42, 57, 106, -48, -12, 90, 121, -52, 17, -73, -127, 34, -72, 72, -97, + 2, 75, -82, 45, -3, -42, 82, -71, -109, -75, -101, 70, -128, -128, -79, -96, 61, 59, -4, -95, + -92, 77, -7, 67, 99, 110, 49, -102, -17, -103, -13, -6, -118, -23, -33, -68, -25, -50, 68, -68, + 57, 120, 74, 50, 97, 36, -114, -120, -73, -70, -23, -10, 81, 22, 74, 43, -112, 35, 122, -65, + -1, 95, 69, -27, 84, 99, -67, -71, -119, 89, 43, -107, -74, 105, -48, -25, 86, 109, -92, -29, + 80, 44, -61, 15, 119, 109, -28, -67, 84, -76, 65, 95, 60, -110, -97, 32, -52, -127, -46, 78, + 42, 116, -113, -26, 96, 27, 93, -70, -41, -11, 34, 53, -39, 126, -65, -28, -83, 4, 14, -98}; diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/output_ref_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/output_ref_data.h new file mode 100644 index 00000000..87d60d88 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/output_ref_data.h @@ -0,0 +1,7 @@ +// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0). +// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None. +#pragma once +#include + +const int8_t svdf_int8_2_output_ref[26] = {5, -9, 0, 7, 2, -3, 5, 0, 2, -10, 1, -5, -1, + -5, -9, 0, 1, 3, -2, 9, -3, 5, 4, 4, 7, 9}; diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/state_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/state_data.h new file mode 100644 index 00000000..8653560e --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/state_data.h @@ -0,0 +1,11 @@ +// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0). +// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None. +#pragma once +#include + +const int8_t svdf_int8_2_state[156] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/test_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/test_data.h new file mode 100644 index 00000000..0d467e1d --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/test_data.h @@ -0,0 +1,9 @@ +// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0). +// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None. 
+#include "biases_data.h" +#include "config_data.h" +#include "input_sequence_data.h" +#include "output_ref_data.h" +#include "state_data.h" +#include "weights_feature_data.h" +#include "weights_time_data.h" diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/weights_feature_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/weights_feature_data.h new file mode 100644 index 00000000..b91c1b1f --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/weights_feature_data.h @@ -0,0 +1,61 @@ +// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0). +// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None. +#pragma once +#include + +const int8_t svdf_int8_2_weights_feature[1040] = { + -94, -100, -91, -31, 60, -96, -93, 84, 16, -64, -79, -109, 48, 65, 118, 117, -110, -19, 29, + 83, 21, 27, -85, -40, -16, 31, 52, 79, 91, 14, -62, 100, 30, 6, -83, -44, -75, -44, + -45, -108, 7, 52, 15, -101, -30, -106, -62, 20, -51, 102, 74, -75, 54, -112, -1, -99, 122, + -113, -33, -29, 84, 102, -125, -31, -66, -34, -61, -80, 3, 66, -84, -112, -116, 19, 88, -113, + -37, -3, -82, 78, 22, 105, -97, 117, -79, 99, 78, -61, 68, 106, 99, 12, 20, 68, 16, + -26, -2, 104, 108, 115, -59, -78, -32, 78, -51, -13, 51, -81, -7, 74, 55, -13, -71, 48, + -14, -37, 43, -27, -115, -75, -22, 92, 99, 11, -31, -23, 24, 47, 41, -29, -71, -78, -92, + 0, 13, 36, -32, -47, 15, 37, -37, -76, 68, -105, 28, 91, 107, 123, -40, 123, 96, -43, + 13, -113, 55, -124, -31, -25, -118, 92, 66, -13, 43, 65, -108, 93, 47, 71, -57, -85, 3, + -95, 100, 18, 60, -92, 43, 79, -94, 106, -26, -66, -72, -74, -8, 47, -57, -49, 13, -69, + 28, 62, 90, 18, -52, 6, -70, 22, 109, -114, 18, -32, -108, 16, -31, 101, -103, 106, 122, + 115, -94, 35, -31, -105, -26, -107, 58, -19, 99, 124, 27, -16, -10, 48, 18, 65, -30, 56, + 119, -40, -107, -27, -83, 107, 117, 56, -30, -19, 68, 117, 29, -95, -95, 76, 65, 19, 105, + -51, 87, -62, 91, -84, -113, 106, -116, 57, 67, 71, -23, -79, -61, -56, -100, -86, -74, 114, + -50, -110, 107, -70, -47, -65, 56, 90, -86, 8, 1, -39, 41, 22, 24, -87, -104, 43, -17, + -102, -114, 87, -117, -106, -101, 4, 78, -64, 115, 88, 70, -66, -42, -46, 80, -45, -78, -43, + 113, -31, 58, -89, 49, -83, -111, -126, 126, 100, -87, -13, 122, -94, -90, -91, -13, 5, 105, + -75, 25, -12, -3, 101, -72, -55, 54, 112, 49, 0, 92, -114, -75, -16, 20, -24, 29, -34, + 114, 101, -69, -92, 68, -25, 95, 117, 87, 71, 106, 42, -108, -118, 21, 112, -10, 43, -71, + -65, 112, 61, 40, 2, 76, 21, 32, 7, 26, -101, 70, 83, 29, 44, -89, 44, 58, 104, + 54, 111, 17, 42, 6, 40, 63, -61, 41, 103, 30, -52, 0, -118, -3, 0, -74, 62, 3, + 91, 90, 46, 88, 62, -97, 73, 55, -116, -104, 57, 114, -104, 71, -80, 61, -120, -18, 48, + -22, 24, 14, 110, 86, 60, -60, -80, -110, 101, -22, -38, -3, -30, 66, 28, -64, 101, -79, + 31, 39, -82, -58, 111, 33, 73, -21, 123, -104, -35, -111, -70, 24, 118, 48, -60, -11, -70, + 18, 27, 106, 56, 82, 44, -43, -13, 103, 84, -114, 56, -38, -4, 15, -90, -17, -23, -58, + -21, -112, 13, -118, -5, 32, -81, -78, 55, -26, 96, -66, -63, -94, -120, 66, 99, 15, -126, + 97, 49, 38, -22, 97, -36, -111, -106, 86, -75, 87, 89, -107, -56, -36, 126, 29, 40, 35, + 12, -122, -35, -42, 21, -99, -27, 5, 45, 56, -38, 37, -88, -96, -116, 109, 90, 101, 43, + 38, -123, 102, -109, -41, -22, -113, 90, 43, -12, -126, 27, 82, -76, -29, -80, 10, 21, 53, + 87, -3, 30, 99, -55, 32, -72, -5, -116, 2, -70, 122, 51, 50, 45, -14, -85, 89, 13, + 62, 108, 42, 22, -126, 117, -28, -127, -3, -26, 100, -63, -65, 17, 41, 
6, 100, -93, 9, + 115, 12, 20, 31, 14, -9, 81, -81, -87, 37, -96, -72, 15, 52, 82, -121, -1, -26, -120, + -48, 121, -21, -107, -97, 44, 56, -48, -52, -105, 101, 53, 56, -11, 77, 124, 102, 67, -19, + 120, -67, -85, -2, 76, 24, 9, -124, -54, -62, -73, 110, -96, -64, 112, 10, 44, 78, -59, + -82, 111, 35, -39, 19, -25, -16, -28, -39, -108, 93, -31, 98, -88, 33, 82, 62, 112, -121, + -109, 104, 66, 93, -46, 11, 52, 31, -58, -70, -93, 99, -83, -56, 118, -109, 5, 25, 111, + -30, -100, -69, -22, -26, 97, -78, 76, 21, 24, 83, -77, 80, -42, 111, -10, -113, 104, 112, + -77, 92, -82, 106, 125, 17, -81, -1, -86, -102, 1, 63, -46, -105, 84, 55, 22, -53, -18, + 110, 15, 35, 5, -40, 81, 71, 100, 93, -3, 81, -53, 66, -57, 56, -40, -49, -42, 20, + -100, 43, 93, -13, 89, -121, -101, 113, 20, 74, -86, -77, 93, -109, 6, 15, 91, -75, -66, + 31, 28, 62, 41, -95, -119, -4, -104, 34, 2, -26, -118, 22, -62, -102, 19, -73, 29, -126, + -79, -96, 52, 97, 83, -11, -23, 61, 60, -77, 7, 71, -36, 106, -93, -1, -126, -64, 35, + -124, 37, 90, -59, 6, 2, -2, 2, 7, 11, -83, -117, -67, 95, -25, 43, 105, -20, -9, + -38, 84, 108, -55, -115, -46, -68, 112, -93, 109, -61, -32, -62, 41, -37, 28, -13, 8, -51, + 114, 43, 39, 39, -37, 17, 53, 126, 41, 79, 99, -80, -96, -15, 64, 85, -124, 94, -6, + 19, -54, -83, -66, 78, 54, 16, -108, 69, -65, -74, -6, 120, -121, -38, -85, -12, 5, 20, + -91, 33, -63, 90, -84, -21, 111, 110, 98, -49, -125, -74, 48, -52, -98, 102, 64, -87, -39, + 16, 75, 82, 40, -39, -96, 8, -92, 101, 82, 38, -46, -83, -76, 99, -79, 28, 100, 76, + 123, 81, -61, 39, 94, -87, 121, 112, 59, -13, 48, 21, -17, -17, -81, 91, -52, 57, -117, + -100, 99, 92, 95, 8, -83, -68, -62, -21, 100, -4, -23, -42, 36, -94, 58, 5, 35, -70, + 45, 126, 112, 75, 110, -73, 84, 70, -108, -83, -36, -11, -110, -79, -34, -18, 11, -84, 53, + 84, 10, 49, -17, -62, -72, 103, 122, -66, 102, -7, 71, 5, -78, -49, 91, 39, -98, -74, + 71, 74, -12, 17, 1, -98, -28, 11, 12, -91, -69, 15, 3, -104, 109, 12, 48, -64, -56, + -5, -33, -74, -33, -8, -63, 88, -126, -61, 70, -98, -12, 15, 19, 29, -107, 107, -8, -24, + 55, 26, 119, -43, 87, 4, 48, -11, -126, -40, 107, 122, -92, -27}; diff --git a/Tests/UnitTest/TestCases/TestData/svdf_int8_2/weights_time_data.h b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/weights_time_data.h new file mode 100644 index 00000000..e3eff344 --- /dev/null +++ b/Tests/UnitTest/TestCases/TestData/svdf_int8_2/weights_time_data.h @@ -0,0 +1,10 @@ +// Generated by test_settings.py using tensorflow version 2.10.0 (Keras version 2.10.0). +// Interpreter from tflite_micro version 0.dev20230817002213-g3bd11ea3 and revision None. 
+#pragma once +#include + +const int8_t svdf_int8_2_weights_time[78] = { + 98, 57, 10, 25, 18, -46, 8, -55, -128, 57, 49, -13, 3, -5, 101, -128, -13, 45, 67, 13, + -49, 111, 73, 48, -113, -17, -83, -49, 71, 4, -74, 86, 54, -29, 6, -20, 94, 124, 113, -69, + 34, 21, 110, 71, 68, 8, 80, 38, -94, 82, -20, 4, 10, -124, 86, 110, -33, -3, 12, -99, + -116, -107, 74, 45, -44, 51, 114, 78, -116, 82, -48, -100, -107, 84, 110, -118, 32, 105}; diff --git a/Tests/UnitTest/TestCases/test_arm_ds_cnn_l_s8/test_arm_ds_cnn_l_s8.c b/Tests/UnitTest/TestCases/test_arm_ds_cnn_l_s8/test_arm_ds_cnn_l_s8.c index 9d26e64c..1e6eafd9 100644 --- a/Tests/UnitTest/TestCases/test_arm_ds_cnn_l_s8/test_arm_ds_cnn_l_s8.c +++ b/Tests/UnitTest/TestCases/test_arm_ds_cnn_l_s8/test_arm_ds_cnn_l_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -469,6 +469,10 @@ void ds_cnn_l_s8_inference(void) bias_dims.c = in_out_dim_1.c; +#if defined(ARM_MATH_MVEI) + arm_vector_sum_s8(ctx.buf, conv_filter_dims.n, in_out_dim_1.c, ds_cnn_l_layer_14_fully_connected_weights); +#endif + status |= arm_fully_connected_s8(&ctx, &fc_params, &per_tensor_quant_params, diff --git a/Tests/UnitTest/TestCases/test_arm_ds_cnn_s_s8/test_arm_ds_cnn_s_s8.c b/Tests/UnitTest/TestCases/test_arm_ds_cnn_s_s8/test_arm_ds_cnn_s_s8.c index 21fb972d..5862df7b 100644 --- a/Tests/UnitTest/TestCases/test_arm_ds_cnn_s_s8/test_arm_ds_cnn_s_s8.c +++ b/Tests/UnitTest/TestCases/test_arm_ds_cnn_s_s8/test_arm_ds_cnn_s_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2022-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -19,6 +19,7 @@ #include "arm_nnfunctions.h" #include "unity.h" +#include "../TestData/ds_cnn_s/layer_12_fully_connected_kernel_sums_data.h" #include "../TestData/ds_cnn_s/test_data.h" #include "../Utils/validate.h" @@ -107,6 +108,7 @@ void ds_cnn_s_s8_inference(void) /* Test for a complete int8 DS_CNN_S keyword spotting network from https://github.com/ARM-software/ML-zoo & * Tag: 22.02 */ cmsis_nn_context ctx; + cmsis_nn_context ctx_kernel_sum; const arm_cmsis_nn_status expected = ARM_CMSIS_NN_SUCCESS; ctx.size = ds_cnn_s_s8_get_buffer_size(); @@ -408,7 +410,13 @@ void ds_cnn_s_s8_inference(void) bias_dims.c = in_out_dim_1.c; - status |= arm_fully_connected_s8(&ctx, +#if defined(ARM_MATH_MVEI) + ctx_kernel_sum.buf = ds_cnn_s_layer_12_fully_connected_kernel_sums; +#else + ctx_kernel_sum = ctx; +#endif + + status |= arm_fully_connected_s8(&ctx_kernel_sum, &fc_params, &per_tensor_quant_params, &in_out_dim_0, diff --git a/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/test_arm_fully_connected_s8.c b/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/test_arm_fully_connected_s8.c index 75393c34..00575a14 100644 --- a/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/test_arm_fully_connected_s8.c +++ b/Tests/UnitTest/TestCases/test_arm_fully_connected_s8/test_arm_fully_connected_s8.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -64,10 +64,15 @@ void fully_connected_arm_fully_connected_s8(void) quant_params.multiplier = FULLY_CONNECTED_OUTPUT_MULTIPLIER; quant_params.shift = 
FULLY_CONNECTED_OUTPUT_SHIFT; - int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); + const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); ctx.buf = malloc(buf_size); ctx.size = buf_size; +#if defined(ARM_MATH_MVEI) + int32_t *buf = ctx.buf; + TEST_ASSERT_EQUAL(expected, arm_vector_sum_s8(buf, filter_dims.n, output_dims.c, kernel_data)); +#endif + arm_cmsis_nn_status result = arm_fully_connected_s8(&ctx, &fc_params, &quant_params, @@ -122,9 +127,15 @@ void fully_connected_mve_0_arm_fully_connected_s8(void) quant_params.multiplier = FULLY_CONNECTED_MVE_0_OUTPUT_MULTIPLIER; quant_params.shift = FULLY_CONNECTED_MVE_0_OUTPUT_SHIFT; - int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); + const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); ctx.buf = malloc(buf_size); ctx.size = buf_size; + +#if defined(ARM_MATH_MVEI) + int32_t *buf = ctx.buf; + TEST_ASSERT_EQUAL(expected, arm_vector_sum_s8(buf, filter_dims.n, output_dims.c, kernel_data)); +#endif + arm_cmsis_nn_status result = arm_fully_connected_s8(&ctx, &fc_params, &quant_params, @@ -178,9 +189,15 @@ void fully_connected_mve_1_arm_fully_connected_s8(void) quant_params.multiplier = FULLY_CONNECTED_MVE_1_OUTPUT_MULTIPLIER; quant_params.shift = FULLY_CONNECTED_MVE_1_OUTPUT_SHIFT; - int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); + const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); ctx.buf = malloc(buf_size); ctx.size = buf_size; + +#if defined(ARM_MATH_MVEI) + int32_t *buf = ctx.buf; + TEST_ASSERT_EQUAL(expected, arm_vector_sum_s8(buf, filter_dims.n, output_dims.c, kernel_data)); +#endif + arm_cmsis_nn_status result = arm_fully_connected_s8(&ctx, &fc_params, &quant_params, @@ -245,9 +262,15 @@ void fully_connected_null_bias_0_arm_fully_connected_s8(void) } TEST_ASSERT_EQUAL(expected, ip_check); - int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); + const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); ctx.buf = malloc(buf_size); ctx.size = buf_size; + +#if defined(ARM_MATH_MVEI) + int32_t *buf = ctx.buf; + TEST_ASSERT_EQUAL(expected, arm_vector_sum_s8(buf, filter_dims.n, output_dims.c, kernel_data)); +#endif + arm_cmsis_nn_status result = arm_fully_connected_s8(&ctx, &fc_params, &quant_params, @@ -301,9 +324,15 @@ void fully_connected_out_activation_arm_fully_connected_s8(void) quant_params.multiplier = FULLY_CONNECTED_OUT_ACTIVATION_OUTPUT_MULTIPLIER; quant_params.shift = FULLY_CONNECTED_OUT_ACTIVATION_OUTPUT_SHIFT; - int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); + const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims); ctx.buf = malloc(buf_size); ctx.size = buf_size; + +#if defined(ARM_MATH_MVEI) + int32_t *buf = ctx.buf; + TEST_ASSERT_EQUAL(expected, arm_vector_sum_s8(buf, filter_dims.n, output_dims.c, kernel_data)); +#endif + arm_cmsis_nn_status result = arm_fully_connected_s8(&ctx, &fc_params, &quant_params, diff --git a/Tests/UnitTest/TestCases/test_arm_svdf_s8/Unity/unity_test_arm_svdf_s8.c b/Tests/UnitTest/TestCases/test_arm_svdf_s8/Unity/unity_test_arm_svdf_s8.c index ff493a4f..6db62bf9 100644 --- a/Tests/UnitTest/TestCases/test_arm_svdf_s8/Unity/unity_test_arm_svdf_s8.c +++ b/Tests/UnitTest/TestCases/test_arm_svdf_s8/Unity/unity_test_arm_svdf_s8.c @@ -45,3 +45,4 @@ void setUp(void) void tearDown(void) {} void test_svdf_int8_arm_s8(void) { svdf_int8_arm_svdf_s8(); } +void 
test_svdf_int8_2_arm_s8(void) { svdf_int8_2_arm_svdf_s8(); } diff --git a/Tests/UnitTest/TestCases/test_arm_svdf_s8/test_arm_svdf_s8.c b/Tests/UnitTest/TestCases/test_arm_svdf_s8/test_arm_svdf_s8.c index b71d5a5d..43f7d26e 100644 --- a/Tests/UnitTest/TestCases/test_arm_svdf_s8/test_arm_svdf_s8.c +++ b/Tests/UnitTest/TestCases/test_arm_svdf_s8/test_arm_svdf_s8.c @@ -20,6 +20,7 @@ #include #include "../TestData/svdf_int8/test_data.h" +#include "../TestData/svdf_int8_2/test_data.h" #include "../Utils/validate.h" #define REPEAT_NUM (1) @@ -68,6 +69,16 @@ void svdf_int8_arm_svdf_s8(void) const int scratch_size = SVDF_INT8_INPUT_BATCHES * SVDF_INT8_FEATURE_BATCHES * sizeof(int32_t); const int scratch_size_out = SVDF_INT8_INPUT_BATCHES * number_units * sizeof(int32_t); + cmsis_nn_context ctx; + const int32_t buf_size = arm_svdf_s8_get_buffer_size(&weights_feature_dims); + ctx.buf = malloc(buf_size); + ctx.size = buf_size; + +#if defined(ARM_MATH_MVEI) + int32_t *kernel_sum_buf = ctx.buf; + arm_vector_sum_s8(kernel_sum_buf, input_dims.h, weights_feature_dims.n, weights_feature_data); +#endif + // + SVDF_INT8_TIME_BATCHES additional bytes to make sure it is not overwritten const int state_data_size = sizeof(svdf_int8_state) + SVDF_INT8_TIME_BATCHES; const int8_t initial_data = 66; @@ -86,7 +97,8 @@ void svdf_int8_arm_svdf_s8(void) for (int j = 0; j < number_inputs; j++) { memcpy(input_data, svdf_int8_input_sequence + j * input_round_size, input_round_size); - arm_cmsis_nn_status result = arm_svdf_s8(&input_ctx, + arm_cmsis_nn_status result = arm_svdf_s8(&ctx, + &input_ctx, &output_ctx, &svdf_int8_params, &input_quant_params, @@ -109,6 +121,13 @@ void svdf_int8_arm_svdf_s8(void) TEST_ASSERT_TRUE(validate(output_data, output_ref, output_ref_size)); } + if (ctx.buf) + { + // The caller is responsible to clear the scratch buffers for security reasons if applicable. 
+ memset(ctx.buf, 0, buf_size); + free(ctx.buf); + } + // Make sure state data is not written outside boundary for (int i = sizeof(svdf_int8_state); i < state_data_size; i++) { @@ -120,3 +139,108 @@ void svdf_int8_arm_svdf_s8(void) free(input_ctx.buf); free(output_ctx.buf); } + +void svdf_int8_2_arm_svdf_s8(void) +{ + const int32_t output_ref_size = SVDF_INT8_2_DST_SIZE; + const int8_t *output_ref = svdf_int8_2_output_ref; + const arm_cmsis_nn_status expected = ARM_CMSIS_NN_SUCCESS; + cmsis_nn_context input_ctx; + cmsis_nn_context output_ctx; + cmsis_nn_svdf_params svdf_int8_2_params; + cmsis_nn_dims input_dims; + cmsis_nn_dims weights_feature_dims; + cmsis_nn_dims weights_time_dims; + cmsis_nn_dims state_dims; + cmsis_nn_dims output_dims; + cmsis_nn_dims bias_dims; + cmsis_nn_per_tensor_quant_params input_quant_params; + cmsis_nn_per_tensor_quant_params output_quant_params; + int8_t output_data[SVDF_INT8_2_DST_SIZE] = {1}; + const int8_t *weights_feature_data = svdf_int8_2_weights_feature; + const int8_t *weights_time_data = svdf_int8_2_weights_time; + + input_dims.n = SVDF_INT8_2_INPUT_BATCHES; + input_dims.h = SVDF_INT8_2_INPUT_SIZE; + weights_feature_dims.n = SVDF_INT8_2_FEATURE_BATCHES; + weights_time_dims.h = SVDF_INT8_2_TIME_BATCHES; + + input_quant_params.multiplier = SVDF_INT8_2_MULTIPLIER_IN; + input_quant_params.shift = SVDF_INT8_2_SHIFT_1; + output_quant_params.multiplier = SVDF_INT8_2_MULTIPLIER_OUT; + output_quant_params.shift = SVDF_INT8_2_SHIFT_2; + + svdf_int8_2_params.input_activation.min = SVDF_INT8_2_IN_ACTIVATION_MIN; + svdf_int8_2_params.input_activation.max = SVDF_INT8_2_IN_ACTIVATION_MAX; + svdf_int8_2_params.output_activation.min = SVDF_INT8_2_OUT_ACTIVATION_MIN; + svdf_int8_2_params.output_activation.max = SVDF_INT8_2_OUT_ACTIVATION_MAX; + svdf_int8_2_params.input_offset = SVDF_INT8_2_INPUT_OFFSET; + svdf_int8_2_params.output_offset = SVDF_INT8_2_OUTPUT_OFFSET; + svdf_int8_2_params.rank = SVDF_INT8_2_RANK; + + const int input_round_size = SVDF_INT8_2_INPUT_BATCHES * SVDF_INT8_2_INPUT_SIZE; + const int number_inputs = sizeof(svdf_int8_2_input_sequence) / input_round_size; + const int32_t number_units = SVDF_INT8_2_FEATURE_BATCHES / SVDF_INT8_2_RANK; + const int scratch_size = SVDF_INT8_2_INPUT_BATCHES * SVDF_INT8_2_FEATURE_BATCHES * sizeof(int32_t); + const int scratch_size_out = SVDF_INT8_2_INPUT_BATCHES * number_units * sizeof(int32_t); + + cmsis_nn_context ctx; + const int32_t buf_size = arm_svdf_s8_get_buffer_size(&weights_feature_dims); + ctx.buf = malloc(buf_size); + ctx.size = buf_size; + +#if defined(ARM_MATH_MVEI) + int32_t *kernel_sum_buf = ctx.buf; + arm_vector_sum_s8(kernel_sum_buf, input_dims.h, weights_feature_dims.n, weights_feature_data); +#endif + + const int state_data_size = sizeof(svdf_int8_2_state); + + input_ctx.buf = malloc(scratch_size); + output_ctx.buf = malloc(scratch_size_out); + + int8_t *input_data = malloc(input_round_size); + int8_t *state_data = malloc(state_data_size); + + for (int i = 0; i < REPEAT_NUM; i++) + { + memcpy(state_data, svdf_int8_2_state, sizeof(svdf_int8_2_state)); + for (int j = 0; j < number_inputs; j++) + { + memcpy(input_data, svdf_int8_2_input_sequence + j * input_round_size, input_round_size); + arm_cmsis_nn_status result = arm_svdf_s8(&ctx, + &input_ctx, + &output_ctx, + &svdf_int8_2_params, + &input_quant_params, + &output_quant_params, + &input_dims, + input_data, + &state_dims, + state_data, + &weights_feature_dims, + weights_feature_data, + &weights_time_dims, + weights_time_data, + &bias_dims, + 
svdf_int8_2_biases, + &output_dims, + output_data); + TEST_ASSERT_EQUAL(expected, result); + } + + TEST_ASSERT_TRUE(validate(output_data, output_ref, output_ref_size)); + } + + if (ctx.buf) + { + // The caller is responsible to clear the scratch buffers for security reasons if applicable. + memset(ctx.buf, 0, buf_size); + free(ctx.buf); + } + + free(state_data); + free(input_data); + free(input_ctx.buf); + free(output_ctx.buf); +} diff --git a/Tests/UnitTest/generate_test_data.py b/Tests/UnitTest/generate_test_data.py index cbc154e4..fe3f0775 100755 --- a/Tests/UnitTest/generate_test_data.py +++ b/Tests/UnitTest/generate_test_data.py @@ -1724,6 +1724,22 @@ def load_testdata_sets(regenerate_input, regenerate_weights, regenerate_biases, generate_bias=False, int8_time_weights=True, interpreter=interpreter) + dataset = 'svdf_int8_2' + testdata_sets[dataset] = SVDFSettings(dataset, + type_of_test, + regenerate_weights, + regenerate_input, + regenerate_biases, + schema_file, + batches=2, + number_inputs=3, + rank=2, + memory_size=3, + input_size=40, + number_units=13, + input_zp=-12, + int8_time_weights=True, + interpreter=interpreter) type_of_test = 'add' dataset = 'add'
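Taken together, the fully-connected changes above introduce a new calling convention on MVE builds: arm_fully_connected_s8_get_buffer_size() now returns one int32_t per output channel, and that buffer must be filled with kernel (weight-row) sums via the new arm_vector_sum_s8() before arm_fully_connected_s8() is called (a NULL ctx->buf is rejected with ARM_CMSIS_NN_ARG_ERROR on MVE). The sketch below mirrors the unit-test usage in this patch; it is an illustrative example only, the wrapper name run_fc_s8() is invented here, and the parameter and dimension structs are assumed to be configured by the caller exactly as in the existing tests.

```c
#include <stdlib.h>
#include <string.h>

#include "arm_nnfunctions.h"

/* Hedged sketch (not part of the patch): run one s8 fully-connected layer with
 * the kernel-sum scratch buffer introduced in this change set. The wrapper name
 * is hypothetical; all parameter and dimension structs are assumed to be filled
 * in by the caller, as in the unit tests above. */
arm_cmsis_nn_status run_fc_s8(const cmsis_nn_fc_params *fc_params,
                              const cmsis_nn_per_tensor_quant_params *quant_params,
                              const cmsis_nn_dims *input_dims,
                              const int8_t *input_data,
                              const cmsis_nn_dims *filter_dims,
                              const int8_t *kernel_data,
                              const cmsis_nn_dims *bias_dims,
                              const int32_t *bias_data,
                              const cmsis_nn_dims *output_dims,
                              int8_t *output_data)
{
    cmsis_nn_context ctx;
    const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(filter_dims);

    /* 0 bytes on scalar/DSP builds, one int32_t per output channel on MVE builds. */
    ctx.buf = (buf_size > 0) ? malloc(buf_size) : NULL;
    ctx.size = buf_size;
    if (buf_size > 0 && ctx.buf == NULL)
    {
        return ARM_CMSIS_NN_ARG_ERROR;
    }

#if defined(ARM_MATH_MVEI)
    /* Fill the buffer with per-row sums of the weight matrix before the layer
     * call; arm_fully_connected_s8() rejects a NULL buffer on MVE targets. */
    const arm_cmsis_nn_status sum_status =
        arm_vector_sum_s8(ctx.buf, filter_dims->n, output_dims->c, kernel_data);
    if (sum_status != ARM_CMSIS_NN_SUCCESS)
    {
        free(ctx.buf);
        return sum_status;
    }
#endif

    const arm_cmsis_nn_status result = arm_fully_connected_s8(&ctx,
                                                              fc_params,
                                                              quant_params,
                                                              input_dims,
                                                              input_data,
                                                              filter_dims,
                                                              kernel_data,
                                                              bias_dims,
                                                              bias_data,
                                                              output_dims,
                                                              output_data);

    if (ctx.buf)
    {
        /* Clear before freeing, as the API documentation recommends for
         * security-sensitive deployments. */
        memset(ctx.buf, 0, (size_t)buf_size);
        free(ctx.buf);
    }
    return result;
}
```

arm_svdf_s8() follows the same pattern with its new leading ctx argument: size the buffer with arm_svdf_s8_get_buffer_size(&weights_feature_dims) and, on MVE targets, fill it with arm_vector_sum_s8(ctx.buf, input_dims.h, weights_feature_dims.n, weights_feature_data) before the call, exactly as the svdf_int8 test cases in this patch do.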