diff --git a/bangc-ops/kernels/kernel_wrapper/lib/libextops.a b/bangc-ops/kernels/kernel_wrapper/lib/libextops.a index 4500375a5..8498a04eb 100644 Binary files a/bangc-ops/kernels/kernel_wrapper/lib/libextops.a and b/bangc-ops/kernels/kernel_wrapper/lib/libextops.a differ diff --git a/bangc-ops/kernels/kernel_wrapper/wrapper.h b/bangc-ops/kernels/kernel_wrapper/wrapper.h index 1d2790a56..7eb651fd8 100644 --- a/bangc-ops/kernels/kernel_wrapper/wrapper.h +++ b/bangc-ops/kernels/kernel_wrapper/wrapper.h @@ -178,6 +178,73 @@ const mluOpTensorDescriptor_t, \ void * +#define SYNCBATCHNORMSTATS_PARAM_TYPE \ + mluOpHandle_t, const mluOpTensorDescriptor_t, const void *, const float, \ + const mluOpTensorDescriptor_t, void *, const mluOpTensorDescriptor_t, \ + void * + +#define SYNCBATCHNORMSTATS_V2_PARAM_TYPE \ + mluOpHandle_t, const mluOpTensorDescriptor_t, const void *, void *, size_t, \ + const float, const mluOpTensorDescriptor_t, void *, \ + const mluOpTensorDescriptor_t, void * + +#define SYNCBATCHNORMGATHERSTATSWITHCOUNTS_PARAM_TYPE \ + mluOpHandle_t, const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, void *, const mluOpTensorDescriptor_t, \ + void *, float, float, const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, void *, const mluOpTensorDescriptor_t, \ + void * + +#define SYNCBATCHNORMELEMT_PARAM_TYPE \ + mluOpHandle_t, const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, void * + +#define SYNCBATCHNORMBACKWADREDUCE_PARAM_TYPE \ + mluOpHandle_t, const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + 
const mluOpTensorDescriptor_t, void *, const mluOpTensorDescriptor_t, \ + void *, const mluOpTensorDescriptor_t, void *, \ + const mluOpTensorDescriptor_t, void *, const bool, const bool, \ + const bool + +#define SYNCBATCHNORMBACKWADREDUCE_V2_PARAM_TYPE \ + mluOpHandle_t, const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, void *, size_t, \ + const mluOpTensorDescriptor_t, void *, const mluOpTensorDescriptor_t, \ + void *, const mluOpTensorDescriptor_t, void *, \ + const mluOpTensorDescriptor_t, void *, const bool, const bool, \ + const bool + +#define SYNCBATCHNORMBACKWARDELEMT_PARAM_TYPE \ + mluOpHandle_t, const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, void * + +#define SYNCBATCHNORMBACKWARDELEMT_V2_PARAM_TYPE \ + mluOpHandle_t, const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, \ + const void *, const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, const void *, \ + const mluOpTensorDescriptor_t, void * + /* Kernel register */ KERNEL_REGISTER(addN, ADDN_PARAM_TYPE); KERNEL_REGISTER(addNV2, ADDNV2_PARAM_TYPE); @@ -203,4 +270,17 @@ KERNEL_REGISTER(RoiAlignBackward, ROIALIGNBACKWARD_PARAM_TYPE); KERNEL_REGISTER(RoiAlignBackwardV2, ROIALIGNBACKWARD_V2_PARAM_TYPE); KERNEL_REGISTER(RoiPoolingForward, ROIPOOLINGFORWARD_PARAM_TYPE); 
KERNEL_REGISTER(RoiPoolingBackward, ROIPOOLINGBACKWARD_PARAM_TYPE); +KERNEL_REGISTER(SyncBatchNormStats, SYNCBATCHNORMSTATS_PARAM_TYPE); +KERNEL_REGISTER(SyncBatchNormStatsV2, SYNCBATCHNORMSTATS_V2_PARAM_TYPE); +KERNEL_REGISTER(SyncBatchNormGatherStatsWithCounts, + SYNCBATCHNORMGATHERSTATSWITHCOUNTS_PARAM_TYPE); +KERNEL_REGISTER(SyncBatchNormElemt, SYNCBATCHNORMELEMT_PARAM_TYPE); +KERNEL_REGISTER(SyncBatchnormBackwardReduce, + SYNCBATCHNORMBACKWADREDUCE_PARAM_TYPE); +KERNEL_REGISTER(SyncBatchnormBackwardReduceV2, + SYNCBATCHNORMBACKWADREDUCE_V2_PARAM_TYPE); +KERNEL_REGISTER(SyncBatchNormBackwardElemt, + SYNCBATCHNORMBACKWARDELEMT_PARAM_TYPE); +KERNEL_REGISTER(SyncBatchNormBackwardElemtV2, + SYNCBATCHNORMBACKWARDELEMT_V2_PARAM_TYPE); #endif // KERNELS_KERNEL_WRAPPER_WRAPPER_H diff --git a/bangc-ops/kernels/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp b/bangc-ops/kernels/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp new file mode 100644 index 000000000..73b31d5c6 --- /dev/null +++ b/bangc-ops/kernels/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp @@ -0,0 +1,49 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include "kernels/kernel_wrapper/wrapper.h" + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormBackwardElemt( + mluOpHandle_t handle, + const mluOpTensorDescriptor_t diff_y_desc, + const void *diff_y, + const mluOpTensorDescriptor_t x_desc, + const void *x, + const mluOpTensorDescriptor_t mean_desc, + const void *mean, + const mluOpTensorDescriptor_t invstd_desc, + const void *invstd, + const mluOpTensorDescriptor_t filter_desc, + const void *filter, + const mluOpTensorDescriptor_t mean_dy_desc, + const void *mean_dy, + const mluOpTensorDescriptor_t mean_dy_xmu_desc, + const void *mean_dy_xmu, + const mluOpTensorDescriptor_t diff_x_desc, + void *diff_x) { + SyncBatchNormBackwardElemtWrapper wrapper; + mluOpStatus_t ret = wrapper.invoke(handle, diff_y_desc, diff_y, x_desc, + x, mean_desc, mean, invstd_desc, invstd, filter_desc, filter, + mean_dy_desc, mean_dy, mean_dy_xmu_desc, mean_dy_xmu, diff_x_desc, + diff_x); + return ret; +} diff --git a/bangc-ops/kernels/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp b/bangc-ops/kernels/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp new file mode 100644 index 000000000..f78d67f92 --- /dev/null +++ b/bangc-ops/kernels/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp @@ -0,0 +1,42 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "kernels/kernel_wrapper/wrapper.h" + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormBackwardElemtV2( + mluOpHandle_t handle, const mluOpTensorDescriptor_t diff_y_desc, + const void *diff_y, const mluOpTensorDescriptor_t x_desc, const void *x, + const mluOpTensorDescriptor_t mean_desc, const void *mean, + const mluOpTensorDescriptor_t invstd_desc, const void *invstd, + const mluOpTensorDescriptor_t filter_desc, const void *filter, + const mluOpTensorDescriptor_t sum_dy_desc, const void *sum_dy, + const mluOpTensorDescriptor_t sum_dy_xmu_desc, const void *sum_dy_xmu, + const mluOpTensorDescriptor_t count_desc, const void *count, + const mluOpTensorDescriptor_t diff_x_desc, void *diff_x) { + SyncBatchNormBackwardElemtV2Wrapper wrapper; + mluOpStatus_t ret = wrapper.invoke( + handle, diff_y_desc, diff_y, x_desc, x, mean_desc, mean, invstd_desc, + invstd, filter_desc, filter, sum_dy_desc, sum_dy, sum_dy_xmu_desc, + sum_dy_xmu, count_desc, count, diff_x_desc, diff_x); + return ret; +} + diff --git a/bangc-ops/kernels/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp b/bangc-ops/kernels/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp new file mode 100644 index 000000000..f94c8bd24 --- /dev/null +++ b/bangc-ops/kernels/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp @@ -0,0 +1,65 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "kernels/kernel_wrapper/wrapper.h" + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchnormBackwardReduce( + mluOpHandle_t handle, const mluOpTensorDescriptor_t desc_dz, const void *dz, + const mluOpTensorDescriptor_t desc_x, const void *x, + const mluOpTensorDescriptor_t desc_mean, const void *mean, + const mluOpTensorDescriptor_t desc_invstd, const void *invstd, + const mluOpTensorDescriptor_t desc_dfilter, void *dfilter, + const mluOpTensorDescriptor_t desc_dbias, void *dbias, + const mluOpTensorDescriptor_t desc_sum_dy, void *sum_dy, + const mluOpTensorDescriptor_t desc_sum_dy_xmu, void *sum_dy_xmu, + const bool needs_input_grad0, const bool needs_input_grad1, + const bool needs_input_grad2) { + SyncBatchnormBackwardReduceWrapper wrapper; + mluOpStatus_t ret = + wrapper.invoke(handle, desc_dz, dz, desc_x, x, desc_mean, mean, + desc_invstd, invstd, desc_dfilter, dfilter, desc_dbias, + dbias, desc_sum_dy, sum_dy, desc_sum_dy_xmu, sum_dy_xmu, + needs_input_grad0, needs_input_grad1, needs_input_grad2); + return ret; +} + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchnormBackwardReduce_v2( + mluOpHandle_t handle, const mluOpTensorDescriptor_t desc_dz, const void *dz, + const mluOpTensorDescriptor_t desc_x, const void *x, + const mluOpTensorDescriptor_t desc_mean, const void *mean, + const mluOpTensorDescriptor_t desc_invstd, const void *invstd, + void *workspace, size_t workspace_size, + const mluOpTensorDescriptor_t desc_dfilter, void *dfilter, + const mluOpTensorDescriptor_t desc_dbias, void *dbias, + const mluOpTensorDescriptor_t desc_sum_dy, void *sum_dy, + const mluOpTensorDescriptor_t desc_sum_dy_xmu, void *sum_dy_xmu, + const bool needs_input_grad0, const bool needs_input_grad1, + const bool needs_input_grad2) { + SyncBatchnormBackwardReduceV2Wrapper wrapper; + mluOpStatus_t ret = wrapper.invoke( + handle, desc_dz, dz, desc_x, x, desc_mean, mean, desc_invstd, invstd, + workspace, 
workspace_size, desc_dfilter, dfilter, desc_dbias, dbias, + desc_sum_dy, sum_dy, desc_sum_dy_xmu, sum_dy_xmu, needs_input_grad0, + needs_input_grad1, needs_input_grad2); + return ret; +} + diff --git a/bangc-ops/kernels/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp b/bangc-ops/kernels/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp new file mode 100644 index 000000000..3de59c38f --- /dev/null +++ b/bangc-ops/kernels/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp @@ -0,0 +1,38 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "kernels/kernel_wrapper/wrapper.h" + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormElemt( + mluOpHandle_t handle, const mluOpTensorDescriptor_t x_desc, const void *x, + const mluOpTensorDescriptor_t mean_desc, const void *mean, + const mluOpTensorDescriptor_t invstd_desc, const void *invstd, + const mluOpTensorDescriptor_t filter_desc, const void *filter, + const mluOpTensorDescriptor_t bias_desc, const void *bias, + const mluOpTensorDescriptor_t y_desc, void *y) { + SyncBatchNormElemtWrapper wrapper; + mluOpStatus_t ret = + wrapper.invoke(handle, x_desc, x, mean_desc, mean, invstd_desc, invstd, + filter_desc, filter, bias_desc, bias, y_desc, y); + return ret; +} + diff --git a/bangc-ops/kernels/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp b/bangc-ops/kernels/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp new file mode 100644 index 000000000..d99916aa3 --- /dev/null +++ b/bangc-ops/kernels/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp @@ -0,0 +1,50 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include "kernels/kernel_wrapper/wrapper.h" + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormGatherStatsWithCounts( + mluOpHandle_t handle, + const mluOpTensorDescriptor_t mean_all_desc, + const void *mean_all, + const mluOpTensorDescriptor_t invstd_all_desc, + const void *invstd_all, + const mluOpTensorDescriptor_t moving_mean_desc, + void *moving_mean, + const mluOpTensorDescriptor_t moving_var_desc, + void *moving_var, + float momentum, + float eps, + const mluOpTensorDescriptor_t count_all_desc, + const void *count_all, + const mluOpTensorDescriptor_t mean_desc, + void *mean, + const mluOpTensorDescriptor_t invstd_desc, + void *invstd) { + SyncBatchNormGatherStatsWithCountsWrapper wrapper; + mluOpStatus_t ret = wrapper.invoke(handle, mean_all_desc, mean_all, + invstd_all_desc, invstd_all, moving_mean_desc, moving_mean, + moving_var_desc, moving_var, momentum, eps, count_all_desc, + count_all, mean_desc, mean, invstd_desc, invstd); + return ret; +} + diff --git a/bangc-ops/kernels/sync_batchnorm_stats/sync_batchnorm_stats.cpp b/bangc-ops/kernels/sync_batchnorm_stats/sync_batchnorm_stats.cpp new file mode 100644 index 000000000..64b7547af --- /dev/null +++ b/bangc-ops/kernels/sync_batchnorm_stats/sync_batchnorm_stats.cpp @@ -0,0 +1,45 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "kernels/kernel_wrapper/wrapper.h" + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormStats( + mluOpHandle_t handle, const mluOpTensorDescriptor_t x_desc, const void *x, + const float eps, const mluOpTensorDescriptor_t mean_desc, void *mean, + const mluOpTensorDescriptor_t invstd_desc, void *invstd) { + SyncBatchNormStatsWrapper wrapper; + mluOpStatus_t ret = wrapper.invoke(handle, x_desc, x, eps, mean_desc, mean, + invstd_desc, invstd); + return ret; +} + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormStats_v2( + mluOpHandle_t handle, const mluOpTensorDescriptor_t x_desc, const void *x, + void *workspace, size_t workspace_size, const float eps, + const mluOpTensorDescriptor_t mean_desc, void *mean, + const mluOpTensorDescriptor_t invstd_desc, void *invstd) { + SyncBatchNormStatsV2Wrapper wrapper; + mluOpStatus_t ret = + wrapper.invoke(handle, x_desc, x, workspace, workspace_size, eps, + mean_desc, mean, invstd_desc, invstd); + return ret; +} diff --git a/bangc-ops/mlu_op.h b/bangc-ops/mlu_op.h index 78343071e..b505b9b6c 100644 --- a/bangc-ops/mlu_op.h +++ b/bangc-ops/mlu_op.h @@ -13980,6 +13980,1119 @@ mluOpRoiPoolingBackward(mluOpHandle_t handle, const float spatial_scale, const mluOpTensorDescriptor_t grads_image_desc, void *grads_image); + +// Group:SyncBatchNormStats +/*! + * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra + * workspace to optimize ::mluOpSyncBatchNormStats_v2 operation. + * + * The size of extra workspace is based on the given information of ::mluOpSyncBatchNormStats_v2 + * operation, including the input tensor descriptor \b x_desc. + * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in + * ::mluOpSyncBatchNormStats_v2 operation. For detailed information, see ::mluOpHandle_t. + * @param[in] x_desc + * The descriptor of the input tensor. 
For detailed information, + * see ::mluOpTensorDescriptor_t. + * @param[out] workspace_size + * Pointer to the returned size of the extra workspace in bytes that is used in the + * ::mluOpSyncBatchNormStats_v2 operation. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - None. + * + * @par Data Layout + * - None. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - None. + * + * @par Note + * - This API is only used along with ::mluOpSyncBatchNormStats_v2. + * - ::mluOpSyncBatchNormStats does not require this API. + * + * @par Example + * - None. + * + * @par Reference + * - None. + */ +mluOpStatus_t MLUOP_WIN_API +mluOpGetSyncBatchNormStatsWorkspaceSize(mluOpHandle_t handle, + const mluOpTensorDescriptor_t x_desc, + size_t *workspace_size); + +// Group:SyncBatchNormStats +/*! + * @brief Computes the local mean and the local inverse standard deviation for each channel + * across a batch of data in the training scenario. + * + * ::mluOpSyncBatchNormStats_v2 is used in convolution network, including but not limited to + * ResNet (Residual Network), Yolo (You Only Look Once) and R-CNN (Regions with CNN features). + * + * Compared with ::mluOpSyncBatchNormStats, this function allows you to allocate some extra + * workspace as an input parameter. If you just set \b workspace to NULL and \b workspace_size + * to 0, this function will perform as same as ::mluOpSyncBatchNormStats. + * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in + * ::mluOpSyncBatchNormStats_v2 operation. For detailed information, see ::mluOpHandle_t. + * @param[in] x_desc + * The descriptor of the input tensor \b x. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] x + * Pointer to the MLU memory that stores the input tensor \b x. 
+ * @param[in] workspace + * Pointer to the MLU memory that is used as an extra workspace for ::mluOpSyncBatchNormStats_v2. + * @param[in] workspace_size + * The size of the extra workspace in bytes that needs to be used in + * ::mluOpSyncBatchNormStats_v2. You can get the size of the workspace with + * ::mluOpGetSyncBatchNormStatsWorkspaceSize function. + * @param[in] eps + * A floating-point value added to the denominator for numerical stability. + * @param[in] mean_desc + * The descriptor of the output tensor \b mean. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] mean + * Pointer to the MLU memory that stores the output tensor \b mean, which is the + * local mean. + * @param[in] invstd_desc + * The descriptor of the output tensor \b invstd. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] invstd + * Pointer to the MLU memory that stores the output tensor \b invstd, which is the + * local inverse standard deviation. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ARCH_MISMATCH, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - The supported combinations of data types are shown below with the following order: + * - float(x) - float(eps) - float(mean) - float(invstd). + * - half(x) - float(eps) - float(mean) - float(invstd). + * + * @par Data Layout + * - The supported data layout of the input tensor is shown as follows: + * - x tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and \p MLUOP_LAYOUT_NLC. + * - The layout of the output tensors is shown as follows: + * - mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - invstd tensor: \p MLUOP_LAYOUT_ARRAY. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - Before calling this function to perform ::mluOpSyncBatchNormStats_v2, you need to get + * the size of workspace by ::mluOpGetSyncBatchNormStatsWorkspaceSize. + * + * @par note + * - None. 
+ * + * @par Example + * - The example of ::mluOpSyncBatchNormStats_v2 operation is as follows: + @verbatim + input five arrays by 1 * 2 * 3 * 2 + --> x: [[[[1.0, 1.0],[1.0, 1.0],[1.0, 1.0]], + [[1.0, 1.0],[1.0, 1.0],[1.0, 1.0]]]] + param: + eps: 0.00001 + output an array by 2 + --> mean: [1.0, 1.0] + --> invstd: [316.221, 316.221] + @endverbatim + * + * @par Reference + * - https://pytorch.org/docs/1.6.0/jit_builtin_functions.html?highlight=batch_norm_stats + * + */ +mluOpStatus_t MLUOP_WIN_API +mluOpSyncBatchNormStats_v2(mluOpHandle_t handle, + const mluOpTensorDescriptor_t x_desc, + const void *x, + void *workspace, + size_t workspace_size, + const float eps, + const mluOpTensorDescriptor_t mean_desc, + void *mean, + const mluOpTensorDescriptor_t invstd_desc, + void *invstd); + +// Group:SyncBatchNormStats +/*! + * @brief Computes the local mean and the local inverse standard deviation for each channel + * across a batch of data in the training scenario. + * + * ::mluOpSyncBatchNormStats is used in CNN, including but not limited to + * ResNet (Residual Network), Yolo (You Only Look Once) and R-CNN (Regions with CNN features). + * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the + * ::mluOpSyncBatchNormStats operation. For detailed information, see ::mluOpHandle_t. + * @param[in] x_desc + * The descriptor of the input tensor \b x. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] x + * Pointer to the MLU memory that stores the input tensor \b x. + * @param[in] eps + * A floating-point value added to the denominator for numerical stability. + * @param[in] mean_desc + * The descriptor of the output tensor \b mean. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] mean + * Pointer to the MLU memory that stores the output tensor \b mean, which is the + * local mean. + * @param[in] invstd_desc + * The descriptor of the output tensor \b invstd. 
For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] invstd + * Pointer to the MLU memory that stores the output tensor \b invstd, which is the + * local inverse standard deviation. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ARCH_MISMATCH, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - The supported combinations of data types are shown below with the following order: + * - \b x - \b eps - \b mean - \b invstd + * - The supported data type combinations are: + * - float - float - float - float. + * - half - float - float - float. + * + * @par Data Layout + * - The supported data layout of the input tensor is shown as follows: + * - x tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and \p MLUOP_LAYOUT_NLC. + * - The layout of the output tensors is shown as follows: + * - mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - invstd tensor: \p MLUOP_LAYOUT_ARRAY. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - None. + * + * @par note + * - None. + * + * @par Example + * - The example of ::mluOpSyncBatchNormStats operation is as follows: + @verbatim + input five arrays by 1 * 2 * 3 * 2 + --> x: [[[[1.0, 1.0],[1.0, 1.0],[1.0, 1.0]], + [[1.0, 1.0],[1.0, 1.0],[1.0, 1.0]]]] + param: + eps: 0.00001 + output an array by 2 + --> mean: [1.0, 1.0] + --> invstd: [316.221, 316.221] + @endverbatim + * + * @par Reference + * - https://pytorch.org/docs/1.6.0/jit_builtin_functions.html?highlight=batch_norm_stats + * + */ +mluOpStatus_t MLUOP_WIN_API +mluOpSyncBatchNormStats(mluOpHandle_t handle, + const mluOpTensorDescriptor_t x_desc, + const void *x, + const float eps, + const mluOpTensorDescriptor_t mean_desc, + void *mean, + const mluOpTensorDescriptor_t invstd_desc, + void *invstd); + +// Group:SyncBatchNormGatherStatsWithCounts +/*! 
+ * @brief Computes the global mean and the global inverse standard deviation across aggregation + * of the local mean and local inverse standard deviation of multiple MLU devices. + * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in + * ::mluOpSyncBatchNormGatherStatsWithCounts. For detailed information, + * see ::mluOpHandle_t. + * @param[in] mean_all_desc + * The descriptor of the input tensor \b mean_all. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] mean_all + * Pointer to the MLU memory that stores the input tensor \b mean_all, which is + * the local mean of multiple MLU devices. + * @param[in] invstd_all_desc + * The descriptor of the input tensor \b invstd_all. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] invstd_all + * Pointer to the MLU memory that stores the input tensor \b invstd_all, which + * is the local inverse standard deviation of multiple MLU devices. + * @param[in] moving_mean_desc + * The descriptor of the input tensor \b moving_mean. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in,out] moving_mean + * Pointer to the MLU memory that stores the input tensor \b moving_mean, + * which is the moving average of mean computed over the dimensions of the input tensor + * \b mean_all. The value of this pointer can be NULL. + * @param[in] moving_var_desc + * The descriptor of the input tensor \b moving_var. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in,out] moving_var + * Pointer to the MLU memory that stores the tensor \b moving_var, which is + * the moving average of inverse standard deviation computed over the dimensions of the input + * tensor \b invstd_all. The value of this pointer can be NULL. + * @param[in] momentum + * A floating-point value used to do moving average of \b moving_mean and \b moving_var. 
+ * @param[in] eps + * A floating-point value added to the denominator for numerical stability. + * @param[in] count_all_desc + * The descriptor of the input tensor \b count_all. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] count_all + * Pointer to the MLU memory that stores an array, which stores the total size of + * dimensions (except C dimension) of input for each MLU device. + * @param[in] mean_desc + * The descriptor of the output tensor \b mean. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] mean + * Pointer to the MLU memory that stores the output tensor \b mean, which is the + * global mean. + * @param[in] invstd_desc + * The descriptor of the output tensor \b invstd. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] invstd + * Pointer to the MLU memory that stores the output tensor \b invstd, which is the + * global inverse standard deviation. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ARCH_MISMATCH, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - The supported combinations of data types are shown as the following order: + * - mean_all - invstd_all - moving_mean - moving_var - momentum - eps - count_all - mean - invstd + * - float - float - float - float - float - float - float - float - float. + * - float - float - half - half - float - float - half - float - float. + * + * @par Data Layout + * - The supported data layout of the input tensors is shown as follows: + * - mean_all tensor: \p MLUOP_LAYOUT_NC. + * - invstd_all tensor: \p MLUOP_LAYOUT_NC. + * - moving_mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - moving_var tensor: \p MLUOP_LAYOUT_ARRAY. + * - momentum: Scalar. + * - eps: Scalar. + * - count_all tensor: \p MLUOP_LAYOUT_ARRAY. + * - The layout of the output tensors is shown as follows: + * - mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - invstd tensor: \p MLUOP_LAYOUT_ARRAY. + * + * @par Scale Limitation + * - None. 
+ * + * @par API Dependency + * - None. + * + * @par note + * - The input \b mean_all and the input \b invstd_all cannot be positive infinity or negative infinity + * at the same time on MLU300 series or above. + * + * @par Example + * - The example of ::mluOpSyncBatchNormGatherStatsWithCounts operation is as follows: + @verbatim + --> mean_all: an array [8, 1024]; + --> invstd_all: an array [8, 1024]; + --> moving_mean: an array [1024]; + --> moving_var: an array [1024]; + --> count_all: an array [8]; + param: + --> momentum: 0.1 + --> eps: 0.00001 + output: + --> mean: an array [1024]; + --> invstd: [1024]; + @endverbatim + * + * @par Reference + * - https://pytorch.org/docs/1.6.0/jit_builtin_functions.html?highlight=batch_norm_stats + * + */ +mluOpStatus_t MLUOP_WIN_API +mluOpSyncBatchNormGatherStatsWithCounts(mluOpHandle_t handle, + const mluOpTensorDescriptor_t mean_all_desc, + const void *mean_all, + const mluOpTensorDescriptor_t invstd_all_desc, + const void *invstd_all, + const mluOpTensorDescriptor_t moving_mean_desc, + void *moving_mean, + const mluOpTensorDescriptor_t moving_var_desc, + void *moving_var, + float momentum, + float eps, + const mluOpTensorDescriptor_t count_all_desc, + const void *count_all, + const mluOpTensorDescriptor_t mean_desc, + void *mean, + const mluOpTensorDescriptor_t invstd_desc, + void *invstd); + +// Group:SyncBatchNormElemt +/*! + * @brief Applies Batch Normalization for each channel across a batch of data with the given mean, + * inverse variance and scaling factors. + * + * Batch Normalization is used in artificial intelligence, including but not limited to + * ResNet (Residual Network), Yolo (You Only Look Once) and R-CNN (Regions with CNN features). + * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in + * ::mluOpSyncBatchNormElemt. For detailed information, see ::mluOpHandle_t. + * @param[in] x_desc + * The descriptor of the input tensor \b x. 
For detailed information, see
+ * ::mluOpTensorDescriptor_t.
+ * @param[in] x
+ * Pointer to the MLU memory that stores the input tensor \b x.
+ * @param[in] mean_desc
+ * The descriptor of \b mean tensor. For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] mean
+ * Pointer to the MLU memory that stores the tensor \b mean, which is computed over the
+ * batch and spatial dimensions by ::mluOpSyncBatchNormGatherStatsWithCounts.
+ * @param[in] invstd_desc
+ * The descriptor of \b invstd tensor. For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[in] invstd
+ * Pointer to the MLU memory that stores the tensor \b invstd, which is the inverse variance
+ * computed over the batch and spatial dimensions by ::mluOpSyncBatchNormGatherStatsWithCounts.
+ * @param[in] filter_desc
+ * The descriptor of \b filter tensor. For detailed information, see ::mluOpTensorDescriptor_t.
+ * The descriptor can be NULL when \b filter pointer is NULL.
+ * @param[in] filter
+ * Pointer to the MLU memory that stores the input tensor \b filter for affine transformation
+ * after batch normalization. The value of this pointer can be NULL.
+ * @param[in] bias_desc
+ * The descriptor of \b bias tensor. For detailed information, see ::mluOpTensorDescriptor_t.
+ * The descriptor can be NULL when \b bias pointer is NULL.
+ * @param[in] bias
+ * Pointer to the MLU memory that stores the input tensor \b bias for affine transformation
+ * after batch normalization. The value of this pointer can be NULL.
+ * @param[in] y_desc
+ * The descriptor of the sync batch normalization output tensor \b y. For detailed information, see
+ * ::mluOpTensorDescriptor_t.
+ * @param[out] y
+ * Pointer to the MLU memory that stores the output tensor \b y.
+ * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ARCH_MISMATCH, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - The supported combinations of data types are shown below with the following order: + * - x_tensor - mean_tensor - invstd_tensor - filter_tensor - bias_tensor - y_tensor + * - float - float - float - float - float - float. + * - half - float - float - float - float - half. + * + * @par Data Layout + * - The supported data layout of \b x, \b mean, \b invstd, \b filter, \b bias and \b y is as follows: + * - x tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and \p MLUOP_LAYOUT_NLC. + * - mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - invstd tensor: \p MLUOP_LAYOUT_ARRAY. + * - filter tensor: \p MLUOP_LAYOUT_ARRAY. + * - bias tensor: \p MLUOP_LAYOUT_ARRAY. + * - y tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and \p MLUOP_LAYOUT_NLC. + * The layout of the \b y should be the same as \b x tensor. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - None. + * + * @par note + * - The \b mean, \b invstd, \b filter and \b \b bias must be 1D tensors and the length of their dimensions + * should be the same as the length of the lowest dimension of \b x. + * - The length of each dimension of \b x and \b y must be the same. + * + * @par Example + * - The example of ::mluOpSyncBatchNormElemt operation is as follows: + @verbatim + input five arrays by 1 * 2 * 3 * 2, 2, 2, 2 and 2 + --> x: [[[[1.0, 1.0],[1.0, 1.0],[1.0, 1.0]], + [[1.0, 1.0],[1.0, 1.0],[1.0, 1.0]]]] + + --> mean: [0.5, 0.5] + + --> invstd: [2.0, 2.0] + + --> filter: [0.5, 0.5] + + --> bias: [1.0, 1.0] + + output array by 1 * 2 * 3 * 2 + --> y: [[[[1.5, 1.5],[1.5, 1.5],[1.5, 1.5]], + [[1.5, 1.5],[1.5, 1.5],[1.5, 1.5]]]] + @endverbatim + * + * @par Reference + * - Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift, + * Sergey Ioffe, 2015. 
+ *
+ */
+mluOpStatus_t MLUOP_WIN_API
+mluOpSyncBatchNormElemt(mluOpHandle_t handle,
+                        const mluOpTensorDescriptor_t x_desc,
+                        const void *x,
+                        const mluOpTensorDescriptor_t mean_desc,
+                        const void *mean,
+                        const mluOpTensorDescriptor_t invstd_desc,
+                        const void *invstd,
+                        const mluOpTensorDescriptor_t filter_desc,
+                        const void *filter,
+                        const mluOpTensorDescriptor_t bias_desc,
+                        const void *bias,
+                        const mluOpTensorDescriptor_t y_desc,
+                        void *y);
+
+// Group:SyncBatchnormBackwardReduce
+/*!
+ * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra
+ * workspace to optimize the sync_batchnorm_backward_reduce operation.
+ *
+ * The size of extra workspace is based on the given information of
+ * ::mluOpSyncBatchnormBackwardReduce_v2 operation, including the input tensor descriptor \b x_desc.
+ *
+ * @param[in] handle
+ * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the
+ * ::mluOpSyncBatchnormBackwardReduce_v2 operation. For detailed information, see ::mluOpHandle_t.
+ * @param[in] x_desc
+ * The descriptor of the input tensor. For detailed information, see
+ * ::mluOpTensorDescriptor_t.
+ * @param[out] workspace_size
+ * Pointer to the returned size of the extra workspace in bytes that is used in
+ * ::mluOpSyncBatchnormBackwardReduce_v2 operation.
+ *
+ * @par Return
+ * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM
+ *
+ * @par Data Type
+ * - None.
+ *
+ * @par Data Layout
+ * - None.
+ *
+ * @par Scale Limitation
+ * - None.
+ *
+ * @par API Dependency
+ * - None.
+ *
+ * @par note
+ * - This API is only used along with ::mluOpSyncBatchnormBackwardReduce_v2.
+ * - ::mluOpSyncBatchnormBackwardReduce does not require this API.
+ *
+ * @par Example
+ * - None.
+ *
+ * @par Reference
+ * - None.
+ */
+mluOpStatus_t MLUOP_WIN_API
+mluOpGetSyncBatchnormBackwardReduceWorkspaceSize(mluOpHandle_t handle,
+                                                 const mluOpTensorDescriptor_t x_desc,
+                                                 size_t *workspace_size);
+
+// Group:SyncBatchnormBackwardReduce
+/*!
+ * @brief Applies Synchronized Batch Normalization Reduce operator to backwardly compute grad + * filters, grad bias, sum_dy and sum_dy_xmu on each MLU device. + * + * Batch Normalization is used in convolution network, including but not limited to + * ResNet (Residual Network), Yolo (You Only Look Once) and R-CNN (Regions with CNN features). + * + * Compared with ::mluOpSyncBatchnormBackwardReduce, this function allows you to allocate some extra + * workspace as an input parameter. If you just set \b workspace to NULL and \b workspace_size to 0, + * this function will perform as same as ::mluOpSyncBatchnormBackwardReduce. + * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in + * ::mluOpSyncBatchnormBackwardReduce_v2 operation. For detailed information, see ::mluOpHandle_t. + * @param[in] desc_dz + * The descriptor of the input tensor \b dz. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] dz + * Pointer to the MLU memory that stores the tensor \b dz, which denotes the partial + * derivative of batch normalization forward output. + * @param[in] desc_x + * The descriptor of the input tensor \b x. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] x + * Pointer to the MLU memory that stores the input tensor \b x. + * @param[in] mean_desc + * The descriptor of \b mean tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] mean + * Pointer to the MLU memory that stores the tensor \b mean, which denotes the average + * result of input \b x. + * @param[in] desc_invstd + * The descriptor of \b invstd tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] invstd + * Pointer to the MLU memory that stores the tensor \b invstd, which denotes the inversed + * standard deviation of input \b x. 
+ * @param[in] workspace
+ * Pointer to the MLU memory that is used as an extra workspace for
+ * ::mluOpSyncBatchnormBackwardReduce_v2.
+ * @param[in] workspace_size
+ * The size of the extra workspace in bytes that needs to be used in
+ * the ::mluOpSyncBatchnormBackwardReduce_v2. You can get the size of the workspace with
+ * the ::mluOpGetSyncBatchnormBackwardReduceWorkspaceSize function.
+ * @param[out] desc_dfilters
+ * The descriptor of \b dfilters tensor. For detailed information, see ::mluOpTensorDescriptor_t.
+ * @param[out] dfilters
+ * Pointer to the MLU memory that stores the input tensor \b dfilters, which denotes
+ * partial derivative of filter in sync batch normalization forward training. It will be computed
+ * only if boolean variable \b needs_input_grad1 is true.
+ * @param[out] desc_dbias
+ * The descriptor of the sync batch normalization output tensor \b dbias. For detailed
+ * information, see ::mluOpTensorDescriptor_t.
+ * @param[out] dbias
+ * Pointer to the MLU memory that stores the output tensor \b dbias, which denotes partial
+ * derivative of bias in sync batch normalization forward training. It will be computed
+ * only if \b needs_input_grad2 is true.
+ * @param[out] desc_sum_dy
+ * The descriptor of the sync batch normalization output tensor \b sum_dy. For detailed
+ * information, see ::mluOpTensorDescriptor_t.
+ * @param[out] sum_dy
+ * Pointer to the MLU memory that stores the output tensor \b sum_dy, which denotes the
+ * summation of dz and is also an intermediate variable to compute the partial derivative of
+ * input x. Moreover, it will be computed only if boolean variable \b needs_input_grad0 is true.
+ * @param[out] desc_sum_dy_xmu
+ * The descriptor of the sync batch normalization output tensor \b sum_dy_xmu. For detailed
+ * information, see ::mluOpTensorDescriptor_t.
+ * @param[out] sum_dy_xmu
+ * Pointer to the MLU memory that stores the output tensor \b sum_dy_xmu, which denotes
+ * sum{dz(x-mean)}.
It is also an intermediate variable to compute the partial derivative of + * input \b x. Moreover, it will be computed only if boolean variable \b needs_input_grad0 is + * true. + * @param[in] needs_input_grad0 + * A boolean variable that determines whether to compute \b sum_dy and \b sum_dy_xmu. + * When \b needs_input_grad0 is true, \b sum_dy and \b sum_dy_xmu will be computed. + * When \b needs_input_grad0 is false, \b sum_dy and \b sum_dy_xmu will be NULL. + * @param[in] needs_input_grad1 + * A boolean variable that determines whether to compute \b dfilters. + * When \b needs_input_grad1 is true, \b dfilters will be computed. + * When \b needs_input_grad1 is false, \b dfilter will be NULL. + * @param[in] needs_input_grad2 + * A boolean variable that determines whether to compute \b dbias. + * When \b needs_input_grad2 is true, \b dbias will be computed. + * When \b needs_input_grad2 is false, \b dbias will be NULL. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ARCH_MISMATCH, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - The supported combinations of data types are shown below with the following order: + * - dz_tensor - x_tensor - mean_tensor - invstd_tensor - dfilter_tensor - dbias_tensor - + * sum_dy_tensor - sum_dy_xmu_tensor + * - float - float - float - float - float - float - float - float. + * - half - half - float - float - float - float - float - float. + * + * @par Data Layout + * - The supported data layout of \b dz, \b x, \b mean, \b invstd, \b dfilter, \b dbias, \b sum_dy + * and \b sum_dy_xmu is as follows: + * - dz tensor: \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NLC, \p MLUOP_LAYOUT_NC. + * - x tensor: \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NLC, \p MLUOP_LAYOUT_NC. + * - mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - invstd tensor: \p MLUOP_LAYOUT_ARRAY. + * - dfilter tensor: \p MLUOP_LAYOUT_ARRAY. + * - dbias tensor: \p MLUOP_LAYOUT_ARRAY. + * - sum_dy tensor: \p MLUOP_LAYOUT_ARRAY. 
+ * - sum_dy_xmu tensor: \p MLUOP_LAYOUT_ARRAY. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - Before calling this function to perform ::mluOpSyncBatchnormBackwardReduce_v2, you need to get + * the size of workspace by ::mluOpGetSyncBatchnormBackwardReduceWorkspaceSize. + * + * @par note + * - The \b mean, \b invstd, \b dfilter, \b bias, \b sum_dy and \b sum_dy_xmu must be 1D tensors + * and the length of the dimensions of these tensors should be the same as the length of + * the lowest dimension of \b x. + * - The length of each dimension of \b x and \b dz must be the same. + * + * @par Example + * - The example of ::mluOpSyncBatchnormBackwardReduce_v2 operation is as follows: + @verbatim + input four arrays by 1 * 2 * 3 * 2, 2, 2, 2 and 2 + --> dz: [[[[6.0, 6.0],[6.0, 6.0],[6.0, 6.0]], + [[6.0, 6.0],[6.0, 6.0],[6.0, 6.0]]]] + + --> x: [[[[3.0, 3.0],[3.0, 3.0],[3.0, 3.0]], + [[3.0, 3.0],[3.0, 3.0],[3.0, 3.0]]]] + + --> mean: [1, 1] + + --> invstd: [0.8, 0.8] + + output array by 2 + --> dfilter: [57.6, 57.6] + + --> dbias: [36.0, 36.0] + + --> sum_dy: [36.0, 36.0] + + --> sum_dy_xmu: [72.0, 72.0] + @endverbatim + * + * @par Reference + * - Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift, + * Sergey Ioffe, 2015. 
+ * + */ +mluOpStatus_t MLUOP_WIN_API +mluOpSyncBatchnormBackwardReduce_v2(mluOpHandle_t handle, + const mluOpTensorDescriptor_t desc_dz, + const void *dz, + const mluOpTensorDescriptor_t desc_x, + const void *x, + const mluOpTensorDescriptor_t desc_mean, + const void *mean, + const mluOpTensorDescriptor_t desc_invstd, + const void *invstd, + void *workspace, + size_t workspace_size, + const mluOpTensorDescriptor_t desc_dfilter, + void *dfilter, + const mluOpTensorDescriptor_t desc_dbias, + void *dbias, + const mluOpTensorDescriptor_t desc_sum_dy, + void *sum_dy, + const mluOpTensorDescriptor_t desc_sum_dy_xmu, + void *sum_dy_xmu, + const bool needs_input_grad0, + const bool needs_input_grad1, + const bool needs_input_grad2); + +// Group:SyncBatchnormBackwardReduce +/*! + * @brief Applies Synchronized Batch Normalization Reduce operator to backwardly compute grad filters, + * grad bias, sum_dy and sum_dy_xmu on each MLU device. + * + * Batch Normalization is used in CNN, including but not limited to + * ResNet (Residual Network), Yolo (You Only Look Once) and R-CNN (Regions with CNN features). + * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the + * ::mluOpSyncBatchnormBackwardReduce operation. For detailed information, see ::mluOpHandle_t. + * @param[in] desc_dz + * The descriptor of the input tensor \b dz. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] dz + * Pointer to the MLU memory that stores the tensor \b dz, which denotes the partial derivative of + * batch normalization forward output. + * @param[in] desc_x + * The descriptor of the input tensor \b x. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] x + * Pointer to the MLU memory that stores the input tensor \b x. + * @param[in] mean_desc + * The descriptor of \b mean tensor. For detailed information, see ::mluOpTensorDescriptor_t. 
+ * @param[in] mean + * Pointer to the MLU memory that stores the tensor \b mean, which denotes the average result of + * input \b x. + * @param[in] desc_invstd + * The descriptor of \b invstd tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] invstd + * Pointer to the MLU memory that stores the tensor \b invstd, which denotes the inversed standard deviation + * of input \b x. + * @param[out] desc_dfilter + * The descriptor of \b dfilter tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * @param[out] dfilter + * Pointer to the MLU memory that stores the input tensor \b dfilter, which denotes partial derivative + * of filter in sync batch normalization forward training. It will be computed only if boolean variable + * \b needs_input_grad1 is true. + * @param[out] desc_dbias + * The descriptor of the sync batch normalization output tensor \b dbias. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] dbias + * Pointer to the MLU memory that stores the output tensor \b dbias, which denotes partial derivative of + * bias in sync batch normalization forward training. It will be computed only if \b needs_input_grad2 is true. + * @param[out] desc_sum_dy + * The descriptor of the sync batch normalization output tensor \b sum_dy. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] sum_dy + * Pointer to the MLU memory that stores the output tensor \b sum_dy, which denotes the summation of dz + * and is also an intermediate variable to compute the partial derivative of input x. Moreover, it will be + * computed only if boolean variable \b needs_input_grad0 is true. + * @param[out] desc_sum_dy_xmu + * The descriptor of the sync batch normalization output tensor \b sum_dy_xmu. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] sum_dy_xmu + * Pointer to the MLU memory that stores the output tensor \b sum_dy_xmu, which denotes sum{dz(x-mean)}. 
+ * It is also an intermediate variable to compute the partial derivative of + * input \b x. Moreover, it will be computed only if boolean variable \b needs_input_grad0 is true. + * @param[in] needs_input_grad0 + * A boolean variable that determines whether to compute \b sum_dy and \b sum_dy_xmu. + * When \b needs_input_grad0 is true, \b sum_dy and \b sum_dy_xmu will be computed. + * When \b needs_input_grad0 is false, \b sum_dy and \b sum_dy_xmu will be NULL. + * @param[in] needs_input_grad1 + * A boolean variable that determines whether to compute \b dfilters. + * When \b needs_input_grad1 is true, \b dfilters will be computed. + * When \b needs_input_grad1 is false, \b dfilter will be NULL. + * @param[in] needs_input_grad2 + * A boolean variable that determines whether to compute \b dbias. + * When \b needs_input_grad2 is true, \b dbias will be computed. + * When \b needs_input_grad2 is false, \b dbias will be NULL. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ARCH_MISMATCH, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - The supported combinations of data types are shown below with the following order: + * - dz_tensor - x_tensor - mean_tensor - invstd_tensor - dfilter_tensor - dbias_tensor - sum_dy_tensor + * - sum_dy_xmu_tensor + * - float - float - float - float - float - float - float - float. + * - half - half - float - float - float - float - float - float. + * + * @par Data Layout + * - The supported data layout of \b dz, \b x, \b mean, \b invstd, \b dfilter, \b dbias, \b sum_dy and + * \b sum_dy_xmu is as follows: + * - dz tensor: \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NLC, \p MLUOP_LAYOUT_NC. + * - x tensor: \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NLC, \p MLUOP_LAYOUT_NC. + * - mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - invstd tensor: \p MLUOP_LAYOUT_ARRAY. + * - dfilter tensor: \p MLUOP_LAYOUT_ARRAY. + * - dbias tensor: \p MLUOP_LAYOUT_ARRAY. + * - sum_dy tensor: \p MLUOP_LAYOUT_ARRAY. 
+ * - sum_dy_xmu tensor: \p MLUOP_LAYOUT_ARRAY. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - None. + * + * @par note + * - The \b mean, \b invstd, \b dfilter, \b bias, \b sum_dy and \b sum_dy_xmu must be 1D tensors and the + * length of the dimensions of these tensors should be the same as the length of the lowest dimension of \b x. + * - The length of each dimension of \b x and \b dz must be the same. + * + * @par Example + * - The example of ::mluOpSyncBatchnormBackwardReduce operation is as follows: + @verbatim + input four arrays by 1 * 2 * 3 * 2, 2, 2, 2 and 2 + --> dz: [[[[6.0, 6.0],[6.0, 6.0],[6.0, 6.0]], + [[6.0, 6.0],[6.0, 6.0],[6.0, 6.0]]]] + + --> x: [[[[3.0, 3.0],[3.0, 3.0],[3.0, 3.0]], + [[3.0, 3.0],[3.0, 3.0],[3.0, 3.0]]]] + + --> mean: [1, 1] + + --> invstd: [0.8, 0.8] + + output array by 2 + --> dfilter: [57.6, 57.6] + + --> dbias: [36.0, 36.0] + + --> sum_dy: [36.0, 36.0] + + --> sum_dy_xmu: [72.0, 72.0] + @endverbatim + * + * @par Reference + * - Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift, + * Sergey Ioffe, 2015. + * + */ +mluOpStatus_t MLUOP_WIN_API +mluOpSyncBatchnormBackwardReduce(mluOpHandle_t handle, + const mluOpTensorDescriptor_t desc_dz, + const void *dz, + const mluOpTensorDescriptor_t desc_x, + const void *x, + const mluOpTensorDescriptor_t desc_mean, + const void *mean, + const mluOpTensorDescriptor_t desc_invstd, + const void *invstd, + const mluOpTensorDescriptor_t desc_dfilter, + void *dfilter, + const mluOpTensorDescriptor_t desc_dbias, + void *dbias, + const mluOpTensorDescriptor_t desc_sum_dy, + void *sum_dy, + const mluOpTensorDescriptor_t desc_sum_dy_xmu, + void *sum_dy_xmu, + const bool needs_input_grad0, + const bool needs_input_grad1, + const bool needs_input_grad2); + +// Group:SyncBatchNormBackwardElemt +/*! + * @brief Computes the gradients of input in the training scenario. 
+ * + * This function is used in artificial intelligence, including but not limited + * to ResNet (Residual Network), Yolo (You Only Look Once) and R-CNN (Regions with CNN features). + * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in the + * ::mluOpSyncBatchNormBackwardElemt operation. For detailed information, see ::mluOpHandle_t. + * @param[in] diff_y_desc + * The descriptor of the backpropagated differential tensor \b diff_y. For + * detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] diff_y + * Pointer to the MLU memory that stores the backpropagated differential tensor. + * @param[in] x_desc + * The descriptor of the input tensor \b x. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] x + * Pointer to the MLU memory that stores the input tensor. + * @param[in] mean_desc + * The descriptor of the input tensor \b mean. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] mean + * Pointer to the MLU memory that stores the global mean. + * @param[in] invstd_desc + * The descriptor of the input tensor \b invstd. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] invstd + * Pointer to the MLU memory that stores the global inverse standard deviation. + * @param[in] filter_desc + * The descriptor of the input tensor \b filter. For detailed information, see + * ::mluOpTensorDescriptor_t. The descriptor can be NULL when \b filter pointer is NULL. + * @param[in] filter + * Pointer to the MLU memory that stores the input tensor \b filter for affine + * transformation after batch normilization. The value of this pointer can be NULL. + * @param[in] mean_dy_desc + * The descriptor of the input tensor \b mean_dy. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] mean_dy + * Pointer to the MLU memory that stores the mean of diff_y. 
+ * @param[in] mean_dy_xmu_desc + * The descriptor of the input tensor \b mean_dy_xmu. For detailed information, + * see ::mluOpTensorDescriptor_t. + * @param[in] mean_dy_xmu + * Pointer to the MLU memory that stores the mean of the result of diff_y * (x - mean). + * @param[in] diff_x_desc + * The descriptor of the output tensor \b diff_x. For detailed information, + * see ::mluOpTensorDescriptor_t. + * @param[out] diff_x + * Pointer to the MLU memory that stores the derivative of input. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ARCH_MISMATCH, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - The supported combinations of data types are shown below: + * - float(\b diff_y) - float(\b x) - float(\b mean) - float(\b invstd) - float(\b filter) - + * float(\b mean_dy) - float(\b mean_dy_xmu) - float(\b diff_x). + * - half(\b diff_y) - half(\b x) - float(\b mean) - float(\b invstd) - float(\b filter) - + * float(\b mean_dy) - float(\b mean_dy_xmu) - half(\b diff_x). + * + * @par Data Layout + * - The supported data layout of \b diff_y, \b x, \b mean, \b invstd, \b filter, \b mean_dy, + * \b mean_dy_xmu and \b diff_x is as follows: + * - diff_y tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and + * \p MLUOP_LAYOUT_NLC. + * - x tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and \p MLUOP_LAYOUT_NLC. + * - mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - invstd tensor: \p MLUOP_LAYOUT_ARRAY. + * - filter tensor: \p MLUOP_LAYOUT_ARRAY. + * - mean_dy tensor: \p MLUOP_LAYOUT_ARRAY. + * - mean_dy_xmu tensor: \p MLUOP_LAYOUT_ARRAY. + * - diff_x tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and + * \p MLUOP_LAYOUT_NLC. + * - The layouts of the \b diff_x \b x and \b diff_y should be the same. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - None. 
+ *
+ * @par note
+ * - The \b mean, \b invstd, \b filter, \b mean_dy and \b mean_dy_xmu must be 1D tensors and the
+ * length of the dimension of these tensors should be the same as the length of the lowest
+ * dimension of \b x.
+ * - The length of each dimension of \b diff_y, \b x and \b diff_x must be the same.
+ *
+ * @par Example
+ * - The example of ::mluOpSyncBatchNormBackwardElemt operation is as follows:
+   @verbatim
+   input seven arrays by 1, 1, 1, 1, 1, 1 and 1
+   --> diff_y: [[[[1.0]]]]
+   --> x: [[[[2.0]]]]
+   --> mean: [3.0]
+   --> invstd: [4.0]
+   --> filter: [5.0]
+   --> mean_dy: [6.0]
+   --> mean_dy_xmu: [7.0]
+
+   output an array by 1
+   --> diff_x: [[[[-8960.0]]]]
+   @endverbatim
+ *
+ * @par Reference
+ * - https://pytorch.org/docs/1.6.0/jit_builtin_functions.html?highlight=batch_norm_backward_elemt
+ *
+ */
+mluOpStatus_t MLUOP_WIN_API
+mluOpSyncBatchNormBackwardElemt(mluOpHandle_t handle,
+                                const mluOpTensorDescriptor_t diff_y_desc,
+                                const void *diff_y,
+                                const mluOpTensorDescriptor_t x_desc,
+                                const void *x,
+                                const mluOpTensorDescriptor_t mean_desc,
+                                const void *mean,
+                                const mluOpTensorDescriptor_t invstd_desc,
+                                const void *invstd,
+                                const mluOpTensorDescriptor_t filter_desc,
+                                const void *filter,
+                                const mluOpTensorDescriptor_t mean_dy_desc,
+                                const void *mean_dy,
+                                const mluOpTensorDescriptor_t mean_dy_xmu_desc,
+                                const void *mean_dy_xmu,
+                                const mluOpTensorDescriptor_t diff_x_desc,
+                                void *diff_x);
+
+// Group:SyncBatchNormBackwardElemt
+/*!
+ * @brief Computes the gradients of input in the training scenario.
+ *
+ * This function is used in ResNet (Residual Network), Yolo (You Only Look Once) and
+ * R-CNN (Regions with CNN features).
+ *
+ * Compared with ::mluOpSyncBatchNormBackwardElemt, this function first computes the intermediate
+ * results mean_dy and mean_dy_xmu based on \b sum_dy, \b sum_dy_xmu and \b count, and then
+ * computes the gradient of \b x with the intermediate results.
+ * + * @param[in] handle + * Handle to a Cambricon MLUOP context that is used to manage MLU devices and queues in + * ::mluOpSyncBatchNormBackwardElemtV2 operation. For detailed information, see ::mluOpHandle_t. + * @param[in] diff_y_desc + * The descriptor of the backpropagated differential tensor \b diff_y. For + * detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] diff_y + * Pointer to the MLU memory that stores the backpropagated differential tensor. + * @param[in] x_desc + * The descriptor of the input tensor \b x. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] x + * Pointer to the MLU memory that stores the input tensor. + * @param[in] mean_desc + * The descriptor of the input tensor \b mean. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] mean + * Pointer to the MLU memory that stores the global mean. + * @param[in] invstd_desc + * The descriptor of the input tensor \b invstd. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] invstd + * Pointer to the MLU memory that stores the global inverse standard deviation. + * @param[in] filter_desc + * The descriptor of the input tensor \b filter. For detailed information, see + * ::mluOpTensorDescriptor_t. The descriptor can be NULL when \b filter pointer is NULL. + * @param[in] filter + * Pointer to the MLU memory that stores the input tensor \b filter for affine + * transformation after batch normalization. The value of this pointer can be NULL. + * @param[in] sum_dy_desc + * The descriptor of the input tensor \b sum_dy. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] sum_dy + * Pointer to the MLU memory that stores the sum of diff_y. + * @param[in] sum_dy_xmu_desc + * The descriptor of the input tensor \b sum_dy_xmu. For detailed information, + * see ::mluOpTensorDescriptor_t. 
+ * @param[in] sum_dy_xmu + * Pointer to the MLU memory that stores the sum of the result of diff_y * (x - mean). + * @param[in] count_desc + * The descriptor of the input tensor \b count. For detailed information, + * see ::mluOpTensorDescriptor_t. + * @param[in] count + * Pointer to the MLU memory that stores the number of the high dimensions (the dimensions + * except the lowest dimension) of the input tensor \b x on all MLU devices. + * @param[in] diff_x_desc + * The descriptor of the output tensor \b diff_x. + * @param[out] diff_x + * Pointer to the MLU memory that stores the derivative of input. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ARCH_MISMATCH, ::MLUOP_STATUS_BAD_PARAM. + * + * @par Data Type + * - The supported combinations of data types are shown below: + * - float(\b diff_y) - float(\b x) - float(\b mean) - float(\b invstd) - float(\b filter) - + * float(\b sum_dy) - float(\b sum_dy_xmu) - int32_t(\b count) - float(\b diff_x). + * - half(\b diff_y) - half(\b x) - float(\b mean) - float(\b invstd) - float(\b filter) - + * float(\b sum_dy) - float(\b sum_dy_xmu) - int32_t(\b count) - half(\b diff_x). + * + * @par Data Layout + * - The supported data layouts of \b diff_y, \b x, \b mean, \b invstd, \b filter, \b sum_dy, + * \b sum_dy_xmu and \b diff_x is as follows: + * - diff_y tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and + * \p MLUOP_LAYOUT_NLC. + * - x tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and \p MLUOP_LAYOUT_NLC. + * - mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - invstd tensor: \p MLUOP_LAYOUT_ARRAY. + * - filter tensor: \p MLUOP_LAYOUT_ARRAY. + * - sum_dy tensor: \p MLUOP_LAYOUT_ARRAY. + * - sum_dy_xmu tensor: \p MLUOP_LAYOUT_ARRAY. + * - diff_x tensor: \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NC and + * \p MLUOP_LAYOUT_NLC. + * - The layouts of the \b diff_x \b x and \b diff_y should be the same. + * + * @par Scale Limitation + * - None. 
+ * + * @par API Dependency + * - None. + * + * @par Note + * - The \b mean, \b invstd, \b filter, \b sum_dy and \b sum_dy_xmu must be 1D tensors and the + * length of the dimension of these tensors should be the same as the length of the lowest + * dimension of \b x. + * - The length of each dimension of \b diff_y, \b x and \b diff_x must be the same. + * + * @par Example + * - The example of ::mluOpSyncBatchNormBackwardElemtV2 operation is as follows: + @verbatim + input eight arrays by 1, 1, 1, 1, 1, 1, 1 and 1 + --> diff_y: [[[[1.0]]]] + --> x: [[[[2.0]]]] + --> mean: [3.0] + --> invstd: [4.0] + --> filter: [5.0] + --> sum_dy: [6.0] + --> sum_dy_xmu: [7.0] + --> count: [1] + + output an array by 1 + --> diff_x: [[[[-8960.0]]]] + @endverbatim + * + * @par Reference + * - https://pytorch.org/docs/1.11.0/jit_builtin_functions.html?highlight=batch_norm_backward_elemt + * + */ +mluOpStatus_t MLUOP_WIN_API +mluOpSyncBatchNormBackwardElemtV2(mluOpHandle_t handle, + const mluOpTensorDescriptor_t diff_y_desc, + const void *diff_y, + const mluOpTensorDescriptor_t x_desc, + const void *x, + const mluOpTensorDescriptor_t mean_desc, + const void *mean, + const mluOpTensorDescriptor_t invstd_desc, + const void *invstd, + const mluOpTensorDescriptor_t filter_desc, + const void *filter, + const mluOpTensorDescriptor_t sum_dy_desc, + const void *sum_dy, + const mluOpTensorDescriptor_t sum_dy_xmu_desc, + const void *sum_dy_xmu, + const mluOpTensorDescriptor_t count_desc, + const void *count, + const mluOpTensorDescriptor_t diff_x_desc, + void *diff_x); #if defined(__cplusplus) } #endif diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto b/bangc-ops/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto index 52e5ca575..c2bd2338c 160000 --- a/bangc-ops/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto @@ -1 +1 @@ -Subproject commit 52e5ca57549553dded7687b7a0762caac7ad39d6 +Subproject commit 
c2bd2338c67ccb4e98968563315ba27950ce68e7 diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp new file mode 100644 index 000000000..8334dd344 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp @@ -0,0 +1,158 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "sync_batch_norm_backward_elemt.h" + +#include + +namespace mluoptest { + +void cpuSyncBatchNormBackwardElemt(const float *x, const float *diff_y, + const float *weight, const float *mean, + const float *invstd, const float *mean_dy, + const float *mean_dy_xmu, float *diff_x, + const int len_x, const int len_c) { + int len_nhw = len_x / len_c; + for (int ci = 0; ci < len_c; ++ci) { + for (int i = 0; i < len_nhw; ++i) { + if (weight == nullptr) { + diff_x[i * len_c + ci] = (diff_y[i * len_c + ci] - mean_dy[ci] - + (x[i * len_c + ci] - mean[ci]) * invstd[ci] * + invstd[ci] * mean_dy_xmu[ci]) * + invstd[ci]; + } else { + diff_x[i * len_c + ci] = (diff_y[i * len_c + ci] - mean_dy[ci] - + (x[i * len_c + ci] - mean[ci]) * invstd[ci] * + invstd[ci] * mean_dy_xmu[ci]) * + weight[ci] * invstd[ci]; + } + } + } +} + +void SyncBatchNormBackwardElemtExecutor::paramCheck() { + GTEST_CHECK(parser_->getInputNum() == 6 || parser_->getInputNum() == 7, + "SyncBatchNormBackwardElemtExecutor: input number is wrong."); + GTEST_CHECK(parser_->getOutputNum() == 1, + "SyncBatchNormBackwardElemtExecutor: output number is wrong."); +} + +void SyncBatchNormBackwardElemtExecutor::compute() { + mluOpTensorDescriptor_t x_desc, diff_y_desc, diff_x_desc; + mluOpTensorDescriptor_t mean_desc, invstd_desc, weight_desc, mean_dy_desc, + mean_dy_xmu_desc; + + diff_y_desc = tensor_desc_[0].tensor; + x_desc = tensor_desc_[1].tensor; + mean_desc = tensor_desc_[2].tensor; + invstd_desc = tensor_desc_[3].tensor; + if (parser_->getInputNum() == 7) { + weight_desc = tensor_desc_[4].tensor; + mean_dy_desc = tensor_desc_[5].tensor; + mean_dy_xmu_desc = tensor_desc_[6].tensor; + diff_x_desc = tensor_desc_[7].tensor; + } else { + weight_desc = nullptr; + mean_dy_desc = tensor_desc_[4].tensor; + mean_dy_xmu_desc = tensor_desc_[5].tensor; + diff_x_desc = tensor_desc_[6].tensor; + } + + void *dev_diff_y = 
data_vector_[0].device_ptr; + void *dev_x = data_vector_[1].device_ptr; + void *dev_mean = data_vector_[2].device_ptr; + void *dev_invstd = data_vector_[3].device_ptr; + void *dev_weight = nullptr; + void *dev_mean_dy = nullptr; + void *dev_mean_dy_xmu = nullptr; + void *dev_diff_x = nullptr; + if (parser_->getInputNum() == 7) { + dev_weight = data_vector_[4].device_ptr; + dev_mean_dy = data_vector_[5].device_ptr; + dev_mean_dy_xmu = data_vector_[6].device_ptr; + dev_diff_x = data_vector_[7].device_ptr; + } else { + dev_mean_dy = data_vector_[4].device_ptr; + dev_mean_dy_xmu = data_vector_[5].device_ptr; + dev_diff_x = data_vector_[6].device_ptr; + } + + VLOG(4) << "Start to run mluOpSyncBatchNormBackwardElemt()."; + interface_timer_.start(); + MLUOP_CHECK(mluOpSyncBatchNormBackwardElemt( + handle_, diff_y_desc, dev_diff_y, x_desc, dev_x, mean_desc, dev_mean, + invstd_desc, dev_invstd, weight_desc, dev_weight, mean_dy_desc, + dev_mean_dy, mean_dy_xmu_desc, dev_mean_dy_xmu, diff_x_desc, dev_diff_x)); + interface_timer_.stop(); + VLOG(4) << "mluOpSyncBatchNormBackwardElemt() end"; +} + +void SyncBatchNormBackwardElemtExecutor::cpuCompute() { + int len_x = parser_->getInputDataCount(0); + int len_c = tensor_desc_[0].tensor->dims[tensor_desc_[0].tensor->dim - 1]; + + if (len_x == 0 || len_c == 0) { + VLOG(4) << "SyncBatchNormBackwardElemtExecutor: cpu compute zero elemt"; + return; + } + + VLOG(4) << "Start to run cpuSyncBatchNormBackwardElemt()."; + + float *cpu_diff_y = cpu_fp32_input_[0]; + float *cpu_x = cpu_fp32_input_[1]; + float *cpu_mean = cpu_fp32_input_[2]; + float *cpu_invstd = cpu_fp32_input_[3]; + float *cpu_weight = nullptr; + float *cpu_mean_dy = nullptr; + float *cpu_mean_dy_xmu = nullptr; + float *cpu_diff_x = cpu_fp32_output_[0]; + if (parser_->getInputNum() == 7) { + cpu_weight = cpu_fp32_input_[4]; + cpu_mean_dy = cpu_fp32_input_[5]; + cpu_mean_dy_xmu = cpu_fp32_input_[6]; + } else { + cpu_mean_dy = cpu_fp32_input_[4]; + cpu_mean_dy_xmu = 
cpu_fp32_input_[5]; + } + + cpuSyncBatchNormBackwardElemt(cpu_x, cpu_diff_y, cpu_weight, cpu_mean, + cpu_invstd, cpu_mean_dy, cpu_mean_dy_xmu, + cpu_diff_x, len_x, len_c); + VLOG(4) << "cpuSyncBatchNormBackwardElemt() end"; +} + +int64_t SyncBatchNormBackwardElemtExecutor::getTheoryOps() { + int64_t theory_ops = 0; + int len_x = parser_->getInputDataCount(0); + int len_c = tensor_desc_[0].tensor->dims[tensor_desc_[0].tensor->dim - 1]; + if (parser_->getInputNum() == 7) { + theory_ops = 5 * len_x + 3 * len_c; + } else { + theory_ops = 5 * len_x + 2 * len_c; + } + + VLOG(4) << "SyncBatchNormBackwardElemtExecutor: getTheoryOps: " << theory_ops + << " ops"; + return theory_ops; +} + +} // namespace mluoptest diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.h b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.h new file mode 100644 index 000000000..a2d251328 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.h @@ -0,0 +1,47 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ + +#ifndef TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCH_NORM_BACKWARD_ELEMT_SYNC_\ +BATCH_NORM_BACKWARD_ELEMT_H_ +#define TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCH_NORM_BACKWARD_ELEMT_SYNC_\ +BATCH_NORM_BACKWARD_ELEMT_H_ + +#include "executor.h" + +namespace mluoptest { + +class SyncBatchNormBackwardElemtExecutor : public Executor { + public: + SyncBatchNormBackwardElemtExecutor() {} + ~SyncBatchNormBackwardElemtExecutor() {} + + void paramCheck(); + void compute(); + void cpuCompute(); + int64_t getTheoryOps() override; +}; + +} // namespace mluoptest + +#endif // TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCH_NORM_BACKWARD_ELEMT_SYNC_\ +BATCH_NORM_BACKWARD_ELEMT_H_ diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/test_case/case_0.prototxt b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/test_case/case_0.prototxt new file mode 100644 index 000000000..1f6ac081d --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batch_norm_backward_elemt/test_case/case_0.prototxt @@ -0,0 +1,124 @@ +op_name: "sync_batch_norm_backward_elemt" +op_type: "SYNC_BATCHNORM_BACKWARD_ELEMT" +input { + id: "diff_y" + shape: { + dims: 1 + dims: 10 + dims: 128 + dims: 128 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: -2.0 + lower_bound: 2.0 + distribution: UNIFORM + } +} +input { + id: "x" + shape: { + dims: 1 + dims: 10 + dims: 
128 + dims: 128 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 243 + upper_bound: 2.0 + lower_bound: -2.0 + distribution: UNIFORM + } +} +input { + id: "mean" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: 0.5 + lower_bound: -0.25 + distribution: UNIFORM + } +} +input { + id: "invstd" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: 0.5 + lower_bound: -0.5 + distribution: UNIFORM + } +} +input { + id: "weight" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: 1.0 + lower_bound: 0.5 + distribution: UNIFORM + } +} +input { + id: "mean_dy" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: 1.0 + lower_bound: 0.5 + distribution: UNIFORM + } +} +input { + id: "mean_dy_xmu" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: 1.0 + lower_bound: 0.5 + distribution: UNIFORM + } +} +output { + id: "diff_x" + shape: { + dims: 1 + dims: 10 + dims: 128 + dims: 128 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT +} +test_param: { + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.003 + error_threshold: 0.003 + baseline_device: CPU +} diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp new file mode 100644 index 000000000..0c67b8520 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp @@ -0,0 +1,173 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "sync_batchnorm_backward_elemt_v2.h" + +namespace mluoptest { + +void cpuSyncBatchnormBackwardElemt(const float *diff_y, const float *x, + const float *mean, const float *invstd, + const float *weight, const float *sum_dy, + const float *sum_dy_xmu, const int32_t sum, + float *diff_x, const int len_x, + const int len_c) { + int len_nhw = len_x / len_c; + for (int ci = 0; ci < len_c; ++ci) { + float sum_dy_temp = sum_dy[ci] / sum; + float sum_dy_xmu_temp = sum_dy_xmu[ci] / sum; + for (int i = 0; i < len_nhw; ++i) { + if (weight == nullptr) { + diff_x[i * len_c + ci] = (diff_y[i * len_c + ci] - sum_dy_temp - + (x[i * len_c + ci] - mean[ci]) * invstd[ci] * + invstd[ci] * sum_dy_xmu_temp) * + invstd[ci]; + } else { + diff_x[i * len_c + ci] = (diff_y[i * len_c + ci] - sum_dy_temp - + (x[i * len_c + ci] - mean[ci]) * invstd[ci] * + invstd[ci] * sum_dy_xmu_temp) * + weight[ci] * invstd[ci]; + } + } + } +} + +void SyncBatchnormBackwardElemtV2Executor::paramCheck() { + GTEST_CHECK(parser_->getInputNum() == 7 || parser_->getInputNum() == 8, + "SyncBatchnormBackwardElemtV2Executor: input number is wrong."); + GTEST_CHECK(parser_->getOutputNum() == 1, + "SyncBatchnormBackwardElemtV2Executor: output number is wrong."); +} + +void SyncBatchnormBackwardElemtV2Executor::compute() { + mluOpTensorDescriptor_t x_desc, diff_y_desc, diff_x_desc, count_desc; + mluOpTensorDescriptor_t mean_desc, invstd_desc, weight_desc, sum_dy_desc, + sum_dy_xmu_desc; + + diff_y_desc = tensor_desc_[0].tensor; + x_desc = tensor_desc_[1].tensor; + mean_desc = tensor_desc_[2].tensor; + invstd_desc = tensor_desc_[3].tensor; + if (parser_->getInputNum() == 8) { + weight_desc = tensor_desc_[4].tensor; + sum_dy_desc = tensor_desc_[5].tensor; + sum_dy_xmu_desc = tensor_desc_[6].tensor; + count_desc = tensor_desc_[7].tensor; + diff_x_desc = tensor_desc_[8].tensor; + } else { + weight_desc = nullptr; + sum_dy_desc = 
tensor_desc_[4].tensor; + sum_dy_xmu_desc = tensor_desc_[5].tensor; + count_desc = tensor_desc_[6].tensor; + diff_x_desc = tensor_desc_[7].tensor; + } + + void *dev_diff_y = data_vector_[0].device_ptr; + void *dev_x = data_vector_[1].device_ptr; + void *dev_mean = data_vector_[2].device_ptr; + void *dev_invstd = data_vector_[3].device_ptr; + void *dev_weight = nullptr; + void *dev_sum_dy = nullptr; + void *dev_sum_dy_xmu = nullptr; + void *dev_count = nullptr; + void *dev_diff_x = nullptr; + if (parser_->getInputNum() == 8) { + dev_weight = data_vector_[4].device_ptr; + dev_sum_dy = data_vector_[5].device_ptr; + dev_sum_dy_xmu = data_vector_[6].device_ptr; + dev_count = data_vector_[7].device_ptr; + dev_diff_x = data_vector_[8].device_ptr; + } else { + dev_sum_dy = data_vector_[4].device_ptr; + dev_sum_dy_xmu = data_vector_[5].device_ptr; + dev_count = data_vector_[6].device_ptr; + dev_diff_x = data_vector_[7].device_ptr; + } + + VLOG(4) << "Start to run mluOpSyncBatchnormBackwardElemt_v2()."; + interface_timer_.start(); + MLUOP_CHECK(mluOpSyncBatchNormBackwardElemtV2( + handle_, diff_y_desc, dev_diff_y, x_desc, dev_x, mean_desc, dev_mean, + invstd_desc, dev_invstd, weight_desc, dev_weight, sum_dy_desc, dev_sum_dy, + sum_dy_xmu_desc, dev_sum_dy_xmu, count_desc, dev_count, diff_x_desc, + dev_diff_x)); + interface_timer_.stop(); + VLOG(4) << "mluOpSyncBatchnormBackwardElemt_v2() end"; +} + +void SyncBatchnormBackwardElemtV2Executor::cpuCompute() { + int len_x = parser_->getInputDataCount(0); + int len_c = tensor_desc_[0].tensor->dims[tensor_desc_[0].tensor->dim - 1]; + int len_n = tensor_desc_[0].tensor->dims[0]; + + if (len_x == 0 || len_c == 0) { + VLOG(4) << "SyncBatchnormBackwardElemtV2Executor: cpu compute zero elemt"; + return; + } + + VLOG(4) << "Start to run cpuSyncBatchnormBackwardElemt()."; + + float *cpu_diff_y = cpu_fp32_input_[0]; + float *cpu_x = cpu_fp32_input_[1]; + float *cpu_mean = cpu_fp32_input_[2]; + float *cpu_invstd = cpu_fp32_input_[3]; + 
float *cpu_weight = nullptr; + float *cpu_sum_dy = nullptr; + float *cpu_sum_dy_xmu = nullptr; + float *cpu_count = nullptr; + float *cpu_diff_x = cpu_fp32_output_[0]; + if (parser_->getInputNum() == 8) { + cpu_weight = cpu_fp32_input_[4]; + cpu_sum_dy = cpu_fp32_input_[5]; + cpu_sum_dy_xmu = cpu_fp32_input_[6]; + cpu_count = cpu_fp32_input_[7]; + } else { + cpu_sum_dy = cpu_fp32_input_[4]; + cpu_sum_dy_xmu = cpu_fp32_input_[5]; + cpu_count = cpu_fp32_input_[6]; + } + int sum = 0; + for (int k = 0; k < len_n; k++) { + sum += (int32_t)(cpu_count[k]); + } + + cpuSyncBatchnormBackwardElemt(cpu_diff_y, cpu_x, cpu_mean, cpu_invstd, + cpu_weight, cpu_sum_dy, cpu_sum_dy_xmu, sum, + cpu_diff_x, len_x, len_c); + VLOG(4) << "cpuSyncBatchnormBackwardElemt() end"; +} + +int64_t SyncBatchnormBackwardElemtV2Executor::getTheoryOps() { + int64_t theory_ops = 0; + int len_x = parser_->getInputDataCount(0); + int len_c = tensor_desc_[0].tensor->dims[tensor_desc_[0].tensor->dim - 1]; + if (parser_->getInputNum() == 7) { + theory_ops = 5 * len_x + 3 * len_c; + } else { + theory_ops = 5 * len_x + 2 * len_c; + } + + VLOG(4) << "SyncBatchnormBackwardElemtV2Executor: getTheoryOps: " + << theory_ops << " ops"; + return theory_ops; +} + +} // namespace mluoptest diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.h b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.h new file mode 100644 index 000000000..6972d2b5d --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.h @@ -0,0 +1,47 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ + +#ifndef TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_BACKWARD_ELEMT_V2_\ +SYNC_BATCHNORM_BACKWARD_ELEMT_V2_H_ +#define TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_BACKWARD_ELEMT_V2_\ +SYNC_BATCHNORM_BACKWARD_ELEMT_V2_H_ + +#include "executor.h" + +namespace mluoptest { + +class SyncBatchnormBackwardElemtV2Executor : public Executor { + public: + SyncBatchnormBackwardElemtV2Executor() {} + ~SyncBatchnormBackwardElemtV2Executor() {} + + void paramCheck(); + void compute(); + void cpuCompute(); + int64_t getTheoryOps() override; +}; + +} // namespace mluoptest + +#endif // TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_BACKWARD_ELEMT_V2_\ +SYNC_BATCHNORM_BACKWARD_ELEMT_V2_H_ diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/test_case/case_0.prototxt b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/test_case/case_0.prototxt new file mode 100644 index 000000000..b6a711532 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_elemt_v2/test_case/case_0.prototxt @@ -0,0 +1,138 @@ +op_name: "sync_batchnorm_backward_elemt_v2" +op_type: "SYNC_BATCHNORM_BACKWARD_ELEMT_V2" +input { + id: "diff_y" + shape: { + dims: 1 + dims: 10 + dims: 128 + dims: 128 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: -2.0 + lower_bound: 2.0 + distribution: UNIFORM + } +} +input { + id: "x" + shape: { + dims: 1 + dims: 10 + dims: 128 + dims: 128 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 243 + upper_bound: 2.0 + lower_bound: -2.0 + distribution: UNIFORM + } +} +input { + id: "mean" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: 0.5 + lower_bound: -0.25 + distribution: UNIFORM + } +} +input { + id: "invstd" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + 
random_data: { + seed: 233 + upper_bound: 0.5 + lower_bound: -0.5 + distribution: UNIFORM + } +} +input { + id: "weight" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: 1.0 + lower_bound: 0.5 + distribution: UNIFORM + } +} +input { + id: "sum_dy" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: 1.0 + lower_bound: 0.5 + distribution: UNIFORM + } +} +input { + id: "sum_dy_xmu" + shape: { + dims: 128 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 233 + upper_bound: 1.0 + lower_bound: 0.5 + distribution: UNIFORM + } +} +input { + id: "count" + shape: { + dims: 1 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_INT32 + random_data: { + seed: 233 + upper_bound: 22 + lower_bound: 2 + distribution: UNIFORM + } +} +output { + id: "diff_x" + shape: { + dims: 1 + dims: 10 + dims: 128 + dims: 128 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT +} +test_param: { + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.003 + error_threshold: 0.003 + baseline_device: CPU +} diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp new file mode 100644 index 000000000..3b0fc7216 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp @@ -0,0 +1,296 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "sync_batchnorm_backward_reduce.h" + +namespace mluoptest { + +void SyncBatchnormBackwardReduceExecutor::paramCheck() { + GTEST_CHECK(parser_->node()->has_sync_batchnorm_backward_reduce_param(), + "Lose sync_batchnorm_backward_reduce param."); +} + +void SyncBatchnormBackwardReduceExecutor::workspaceMalloc() { + auto tensor_x = tensor_desc_[1].tensor; + void *tmp = nullptr; + // allocate extra nram space for deletion of CDMA + MLUOP_CHECK(mluOpGetSyncBatchnormBackwardReduceWorkspaceSize( + handle_, tensor_x, &workspace_size_)); + if (workspace_size_ > 0) { + VLOG(4) << "Malloc workspace space for deletion of CDMA."; + tmp = mlu_runtime_.allocate(workspace_size_); + VLOG(4) << "Mallocated addr: " << tmp << ", size: " << workspace_size_; + } else { + VLOG(4) << "Don't need to Malloc workspace space."; + } + workspace_.push_back(tmp); + eva_->setMluWorkspaceSize(workspace_size_); +} + +void SyncBatchnormBackwardReduceExecutor::workspaceFree() { + if (workspace_[0]) { + VLOG(4) << "Free device workspace space."; + mlu_runtime_.deallocate(workspace_[0]); + } +} + +void SyncBatchnormBackwardReduceExecutor::compute() { + const bool needs_input_grad0 = parser_->getProtoNode() + ->sync_batchnorm_backward_reduce_param() + .needs_input_grad0(); + const bool needs_input_grad1 = parser_->getProtoNode() + ->sync_batchnorm_backward_reduce_param() + .needs_input_grad1(); + const bool needs_input_grad2 = parser_->getProtoNode() + ->sync_batchnorm_backward_reduce_param() + .needs_input_grad2(); + // input tensor description + mluOpTensorDescriptor_t desc_dz = tensor_desc_[0].tensor; + mluOpTensorDescriptor_t desc_x = tensor_desc_[1].tensor; + mluOpTensorDescriptor_t desc_mean = tensor_desc_[2].tensor; + mluOpTensorDescriptor_t desc_invstd = tensor_desc_[3].tensor; + mluOpTensorDescriptor_t desc_sum_dy = NULL; + mluOpTensorDescriptor_t desc_sum_dy_xmu = NULL; + mluOpTensorDescriptor_t desc_dweight 
= NULL; + mluOpTensorDescriptor_t desc_dbias = NULL; + + if (needs_input_grad0 == 1 && needs_input_grad1 == 0 && + needs_input_grad2 == 0) { + GTEST_CHECK(parser_->outputs().size() == 2, + "[Output MISMATCHED]: Only sum_dy and sum_dy_xmu will be " + "compute currently."); + } + if (needs_input_grad0 == 0 && needs_input_grad1 == 1 && + needs_input_grad2 == 0) { + GTEST_CHECK(parser_->outputs().size() == 1, + "[Output MISMATCHED]: Only dweight will be compute currently."); + } + if (needs_input_grad0 == 0 && needs_input_grad1 == 0 && + needs_input_grad2 == 1) { + GTEST_CHECK(parser_->outputs().size() == 1, + "[Output MISMATCHED]: Only dbias will be compute currently."); + } + if (needs_input_grad0 == 1 && needs_input_grad1 == 1 && + needs_input_grad2 == 0) { + GTEST_CHECK(parser_->outputs().size() == 3, + "[Output MISMATCHED]: Only sum_dy, sum_dy_xmu, dweight will be " + "compute currently."); + } + if (needs_input_grad0 == 1 && needs_input_grad1 == 0 && + needs_input_grad2 == 1) { + GTEST_CHECK(parser_->outputs().size() == 3, + "[Output MISMATCHED]: Only sum_dy, sum_dy_xmu, dbias will be " + "compute currently."); + } + if (needs_input_grad0 == 0 && needs_input_grad1 == 1 && + needs_input_grad2 == 1) { + GTEST_CHECK(parser_->outputs().size() == 2, + "[Output MISMATCHED]: Only dweight and dbias will be compute " + "currently."); + } + if (needs_input_grad0 == 1 && needs_input_grad1 == 1 && + needs_input_grad2 == 1) { + GTEST_CHECK(parser_->outputs().size() == 4, + "[Output MISMATCHED]: All of the four outputs will be compute " + "currently."); + } + // input pointer for device + void *dev_dz = data_vector_[0].device_ptr; + void *dev_x = data_vector_[1].device_ptr; + void *dev_mean = data_vector_[2].device_ptr; + void *dev_invstd = data_vector_[3].device_ptr; + void *dev_sum_dy = NULL; + void *dev_sum_dy_xmu = NULL; + void *dev_dweight = NULL; + void *dev_dbias = NULL; + + if (needs_input_grad0) { + desc_sum_dy = tensor_desc_[5].tensor; + desc_sum_dy_xmu = 
tensor_desc_[6].tensor; + dev_sum_dy = data_vector_[5].device_ptr; + dev_sum_dy_xmu = data_vector_[6].device_ptr; + if (needs_input_grad1) { + desc_dweight = tensor_desc_[7].tensor; + dev_dweight = data_vector_[7].device_ptr; + if (needs_input_grad2) { + desc_dbias = tensor_desc_[8].tensor; + dev_dbias = data_vector_[8].device_ptr; + } + } else { + if (needs_input_grad2) { + desc_dbias = tensor_desc_[7].tensor; + dev_dbias = data_vector_[7].device_ptr; + } + } + } else { + if (needs_input_grad1) { + desc_dweight = tensor_desc_[5].tensor; + dev_dweight = data_vector_[5].device_ptr; + if (needs_input_grad2) { + desc_dbias = tensor_desc_[6].tensor; + dev_dbias = data_vector_[6].device_ptr; + } + } else { + if (needs_input_grad2) { + desc_dbias = tensor_desc_[5].tensor; + dev_dbias = data_vector_[5].device_ptr; + } + } + } + + VLOG(4) << "Start to run mluOpSyncBatchNormBackwardReduce()."; + interface_timer_.start(); +#if 1 + VLOG(4) << "launch mluOpSyncBatchnormBackwardReduce_v2."; + MLUOP_CHECK(mluOpSyncBatchnormBackwardReduce_v2( + handle_, desc_dz, dev_dz, desc_x, dev_x, desc_mean, dev_mean, desc_invstd, + dev_invstd, workspace_[0], workspace_size_, desc_dweight, dev_dweight, + desc_dbias, dev_dbias, desc_sum_dy, dev_sum_dy, desc_sum_dy_xmu, + dev_sum_dy_xmu, needs_input_grad0, needs_input_grad1, needs_input_grad2)); +#else + VLOG(4) << "launch mluOpSyncBatchnormBackwardReduce."; + MLUOP_CHECK(mluOpSyncBatchnormBackwardReduce( + handle_, desc_dz, dev_dz, desc_x, dev_x, desc_mean, dev_mean, desc_invstd, + dev_invstd, desc_dweight, dev_dweight, desc_dbias, dev_dbias, desc_sum_dy, + dev_sum_dy, desc_sum_dy_xmu, dev_sum_dy_xmu, needs_input_grad0, + needs_input_grad1, needs_input_grad2)); +#endif + + interface_timer_.stop(); +} + +void cpuGetSyncBnBkwReduceOuput( + const float *x, const float *diff_z, const float *mean, const float *invstd, + float *diff_weight, float *diff_bias, float *sum_dy, float *sum_dy_xmu, + const int len_x, const int len_c, const bool 
needs_input_grad0, + const bool needs_input_grad1, const bool needs_input_grad2) { + if (len_x == 0 || len_c == 0) { + LOG(ERROR) << "SyncBnBackwardReduce: the element number of input tensor " + "should not be zero"; + return; + } + int len_nhw = len_x / len_c; + float *x_hat = new float[len_x]; + float *xmu = new float[len_x]; + + for (int ci = 0; ci < len_c; ++ci) { + const float *xc = x + ci; + float *x_hat_c = x_hat + ci; + float *xmu_c = xmu + ci; + for (int xi = 0; xi < len_nhw; ++xi) { + xmu_c[xi * len_c] = xc[xi * len_c] - mean[ci]; + x_hat_c[xi * len_c] = xmu_c[xi * len_c] * invstd[ci]; + } + } + + for (int ci = 0; ci < len_c; ++ci) { + const float *x_hat_c = x_hat + ci; + const float *xmu_c = xmu + ci; + const float *dzc = diff_z + ci; + double dweight = 0, dbias = 0, meandyxmu = 0; + for (int i = 0; i < len_nhw; i++) { + dweight = dweight + x_hat_c[i * len_c] * dzc[i * len_c]; + dbias = dbias + dzc[i * len_c]; + meandyxmu = meandyxmu + xmu_c[i * len_c] * dzc[i * len_c]; + } + if (needs_input_grad0 == true) { + // diff_weight[ci] = dweight; + // diff_bias[ci] = dbias; + sum_dy[ci] = dbias; + sum_dy_xmu[ci] = meandyxmu; + } + if (needs_input_grad1 == true) { + diff_weight[ci] = dweight; + } + if (needs_input_grad2 == true) { + diff_bias[ci] = dbias; + } + } + delete[] x_hat; + delete[] xmu; +} + +void SyncBatchnormBackwardReduceExecutor::cpuCompute() { + int len_c = tensor_desc_[0].tensor->dims[tensor_desc_[0].tensor->dim - 1]; + int len_x = parser_->getInputDataCount(0); + const bool needs_input_grad0 = parser_->getProtoNode() + ->sync_batchnorm_backward_reduce_param() + .needs_input_grad0(); + const bool needs_input_grad1 = parser_->getProtoNode() + ->sync_batchnorm_backward_reduce_param() + .needs_input_grad1(); + const bool needs_input_grad2 = parser_->getProtoNode() + ->sync_batchnorm_backward_reduce_param() + .needs_input_grad2(); + + auto tensor_dz = cpu_fp32_input_[0]; + auto tensor_x = cpu_fp32_input_[1]; + auto tensor_mean = cpu_fp32_input_[2]; + 
auto tensor_invstd = cpu_fp32_input_[3]; + + auto tensor_sum_dy = cpu_fp32_output_[0]; + auto tensor_sum_dy_xmu = cpu_fp32_output_[1]; + auto tensor_dweight = cpu_fp32_output_[2]; + auto tensor_dbias = cpu_fp32_output_[3]; + if (needs_input_grad0) { + tensor_sum_dy = cpu_fp32_output_[0]; + tensor_sum_dy_xmu = cpu_fp32_output_[1]; + if (needs_input_grad1) { + tensor_dweight = cpu_fp32_output_[2]; + if (needs_input_grad2) { + tensor_dbias = cpu_fp32_output_[3]; + } + } else { + if (needs_input_grad2) { + tensor_dbias = cpu_fp32_output_[2]; + } + } + } else { + if (needs_input_grad1) { + tensor_dweight = cpu_fp32_output_[0]; + if (needs_input_grad2) { + tensor_dbias = cpu_fp32_output_[1]; + } + } else { + if (needs_input_grad2) { + tensor_dbias = cpu_fp32_output_[0]; + } + } + } + + // const bool needs_input_grad[3] = {1,1,1}; + // call the cup compute function to get:-> grad weight, grad bias, sum_dy, + // sum_dy_xmu + cpuGetSyncBnBkwReduceOuput(tensor_x, tensor_dz, tensor_mean, tensor_invstd, + tensor_dweight, tensor_dbias, tensor_sum_dy, + tensor_sum_dy_xmu, len_x, len_c, needs_input_grad0, + needs_input_grad1, needs_input_grad2); +} + +int64_t SyncBatchnormBackwardReduceExecutor::getTheoryOps() { + int cp_count = 8; + int64_t theory_ops = parser_->getOutputDataCount(0) * cp_count; + VLOG(4) << "getTheoryOps: " << theory_ops << " ops"; + return theory_ops; +} + +} // namespace mluoptest diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.h b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.h new file mode 100644 index 000000000..52eb32a3b --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.h @@ -0,0 +1,48 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#ifndef TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_BACKWARD_REDUCE_\ +SYNC_BATCHNORM_BACKWARD_REDUCE_H_ +#define TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_BACKWARD_REDUCE_\ +SYNC_BATCHNORM_BACKWARD_REDUCE_H_ +#include "executor.h" + +namespace mluoptest { +class SyncBatchnormBackwardReduceExecutor : public Executor { + public: + SyncBatchnormBackwardReduceExecutor() {} + ~SyncBatchnormBackwardReduceExecutor() {} + + void paramCheck(); + void workspaceMalloc(); + void workspaceFree(); + void compute(); + void cpuCompute(); + int64_t getTheoryOps() override; + + private: + size_t workspace_size_ = 0; +}; + +} // namespace mluoptest +#endif // TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_BACKWARD_REDUCE_\ +SYNC_BATCHNORM_BACKWARD_REDUCE_H_ diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/test_case/case_0.prototxt b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/test_case/case_0.prototxt new file mode 100644 index 000000000..d62f473f3 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/test_case/case_0.prototxt @@ -0,0 +1,122 @@ +op_name: "sync_batchnorm_backward_reduce" +op_type: "SYNC_BATCHNORM_BACKWARD_REDUCE" +input{ + id:"dz" + shape:{ + dims: 55 + dims: 14 + dims: 14 + dims: 2000 + } + layout:LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data:{ + seed:4 + upper_bound:5.5 + lower_bound:0.1 + distribution:UNIFORM + } +} +input{ + id:"x" + shape:{ + dims: 55 + dims: 14 + dims: 14 + dims: 2000 + } + layout:LAYOUT_NHWC + dtype:DTYPE_FLOAT + random_data:{ + seed:233 + upper_bound:5 + lower_bound:0.5 + distribution: UNIFORM + } +} +input{ + id:"mean" + shape:{ + dims: 2000 + } + layout:LAYOUT_ARRAY + dtype:DTYPE_FLOAT + random_data:{ + seed:233 + upper_bound:5 + lower_bound:1 + distribution: UNIFORM + } +} +input{ + id:"invstd" + shape:{ + dims: 2000 + } + layout:LAYOUT_ARRAY + 
dtype:DTYPE_FLOAT + random_data:{ + seed:233 + upper_bound:8 + lower_bound:0.8 + distribution: UNIFORM + } +} +input{ + id:"weight" + shape:{ + dims: 2000 + } + layout:LAYOUT_ARRAY + dtype:DTYPE_FLOAT + random_data:{ + seed:233 + upper_bound:8 + lower_bound:0.8 + distribution: UNIFORM + } +} +output{ + id:"sum_dy" + shape:{ + dims:2000 + } + layout:LAYOUT_ARRAY + dtype:DTYPE_FLOAT +} +output{ + id: "sum_dy_xmu" + shape:{ + dims:2000 + } + layout:LAYOUT_ARRAY + dtype:DTYPE_FLOAT +} +output{ + id: "dweight" + shape:{ + dims:2000 + } + layout:LAYOUT_ARRAY + dtype:DTYPE_FLOAT +} +output{ + id: "dbias" + shape:{ + dims:2000 + } + layout:LAYOUT_ARRAY + dtype:DTYPE_FLOAT +} +sync_batchnorm_backward_reduce_param: { + needs_input_grad0: true + needs_input_grad1: true + needs_input_grad2: true +} +test_param:{ + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.003 + error_threshold: 0.003 + baseline_device: CPU +} diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp new file mode 100644 index 000000000..c1db5a4a6 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp @@ -0,0 +1,118 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "sync_batchnorm_elemt.h" + +namespace mluoptest { + +void SyncBatchnormElemtExecutor::paramCheck() { + GTEST_CHECK(parser_->getInputNum() == 3 || parser_->getInputNum() == 5, + "SyncBatchnormElemtExecutor: input number is wrong."); + GTEST_CHECK(parser_->getOutputNum() == 1, + "SyncBatchnormElemtExecutor: output number is wrong."); +} + +void SyncBatchnormElemtExecutor::compute() { + VLOG(4) << "SyncBatchnormElemtExecutor compute begin"; + auto x_desc = tensor_desc_[0].tensor; + auto dev_x = data_vector_[0].device_ptr; + auto mean_desc = tensor_desc_[1].tensor; + auto dev_mean = data_vector_[1].device_ptr; + auto invstd_desc = tensor_desc_[2].tensor; + auto dev_invstd = data_vector_[2].device_ptr; + + if (parser_->getInputNum() == 3) { + auto y_desc = tensor_desc_[3].tensor; + auto dev_y = data_vector_[3].device_ptr; + interface_timer_.start(); + MLUOP_CHECK(mluOpSyncBatchNormElemt( + handle_, x_desc, dev_x, mean_desc, dev_mean, invstd_desc, dev_invstd, + nullptr, nullptr, nullptr, nullptr, y_desc, dev_y)); + interface_timer_.stop(); + } else if (parser_->getInputNum() == 5) { + auto weight_desc = tensor_desc_[3].tensor; + auto dev_weight = data_vector_[3].device_ptr; + auto bias_desc = tensor_desc_[4].tensor; + auto dev_bias = data_vector_[4].device_ptr; + auto y_desc = tensor_desc_[5].tensor; + auto dev_y = data_vector_[5].device_ptr; + interface_timer_.start(); + MLUOP_CHECK(mluOpSyncBatchNormElemt( + handle_, x_desc, dev_x, mean_desc, dev_mean, invstd_desc, dev_invstd, + weight_desc, dev_weight, bias_desc, dev_bias, y_desc, dev_y)); + interface_timer_.stop(); + } + VLOG(4) << "SyncBatchnormElemtExecutor compute end"; +} + +void cpuSyncBNElemt(const float *x, const float *mean, const float *invstd, + float *weight, float *bias, float *y, const int len_x, + const int len_c) { + int len_nhw = len_x / len_c; + + for (int h = 0; h < len_nhw; ++h) { + for (int c = 0; c < len_c; 
++c) { + y[h * len_c + c] = (x[h * len_c + c] - mean[c]) * invstd[c]; + if (weight != nullptr && bias != nullptr) { + y[h * len_c + c] = y[h * len_c + c] * weight[c] + bias[c]; + } + } + } +} + +void SyncBatchnormElemtExecutor::cpuCompute() { + int len_c = tensor_desc_[0].tensor->dims[tensor_desc_[0].tensor->dim - 1]; + int len_x = parser_->getInputDataCount(0); + + VLOG(4) << "SyncBatchnormElemtExecutor: cpu compute begin"; + // actually len_c = 0, then len_x must be 0 + if (len_c == 0 || len_x == 0) { + VLOG(4) << "SyncBatchnormElemtExecutor: cpu compute zero elemt"; + return; + } + + if (parser_->getInputNum() == 3) { + VLOG(4) << "weight and bias is nullptr"; + cpuSyncBNElemt(cpu_fp32_input_[0], cpu_fp32_input_[1], cpu_fp32_input_[2], + nullptr, nullptr, cpu_fp32_output_[0], len_x, len_c); + } else if (parser_->getInputNum() == 5) { + cpuSyncBNElemt(cpu_fp32_input_[0], cpu_fp32_input_[1], cpu_fp32_input_[2], + cpu_fp32_input_[3], cpu_fp32_input_[4], cpu_fp32_output_[0], + len_x, len_c); + } + VLOG(4) << "SyncBatchnormElemtExecutor: cpu compute end"; +} + +int64_t SyncBatchnormElemtExecutor::getTheoryOps() { + int64_t theory_ops = 0; + int len_x = parser_->getInputDataCount(0); + if (parser_->getInputNum() == 3) { + theory_ops = len_x * 2; + } else if (parser_->getInputNum() == 5) { + theory_ops = len_x * 4; + } + VLOG(4) << "SyncBatchnormElemtExecutor: getTheoryOps: " << theory_ops + << " ops"; + return theory_ops; +} + +} // namespace mluoptest diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/sync_batchnorm_elemt.h b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/sync_batchnorm_elemt.h new file mode 100644 index 000000000..270877072 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/sync_batchnorm_elemt.h @@ -0,0 +1,45 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ + +#ifndef TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_ELEMT_SYNC_BATCHNORM_ELEMT_H_ +#define TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_ELEMT_SYNC_BATCHNORM_ELEMT_H_ + +#include "executor.h" + +namespace mluoptest { + +class SyncBatchnormElemtExecutor : public Executor { + public: + SyncBatchnormElemtExecutor() {} + ~SyncBatchnormElemtExecutor() {} + + void paramCheck(); + void compute(); + void cpuCompute(); + int64_t getTheoryOps() override; +}; + +} // namespace mluoptest + +#endif // TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_ELEMT_\ +SYNC_BATCHNORM_ELEMT_H_ diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/test_case/case_0.prototxt b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/test_case/case_0.prototxt new file mode 100644 index 000000000..45772bbba --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_elemt/test_case/case_0.prototxt @@ -0,0 +1,93 @@ +op_name: "sync_batchnorm_elemt" +op_type: "SYNC_BATCHNORM_ELEMT" +input { + id: "x" + shape: { + dims: 4 + dims: 14 + dims: 14 + dims: 1025 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 32 + upper_bound: 2.0 + lower_bound: 1.0 + distribution: UNIFORM + } +} +input { + id: "mean" + shape: { + dims: 1025 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 35 + upper_bound: 1.5 + lower_bound: 0.5 + distribution: UNIFORM + } +} +input { + id: "invstd" + shape: { + dims: 1025 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 36 + upper_bound: 1.25 + lower_bound: 0.25 + distribution: UNIFORM + } +} +input { + id: "weight" + shape: { + dims: 1025 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 33 + upper_bound: 2.0 + lower_bound: -2.0 + distribution: UNIFORM + } +} +input { + id: "bias" + shape: { + dims: 1025 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 
34 + upper_bound: 1.0 + lower_bound: -1.0 + distribution: UNIFORM + } +} +output { + id: "y" + shape: { + dims: 4 + dims: 14 + dims: 14 + dims: 1025 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT +} +test_param: { + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.003 + error_threshold: 0.003 + baseline_device: CPU +} diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp new file mode 100644 index 000000000..3582585f9 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp @@ -0,0 +1,210 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include "sync_batchnorm_gather_stats_with_counts.h" + +namespace mluoptest { + +void SyncBatchnormGatherStatsWithCountsExecutor::paramCheck() { + GTEST_CHECK(parser_->getProtoNode() + ->has_sync_batchnorm_gather_stats_with_counts_param(), + "Lose sync_batchnorm_gather_stats_with_counts param."); + + // set flag + flag_input_reuse_ = false; +} + +void SyncBatchnormGatherStatsWithCountsExecutor::compute() { + float eps = parser_->getProtoNode() + ->sync_batchnorm_gather_stats_with_counts_param() + .eps(); + float momentum = parser_->getProtoNode() + ->sync_batchnorm_gather_stats_with_counts_param() + .momentum(); + + mluOpTensorDescriptor_t mean_all_desc; + mluOpTensorDescriptor_t invstd_all_desc; + mluOpTensorDescriptor_t count_all_desc; + mean_all_desc = tensor_desc_[1].tensor; + invstd_all_desc = tensor_desc_[2].tensor; + + // if num_inputs = 4, then [input, mean_all, invstd_all, count_all] -> [mean, + // invstd] if num_inputs = 6, then [input, mean_all, invstd_all, moving_mean, + // moving_var, count_all] + // -> [moving_mean, moving_var, mean, invstd] + VLOG(4) << "Start to run mluOpSyncBatchNormGatherStatsWithCounts()."; + if (parser_->getInputNum() == 4) { + count_all_desc = tensor_desc_[3].tensor; + mluOpTensorDescriptor_t mean_desc = tensor_desc_[4].tensor; + mluOpTensorDescriptor_t invstd_desc = tensor_desc_[5].tensor; + interface_timer_.start(); + MLUOP_CHECK(mluOpSyncBatchNormGatherStatsWithCounts( + handle_, mean_all_desc, data_vector_[1].device_ptr, invstd_all_desc, + data_vector_[2].device_ptr, nullptr, nullptr, nullptr, nullptr, + momentum, eps, count_all_desc, data_vector_[3].device_ptr, mean_desc, + 
data_vector_[4].device_ptr, invstd_desc, data_vector_[5].device_ptr)); + interface_timer_.stop(); + } else if (parser_->getInputNum() == 6) { + mluOpTensorDescriptor_t moving_mean_desc = tensor_desc_[3].tensor; + mluOpTensorDescriptor_t moving_var_desc = tensor_desc_[4].tensor; + count_all_desc = tensor_desc_[5].tensor; + if (parser_->getOutputNum() == 2) { + mluOpTensorDescriptor_t mean_desc = tensor_desc_[6].tensor; + mluOpTensorDescriptor_t invstd_desc = tensor_desc_[7].tensor; + interface_timer_.start(); + MLUOP_CHECK(mluOpSyncBatchNormGatherStatsWithCounts( + handle_, mean_all_desc, data_vector_[1].device_ptr, invstd_all_desc, + data_vector_[2].device_ptr, moving_mean_desc, + data_vector_[3].device_ptr, moving_var_desc, + data_vector_[4].device_ptr, momentum, eps, count_all_desc, + data_vector_[5].device_ptr, mean_desc, data_vector_[6].device_ptr, + invstd_desc, data_vector_[7].device_ptr)); + interface_timer_.stop(); + } else { + mluOpTensorDescriptor_t mean_desc = tensor_desc_[8].tensor; + mluOpTensorDescriptor_t invstd_desc = tensor_desc_[9].tensor; + interface_timer_.start(); + MLUOP_CHECK(mluOpSyncBatchNormGatherStatsWithCounts( + handle_, mean_all_desc, data_vector_[1].device_ptr, invstd_all_desc, + data_vector_[2].device_ptr, moving_mean_desc, + data_vector_[3].device_ptr, moving_var_desc, + data_vector_[4].device_ptr, momentum, eps, count_all_desc, + data_vector_[5].device_ptr, mean_desc, data_vector_[8].device_ptr, + invstd_desc, data_vector_[9].device_ptr)); + interface_timer_.stop(); + + data_vector_[3].is_output = true; + data_vector_[4].is_output = true; + data_vector_[6].is_output = false; + data_vector_[7].is_output = false; + } + } +} + +void kahan(float input, float &sum, float &delta) { + float y = input - delta; + float t = sum + y; + delta = t - sum - y; + sum = t; +} + +void cpuBatchNormForwardTraining(float *mean_all, float *invstd_all, + float *moving_mean, float *moving_var, + const float momentum, const float eps, + float *count_all, 
float *m_mean, float *m_var, + float *mean, float *invstd, + const int len_mean_all, const int len_c, + const int output_num) { + int len_n = len_mean_all / len_c; + int len_all = 0; + for (int i = 0; i < len_n; ++i) { + len_all += count_all[i]; + } + + // B.P.Welford algo + for (int ci = 0; ci < len_c; ++ci) { + float c_sum = 0.0, c_ssum = 0.0; + const float *meanc = mean_all + ci; + const float *invstdc = invstd_all + ci; + float sum = 0.0, ssum = 0.0, temp = 0.0; + for (int xi = 0; xi < len_n; ++xi) { + kahan(meanc[xi * len_c] * count_all[xi], sum, c_sum); + temp = 1.0f / (invstdc[xi * len_c] * invstdc[xi * len_c]) + + meanc[xi * len_c] * meanc[xi * len_c] - eps; + kahan(temp * count_all[xi], ssum, c_ssum); + } + mean[ci] = sum / len_all; + invstd[ci] = 1.0f / sqrt(ssum / len_all - mean[ci] * mean[ci] + eps); + float unbiased_var = + (1.0f / (invstd[ci] * invstd[ci]) - eps) * len_all / (len_all - 1); + if (moving_mean != nullptr && moving_var != nullptr && output_num == 4) { + m_mean[ci] = momentum * mean[ci] + (1 - momentum) * moving_mean[ci]; + m_var[ci] = momentum * unbiased_var + (1 - momentum) * moving_var[ci]; + } + } +} + +void SyncBatchnormGatherStatsWithCountsExecutor::cpuCompute() { + float eps = parser_->getProtoNode() + ->sync_batchnorm_gather_stats_with_counts_param() + .eps(); + float momentum = parser_->getProtoNode() + ->sync_batchnorm_gather_stats_with_counts_param() + .momentum(); + + int idx_c = tensor_desc_[0].tensor->dim - 1; + int len_c = tensor_desc_[0].tensor->dims[idx_c]; + int len_count_all = 1; + int len_mean_all = 1; + int len_invstd_all = 1; + if (parser_->getInputNum() == 4) { + len_count_all = tensor_desc_[3].tensor->dims[0]; + } else if (parser_->getInputNum() == 6) { + len_count_all = tensor_desc_[5].tensor->dims[0]; + } + for (int i = 0; i < tensor_desc_[1].tensor->dim; ++i) { + len_mean_all *= tensor_desc_[1].tensor->dims[i]; + } + for (int i = 0; i < tensor_desc_[2].tensor->dim; ++i) { + len_invstd_all *= 
tensor_desc_[2].tensor->dims[i]; + } + if (len_mean_all == 0 || len_c == 0 || len_count_all == 0 || + len_mean_all != len_invstd_all) { + return; + } + int output_num = parser_->getOutputNum(); + VLOG(4) << "Start to run cpuBatchNormForwardTraining()."; + if (parser_->getInputNum() == 4) { + cpuBatchNormForwardTraining( + cpu_fp32_input_[1], cpu_fp32_input_[2], nullptr, nullptr, momentum, eps, + cpu_fp32_input_[3], nullptr, nullptr, cpu_fp32_output_[0], + cpu_fp32_output_[1], len_mean_all, len_c, output_num); + } else if (parser_->getInputNum() == 6) { + if (parser_->getOutputNum() == 2) { + cpuBatchNormForwardTraining( + cpu_fp32_input_[1], cpu_fp32_input_[2], cpu_fp32_input_[3], + cpu_fp32_input_[4], momentum, eps, cpu_fp32_input_[5], nullptr, + nullptr, cpu_fp32_output_[0], cpu_fp32_output_[1], len_mean_all, + len_c, output_num); + } else { + cpuBatchNormForwardTraining( + cpu_fp32_input_[1], cpu_fp32_input_[2], cpu_fp32_input_[3], + cpu_fp32_input_[4], momentum, eps, cpu_fp32_input_[5], + cpu_fp32_output_[0], cpu_fp32_output_[1], cpu_fp32_output_[2], + cpu_fp32_output_[3], len_mean_all, len_c, output_num); + } + } +} + +int64_t SyncBatchnormGatherStatsWithCountsExecutor::getTheoryOps() { + int cp_count = 8; + int64_t theory_ops = parser_->getOutputDataCount(0) * cp_count; + VLOG(4) << "getTheoryOps: " << theory_ops << " ops"; + return theory_ops; +} + +std::set +SyncBatchnormGatherStatsWithCountsExecutor::getCriterionsUse() const { + return {Evaluator::DIFF1, Evaluator::DIFF2, Evaluator::DIFF3}; +} + +} // namespace mluoptest diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.h b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.h new file mode 100644 index 000000000..7dfa7f001 --- /dev/null +++ 
b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.h @@ -0,0 +1,50 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ + +#ifndef TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_GATHER_STATS_WITH_COUNTS_\ +SYNC_BATCHNORM_GATHER_STATS_WITH_COUNTS_H_ +#define TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_GATHER_STATS_WITH_COUNTS_\ +SYNC_BATCHNORM_GATHER_STATS_WITH_COUNTS_H_ +#include +#include + +#include "executor.h" + +namespace mluoptest { + +class SyncBatchnormGatherStatsWithCountsExecutor : public Executor { + public: + SyncBatchnormGatherStatsWithCountsExecutor() {} + ~SyncBatchnormGatherStatsWithCountsExecutor() {} + + void paramCheck(); + void compute(); + void cpuCompute(); + int64_t getTheoryOps() override; + std::set getCriterionsUse() const override; +}; + +} // namespace mluoptest + +#endif // TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_GATHER_STATS_WITH_COUNTS\ +_SYNC_BATCHNORM_GATHER_STATS_WITH_COUNTS_H_ diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/test_case/case_0.prototxt b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/test_case/case_0.prototxt new file mode 100644 index 000000000..de057a5b2 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/test_case/case_0.prototxt @@ -0,0 +1,118 @@ +op_name: "sync_batchnorm_gather_stats_with_counts" +op_type: "SYNC_BATCHNORM_GATHER_STATS_WITH_COUNTS" +input { + id: "input" + shape: { + dims: 8 + dims: 8 + dims: 8 + dims: 2048 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 32 + upper_bound: 50 + lower_bound: -50 + distribution: UNIFORM + } +} +input { + id: "mean_all" + shape: { + dims: 8 + dims: 2048 + } + layout: LAYOUT_NC + dtype: DTYPE_FLOAT + random_data: { + seed: 32 + upper_bound: 50 + lower_bound: -50 + distribution: UNIFORM + } +} +input { + id: "invstd_all" + shape: { + dims: 8 + dims: 2048 + } + layout: LAYOUT_NC + dtype: DTYPE_FLOAT + random_data: { + seed: 33 + upper_bound: 100 + 
lower_bound: -100 + distribution: UNIFORM + } +} +input { + id: "moving_mean" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 35 + upper_bound: 0.5 + lower_bound: -0.5 + distribution: UNIFORM + } +} +input { + id: "moving_var" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 36 + upper_bound: 0.5 + lower_bound: 0.001 + distribution: UNIFORM + } +} +input { + id: "count_all" + shape: { + dims: 8 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 34 + upper_bound: 50 + lower_bound: 50 + distribution: UNIFORM + } +} +output { + id: "mean" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +output { + id: "invstd" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +sync_batchnorm_gather_stats_with_counts_param: { + eps: 0.00001 + momentum: 0.1 +} +test_param: { + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.003 + error_threshold: 0.003 + baseline_device: CPU +} diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/test_case/case_1.prototxt b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/test_case/case_1.prototxt new file mode 100644 index 000000000..0b389c8b4 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/test_case/case_1.prototxt @@ -0,0 +1,134 @@ +op_name: "sync_batchnorm_gather_stats_with_counts" +op_type: "SYNC_BATCHNORM_GATHER_STATS_WITH_COUNTS" +input { + id: "input" + shape: { + dims: 8 + dims: 8 + dims: 8 + dims: 2048 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 32 + upper_bound: 50 + lower_bound: -50 + distribution: UNIFORM + } +} +input { + id: "mean_all" + shape: { + dims: 8 + dims: 2048 + } + layout: LAYOUT_NC + dtype: DTYPE_FLOAT + random_data: { + seed: 32 + upper_bound: 50 + lower_bound: -50 + distribution: 
UNIFORM + } +} +input { + id: "invstd_all" + shape: { + dims: 8 + dims: 2048 + } + layout: LAYOUT_NC + dtype: DTYPE_FLOAT + random_data: { + seed: 33 + upper_bound: 100 + lower_bound: -100 + distribution: UNIFORM + } +} +input { + id: "moving_mean" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 35 + upper_bound: 0.5 + lower_bound: -0.5 + distribution: UNIFORM + } +} +input { + id: "moving_var" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 36 + upper_bound: 0.5 + lower_bound: 0.001 + distribution: UNIFORM + } +} +input { + id: "count_all" + shape: { + dims: 8 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 34 + upper_bound: 50 + lower_bound: 50 + distribution: UNIFORM + } +} +output { + id: "m_mean" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +output { + id: "m_var" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +output { + id: "mean" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +output { + id: "invstd" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +sync_batchnorm_gather_stats_with_counts_param: { + eps: 0.00001 + momentum: 0.1 +} +test_param: { + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.003 + error_threshold: 0.003 + baseline_device: CPU +} diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/test_case/case_2.prototxt b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/test_case/case_2.prototxt new file mode 100644 index 000000000..3758739ca --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_gather_stats_with_counts/test_case/case_2.prototxt @@ -0,0 +1,90 @@ +op_name: "sync_batchnorm_gather_stats_with_counts" +op_type: "SYNC_BATCHNORM_GATHER_STATS_WITH_COUNTS" +input { + id: "input" + shape: { + dims: 
8 + dims: 8 + dims: 8 + dims: 2048 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 32 + upper_bound: 50 + lower_bound: -50 + distribution: UNIFORM + } +} +input { + id: "mean_all" + shape: { + dims: 8 + dims: 2048 + } + layout: LAYOUT_NC + dtype: DTYPE_FLOAT + random_data: { + seed: 32 + upper_bound: 50 + lower_bound: -50 + distribution: UNIFORM + } +} +input { + id: "invstd_all" + shape: { + dims: 8 + dims: 2048 + } + layout: LAYOUT_NC + dtype: DTYPE_FLOAT + random_data: { + seed: 33 + upper_bound: 100 + lower_bound: -100 + distribution: UNIFORM + } +} +input { + id: "count_all" + shape: { + dims: 8 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT + random_data: { + seed: 34 + upper_bound: 50 + lower_bound: 50 + distribution: UNIFORM + } +} +output { + id: "mean" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +output { + id: "invstd" + shape: { + dims: 2048 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +sync_batchnorm_gather_stats_with_counts_param: { + eps: 0.00001 + momentum: 0.1 +} +test_param: { + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.003 + error_threshold: 0.003 + baseline_device: CPU +} diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/sync_batchnorm_stats.cpp b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/sync_batchnorm_stats.cpp new file mode 100644 index 000000000..e32d9d211 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/sync_batchnorm_stats.cpp @@ -0,0 +1,141 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "sync_batchnorm_stats.h" + +namespace mluoptest { + +void SyncBatchnormStatsExecutor::paramCheck() { + GTEST_CHECK(parser_->getProtoNode() + ->has_sync_batchnorm_stats_param(), + "Lose sync_batchnorm_stats param."); +} + +void SyncBatchnormStatsExecutor::workspaceMalloc() { + auto tensor_x = tensor_desc_[0].tensor; + void *tmp = nullptr; + // allocate extra nram space for deletion of CDMA + MLUOP_CHECK(mluOpGetSyncBatchNormStatsWorkspaceSize(handle_, tensor_x, + &workspace_size_)); + if (workspace_size_ > 0) { + VLOG(4) << "Malloc workspace space for deletion of CDMA."; + tmp = mlu_runtime_.allocate(workspace_size_); + VLOG(4) << "Mallocated addr: " << tmp << ", size: " << workspace_size_; + } else { + VLOG(4) << "Don't need to Malloc workspace space."; + } + workspace_.push_back(tmp); + eva_->setMluWorkspaceSize(workspace_size_); +} + +void SyncBatchnormStatsExecutor::workspaceFree() { + if (workspace_[0]) { + VLOG(4) << "Free device workspace space."; + mlu_runtime_.deallocate(workspace_[0]); + } +} + +void SyncBatchnormStatsExecutor::compute() { + float eps = parser_->getProtoNode()->sync_batchnorm_stats_param().eps(); + + mluOpTensorDescriptor_t x_desc = tensor_desc_[0].tensor; + mluOpTensorDescriptor_t mean_desc = tensor_desc_[1].tensor; + mluOpTensorDescriptor_t invstd_desc = tensor_desc_[2].tensor; + + VLOG(4) << "call mluOpSyncBatchNormStats()"; + interface_timer_.start(); +#if 1 + VLOG(4) << "launch mluOpSyncBatchNormStats_v2."; + MLUOP_CHECK(mluOpSyncBatchNormStats_v2( + handle_, x_desc, data_vector_[0].device_ptr, workspace_[0], + workspace_size_, eps, mean_desc, data_vector_[1].device_ptr, invstd_desc, + data_vector_[2].device_ptr)); +#else + VLOG(4) << "launch mluOpSyncBatchNormStats."; + MLUOP_CHECK(mluOpSyncBatchNormStats( + handle_, x_desc, data_vector_[0].device_ptr, eps, mean_desc, + data_vector_[1].device_ptr, invstd_desc, data_vector_[2].device_ptr)); +#endif 
+ interface_timer_.stop(); +} + +void kahan_stats(float input, float &sum, float &delta) { + float y = input - delta; + float t = sum + y; + delta = t - sum - y; + sum = t; +} + +void cpuSyncBatchNormStats(const float *x, const float eps, float *mean, + float *invstd, const int len_x, const int len_c) { + float len_nhw = len_x / len_c; + + bool flag_free = false; + if (mean == nullptr && invstd == nullptr) { + mean = new float[len_c]; + invstd = new float[len_c]; + flag_free = true; + } + + for (int ci = 0; ci < len_c; ++ci) { + float sum = 0, ssum = 0; + float c_sum = 0.0, c_ssum = 0.0; + const float *xc = x + ci; + for (int xi = 0; xi < len_nhw; ++xi) { + kahan_stats(xc[xi * len_c], sum, c_sum); + kahan_stats(xc[xi * len_c] * xc[xi * len_c], ssum, c_ssum); + } + mean[ci] = sum / len_nhw; + invstd[ci] = 1.0f / sqrt(ssum / len_nhw - (mean[ci] * mean[ci]) + eps); + } + + if (flag_free == true) { + delete[] mean; + delete[] invstd; + } +} + +void SyncBatchnormStatsExecutor::cpuCompute() { + float eps = parser_->getProtoNode()->sync_batchnorm_stats_param().eps(); + + int idx_c = tensor_desc_[0].tensor->dim - 1; + int len_c = tensor_desc_[0].tensor->dims[idx_c]; + int len_x = 1; + for (int i = 0; i < tensor_desc_[0].tensor->dim; ++i) { + len_x *= tensor_desc_[0].tensor->dims[i]; + } + if (len_x == 0 || len_c == 0) { + return; + } + VLOG(4) << "Start to run cpuSyncBatchNormStats()."; + cpuSyncBatchNormStats(cpu_fp32_input_[0], eps, cpu_fp32_output_[0], + cpu_fp32_output_[1], len_x, len_c); +} + +int64_t SyncBatchnormStatsExecutor::getTheoryOps() { + int cp_count = 8; + int64_t theory_ops = parser_->getOutputDataCount(0) * cp_count; + VLOG(4) << "getTheoryOps: " << theory_ops << " ops"; + return theory_ops; +} + +} // namespace mluoptest diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/sync_batchnorm_stats.h b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/sync_batchnorm_stats.h new file mode 100644 index 
000000000..57b02bec9 --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/sync_batchnorm_stats.h @@ -0,0 +1,50 @@ +/************************************************************************* + * Copyright (C) [2023] by Cambricon, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ + +#ifndef TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_STATS_SYNC_BATCHNORM_STATS_H_ +#define TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_STATS_SYNC_BATCHNORM_STATS_H_ + +#include "executor.h" + +namespace mluoptest { + +class SyncBatchnormStatsExecutor : public Executor { + public: + SyncBatchnormStatsExecutor() {} + ~SyncBatchnormStatsExecutor() {} + + void paramCheck(); + void workspaceMalloc(); + void workspaceFree(); + void compute(); + void cpuCompute(); + int64_t getTheoryOps() override; + + private: + size_t workspace_size_ = 0; +}; + +} // namespace mluoptest + +#endif // TEST_MLU_OP_GTEST_SRC_ZOO_SYNC_BATCHNORM_STATS_SYNC_\ +BATCHNORM_STATS_H_ diff --git a/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/test_case/case_0.prototxt b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/test_case/case_0.prototxt new file mode 100644 index 000000000..1b346a3ac --- /dev/null +++ b/bangc-ops/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_stats/test_case/case_0.prototxt @@ -0,0 +1,45 @@ +op_name: "sync_batchnorm_stats" +op_type: "SYNC_BATCHNORM_STATS" +input { + id: "x" + shape: { + dims: 4 + dims: 35 + dims: 35 + dims: 960 + } + layout: LAYOUT_NHWC + dtype: DTYPE_FLOAT + random_data: { + seed: 32 + upper_bound: 2.83 + lower_bound: -0.5 + distribution: UNIFORM + } +} +output { + id: "mean" + shape: { + dims: 960 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +output { + id: "invstd" + shape: { + dims: 960 + } + layout: LAYOUT_ARRAY + dtype: DTYPE_FLOAT +} +sync_batchnorm_stats_param: { + eps: 0.00001 +} +test_param: { + error_func: DIFF1 + error_func: DIFF2 + error_threshold: 0.003 + error_threshold: 0.003 + baseline_device: CPU +} diff --git a/docs/bangc-docs/user_guide/9_operators/index.rst b/docs/bangc-docs/user_guide/9_operators/index.rst index 0788171c9..9383f6dc8 100644 --- a/docs/bangc-docs/user_guide/9_operators/index.rst +++ 
b/docs/bangc-docs/user_guide/9_operators/index.rst @@ -920,3 +920,33 @@ mluOpDynamicPointToVoxelForward 1)将体素坐标 `coors` 进行排序、去重,得到新的体素坐标 `voxel_coors`; 保存去重后体素的个数 ``num_voxels`` 到 `voxel_num`; 保存 `coors` 中每个体素坐标在 `voxel_coors` 中对应的索引到 `point2voxel_map`; 保存 `voxel_coors` 中每个体素坐标在 `coors` 中出现的个数到 `voxel_points_count`; 2)遍历 `feats` 中每个点,在特征维度上,对每个值根据 `reduce_type` 的方法进行计算,将结果保存到 `voxel_feats` 中; 当 `reduce_type` = ``max``, 在特征维度上对每个值取最大的值; 当 `reduce_type` = ``mean``, 将特征维度每个值都累加到 `voxel_feats` 对应位置中,再利用 `voxel_points_count` 获取该体素位置在原始体素中出现的个数,再对 `voxel_feats` 的特征维度求平均。 + +.. _sync_batchnorm_stats: + +mluOpSyncBatchNormStats +------------------------- +该算子用来计算单卡上SyncBatchNorm的均值和标准差的倒数。 + +.. _sync_batchnorm_gather_stats_with_counts: + +mluOpSyncBatchNormGatherStatsWithCounts +----------------------------------------- +该算子用来计算SyncBatchNorm的全局均值和标准差的倒数。 + +.. _sync_batchnorm_elemt: + +mluOpSyncBatchNormElemt +------------------------- +该算子用来计算SyncBatchNorm的前向输出。 + +.. _sync_batchnorm_backward_reduce: + +mluOpSyncBatchnormBackwardReduce +---------------------------------- +该算子用来计算损失函数相对于weight和bias的梯度,以及根据开关情况决定是否输出下级element函数的中间变量 ``sum_dy`` 和 ``sum_dy_xmu``。本算子通过多卡通信的方式,解决sync_batchnorm_backward在单卡上batch size数据过大导致训练时间较长的问题。 + +.. _sync_batch_norm_backward_elemt: + +mluOpSyncBatchNormBackwardElemt +--------------------------------- +该算子用来计算输入的梯度,与 :ref:`sync_batchnorm_backward_reduce` 共同实现了sync_batchnorm_backward。