diff --git a/kernels/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp b/kernels/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp index 9f5068212..4abfb0ab2 100644 --- a/kernels/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp +++ b/kernels/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp @@ -22,11 +22,13 @@ *************************************************************************/ #include "kernels/utils/cnnl_helper.h" -mluOpStatus_t MLUOP_WIN_API mluOpGetSyncBatchnormBackwardReduceWorkspaceSize( +mluOpStatus_t MLUOP_WIN_API mluOpGetSyncBatchNormBackwardReduceWorkspaceSize( mluOpHandle_t handle, const mluOpTensorDescriptor_t desc_x, size_t *workspace_size) { - PARAM_CHECK("mluOpSyncBatchnormBackwardReduce_v2", handle != NULL); - PARAM_CHECK("mluOpSyncBatchnormBackwardReduce_v2", desc_x != NULL); + PARAM_CHECK("mluOpGetSyncBatchNormBackwardReduceWorkspaceSize", + handle != NULL); + PARAM_CHECK("mluOpGetSyncBatchNormBackwardReduceWorkspaceSize", + desc_x != NULL); DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_x, cnnl_desc_x); @@ -35,8 +37,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetSyncBatchnormBackwardReduceWorkspaceSize( cnnlGetSyncBatchnormBackwardReduceWorkspaceSize(cnnl_handle, cnnl_desc_x, workspace_size), CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchnormBackwardReduce_v2] Internal error" - " accured in mluOpGetSyncBatchnormBackwardReduceWorkspaceSize.", + "[mluOpGetSyncBatchNormBackwardReduceWorkspaceSize] Internal error" + " occurred in cnnlGetSyncBatchnormBackwardReduceWorkspaceSize.", MLUOP_STATUS_INTERNAL_ERROR); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_x); @@ -44,7 +46,18 @@ mluOpStatus_t MLUOP_WIN_API mluOpGetSyncBatchnormBackwardReduceWorkspaceSize( return MLUOP_STATUS_SUCCESS; } -mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchnormBackwardReduce( +mluOpStatus_t MLUOP_WIN_API mluOpGetSyncBatchnormBackwardReduceWorkspaceSize( + mluOpHandle_t 
handle, const mluOpTensorDescriptor_t desc_x, + size_t *workspace_size) { + LOG_FIRST_N(WARNING, 1) + << "[mluOpGetSyncBatchnormBackwardReduceWorkspaceSize] is deprecated and" + << " will be removed in the future release, please use " + << "[mluOpGetSyncBatchNormBackwardReduceWorkspaceSize] instead."; + return mluOpGetSyncBatchNormBackwardReduceWorkspaceSize( + handle, desc_x, workspace_size); +} + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormBackwardReduce( mluOpHandle_t handle, const mluOpTensorDescriptor_t desc_dz, const void *dz, const mluOpTensorDescriptor_t desc_x, const void *x, const mluOpTensorDescriptor_t desc_mean, const void *mean, @@ -55,15 +68,15 @@ mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchnormBackwardReduce( const mluOpTensorDescriptor_t desc_sum_dy_xmu, void *sum_dy_xmu, const bool needs_input_grad0, const bool needs_input_grad1, const bool needs_input_grad2) { - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", handle != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_dz != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_x != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_mean != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_invstd != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", dz != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", x != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", mean != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", invstd != NULL); + PARAM_CHECK("[mluOpSyncBatchNormBackwardReduce]", handle != NULL); + PARAM_CHECK("[mluOpSyncBatchNormBackwardReduce]", desc_dz != NULL); + PARAM_CHECK("[mluOpSyncBatchNormBackwardReduce]", desc_x != NULL); + PARAM_CHECK("[mluOpSyncBatchNormBackwardReduce]", desc_mean != NULL); + PARAM_CHECK("[mluOpSyncBatchNormBackwardReduce]", desc_invstd != NULL); + PARAM_CHECK("[mluOpSyncBatchNormBackwardReduce]", dz != NULL); + PARAM_CHECK("[mluOpSyncBatchNormBackwardReduce]", x != NULL); + 
PARAM_CHECK("[mluOpSyncBatchNormBackwardReduce]", mean != NULL); + PARAM_CHECK("[mluOpSyncBatchNormBackwardReduce]", invstd != NULL); DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_dz, cnnl_desc_dz); @@ -83,8 +96,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchnormBackwardReduce( dbias, cnnl_desc_sum_dy, sum_dy, cnnl_desc_sum_dy_xmu, sum_dy_xmu, needs_input_grad0, needs_input_grad1, needs_input_grad2), CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchnormBackwardReduce] Internal error" - " accured in mluOpSyncBatchnormBackwardReduce.", + "[mluOpSyncBatchNormBackwardReduce] Internal error" + " occurred in cnnlSyncBatchnormBackwardReduce.", MLUOP_STATUS_INTERNAL_ERROR); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_dz); @@ -99,7 +112,30 @@ mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchnormBackwardReduce( return MLUOP_STATUS_SUCCESS; } -mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchnormBackwardReduce( +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchnormBackwardReduce( + mluOpHandle_t handle, const mluOpTensorDescriptor_t desc_dz, const void *dz, + const mluOpTensorDescriptor_t desc_x, const void *x, + const mluOpTensorDescriptor_t desc_mean, const void *mean, + const mluOpTensorDescriptor_t desc_invstd, const void *invstd, + const mluOpTensorDescriptor_t desc_dfilter, void *dfilter, + const mluOpTensorDescriptor_t desc_dbias, void *dbias, + const mluOpTensorDescriptor_t desc_sum_dy, void *sum_dy, + const mluOpTensorDescriptor_t desc_sum_dy_xmu, void *sum_dy_xmu, + const bool needs_input_grad0, const bool needs_input_grad1, + const bool needs_input_grad2) { + LOG_FIRST_N(WARNING, 1) + << "[mluOpSyncBatchnormBackwardReduce] is deprecated and" + << " will be removed in the future release, please use " + << "[mluOpSyncBatchNormBackwardReduce] instead."; + return mluOpSyncBatchNormBackwardReduce( + handle, desc_dz, dz, desc_x, x, desc_mean, mean, + desc_invstd, invstd, desc_dfilter, dfilter, + desc_dbias, dbias, desc_sum_dy, sum_dy, + 
desc_sum_dy_xmu, sum_dy_xmu, + needs_input_grad0, needs_input_grad1, needs_input_grad2); +} + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormBackwardReduce_v2( mluOpHandle_t handle, const mluOpTensorDescriptor_t desc_dz, const void *dz, const mluOpTensorDescriptor_t desc_x, const void *x, const mluOpTensorDescriptor_t desc_mean, const void *mean, @@ -111,17 +147,17 @@ mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchnormBackwardReduce_v2( const mluOpTensorDescriptor_t desc_sum_dy_xmu, void *sum_dy_xmu, const bool needs_input_grad0, const bool needs_input_grad1, const bool needs_input_grad2) { - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", handle != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_dz != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_x != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_mean != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_invstd != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", dz != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", x != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", mean != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", invstd != NULL); + PARAM_CHECK("[mluOpSyncBatchNormBackwardReduce_v2]", handle != NULL); + PARAM_CHECK("[mluOpSyncBatchNormBackwardReduce_v2]", desc_dz != NULL); + PARAM_CHECK("[mluOpSyncBatchNormBackwardReduce_v2]", desc_x != NULL); + PARAM_CHECK("[mluOpSyncBatchNormBackwardReduce_v2]", desc_mean != NULL); + PARAM_CHECK("[mluOpSyncBatchNormBackwardReduce_v2]", desc_invstd != NULL); + PARAM_CHECK("[mluOpSyncBatchNormBackwardReduce_v2]", dz != NULL); + PARAM_CHECK("[mluOpSyncBatchNormBackwardReduce_v2]", x != NULL); + PARAM_CHECK("[mluOpSyncBatchNormBackwardReduce_v2]", mean != NULL); + PARAM_CHECK("[mluOpSyncBatchNormBackwardReduce_v2]", invstd != NULL); if (workspace_size > 0) { - PARAM_CHECK("mluOpSyncBatchnormBackwardReduce_v2", workspace != NULL); + 
PARAM_CHECK("mluOpSyncBatchNormBackwardReduce_v2", workspace != NULL); } DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); @@ -143,8 +179,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchnormBackwardReduce_v2( sum_dy, cnnl_desc_sum_dy_xmu, sum_dy_xmu, needs_input_grad0, needs_input_grad1, needs_input_grad2), CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchnormBackwardReduce] Internal error" - " accured in mluOpSyncBatchnormBackwardReduce_v2.", + "[mluOpSyncBatchNormBackwardReduce_v2] Internal error" + " occurred in cnnlSyncBatchnormBackwardReduce_v2.", MLUOP_STATUS_INTERNAL_ERROR); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_dz); @@ -158,3 +194,27 @@ mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchnormBackwardReduce_v2( DESTROY_CNNL_HANDLE(cnnl_handle); return MLUOP_STATUS_SUCCESS; } + +mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchnormBackwardReduce_v2( + mluOpHandle_t handle, const mluOpTensorDescriptor_t desc_dz, const void *dz, + const mluOpTensorDescriptor_t desc_x, const void *x, + const mluOpTensorDescriptor_t desc_mean, const void *mean, + const mluOpTensorDescriptor_t desc_invstd, const void *invstd, + void *workspace, size_t workspace_size, + const mluOpTensorDescriptor_t desc_dfilter, void *dfilter, + const mluOpTensorDescriptor_t desc_dbias, void *dbias, + const mluOpTensorDescriptor_t desc_sum_dy, void *sum_dy, + const mluOpTensorDescriptor_t desc_sum_dy_xmu, void *sum_dy_xmu, + const bool needs_input_grad0, const bool needs_input_grad1, + const bool needs_input_grad2) { + LOG_FIRST_N(WARNING, 1) + << "[mluOpSyncBatchnormBackwardReduce_v2] is deprecated and" + << " will be removed in the future release, please use " + << "[mluOpSyncBatchNormBackwardReduce_v2] instead."; + return mluOpSyncBatchNormBackwardReduce_v2( + handle, desc_dz, dz, desc_x, x, desc_mean, mean, + desc_invstd, invstd, workspace, workspace_size, + desc_dfilter, dfilter, desc_dbias, dbias, + desc_sum_dy, sum_dy, desc_sum_dy_xmu, sum_dy_xmu, + needs_input_grad0, needs_input_grad1, needs_input_grad2); +} 
diff --git a/mlu_op.h b/mlu_op.h index ed7d0d847..3f61b45ed 100644 --- a/mlu_op.h +++ b/mlu_op.h @@ -7737,7 +7737,7 @@ mluOpMutualInformationForward(mluOpHandle_t handle, * including the input tensor descriptors \b pts_desc. * * @par Deprecated - * - :: mluOpGetRoiawarePool3dForwardWorkspaceSize is deprecated and will be removed in the future + * - ::mluOpGetRoiawarePool3dForwardWorkspaceSize is deprecated and will be removed in the future * release. It is recommended to use ::mluOpGetRoiAwarePool3dForwardWorkspaceSize instead. * * @param[in] handle @@ -7846,6 +7846,10 @@ mluOpGetRoiAwarePool3dForwardWorkspaceSize(mluOpHandle_t handle, * also performs max pooling or average pooling on the voxels and results in \b argmax * and \b pooled_features. * + * @par Deprecated + * - ::mluOpRoiawarePool3dForward is deprecated and will be removed in the future + * release. It is recommended to use ::mluOpRoiAwarePool3dForward instead. + * * @param[in] handle * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and queues in * ::mluOpRoiAwarePool3dForward operation. For detailed information, see ::mluOpHandle_t. @@ -8104,7 +8108,7 @@ mluOpRoiAwarePool3dForward(mluOpHandle_t handle, * performing the backpropagation of ::mluOpRoiAwarePool3dForward. * * @par Deprecated - * - :: mluOpRoiawarePool3dBackward is deprecated and will be removed in the future + * - ::mluOpRoiawarePool3dBackward is deprecated and will be removed in the future * release. It is recommended to use ::mluOpRoiAwarePool3dBackward instead. * * @param[in] handle @@ -11872,13 +11876,66 @@ mluOpSyncBatchNormElemt(mluOpHandle_t handle, const mluOpTensorDescriptor_t y_desc, void *y); -// Group: SyncBatchnormBackwardReduce +// Group: SyncBatchNormBackwardReduce +/*! + * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra + * workspace to optimize the sync_batchnorm_backward_reduce operation. 
+ * + * The size of extra workspace is based on the given information of + * ::mluOpSyncBatchNormBackwardReduce_v2 operation, including the input tensor descriptor \b x_desc. + * + * @param[in] handle + * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and queues in the + * sync_batchnorm_backward_reduce operation. For detailed information, see ::mluOpHandle_t. + * @param[in] x_desc + * The descriptor of the input tensor. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] workspace_size + * Pointer to the returned size of the extra workspace in bytes that is used in + * ::mluOpSyncBatchNormBackwardReduce_v2 operation. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - None. + * + * @par Data Layout + * - None. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - None. + * + * @par note + * - This API is only used along with ::mluOpSyncBatchNormBackwardReduce_v2. + * - ::mluOpSyncBatchNormBackwardReduce does not require this API. + * + * @par Example + * - None. + * + * @par Reference + * - None. + */ +mluOpStatus_t MLUOP_WIN_API +mluOpGetSyncBatchNormBackwardReduceWorkspaceSize(mluOpHandle_t handle, + const mluOpTensorDescriptor_t x_desc, + size_t *workspace_size); + +// Group: Deprecated APIs /*! * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra * workspace to optimize the sync_batchnorm_backward_reduce operation. * * The size of extra workspace is based on the given information of - * ::mluOpSyncBatchnormBackwardReduce_v2 operation, including the input tensor descriptor \b x_desc. + * ::mluOpSyncBatchNormBackwardReduce_v2 operation, including the input tensor descriptor \b x_desc. + * + * @par Deprecated + * - ::mluOpGetSyncBatchnormBackwardReduceWorkspaceSize is deprecated and will be + * removed in the future release. It is recommended to use + * ::mluOpGetSyncBatchNormBackwardReduceWorkspaceSize instead. 
* * @param[in] handle * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and queues in the mse_loss @@ -11888,7 +11945,7 @@ mluOpSyncBatchNormElemt(mluOpHandle_t handle, * ::mluOpTensorDescriptor_t. * @param[out] workspace_size * Pointer to the returned size of the extra workspace in bytes that is used in - * ::mluOpSyncBatchnormBackwardReduce_v2 operation. + * ::mluOpSyncBatchNormBackwardReduce_v2 operation. * * @par Return * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM @@ -11906,8 +11963,8 @@ mluOpSyncBatchNormElemt(mluOpHandle_t handle, * - None. * * @par note - * - This API is only used along with ::mluOpSyncBatchnormBackwardReduce_v2. - * - ::mluOpSyncBatchnormBackwardReduce does not require this API. + * - This API is only used along with ::mluOpSyncBatchNormBackwardReduce_v2. + * - ::mluOpSyncBatchNormBackwardReduce does not require this API. * * @par Example * - None. @@ -11920,7 +11977,7 @@ mluOpGetSyncBatchnormBackwardReduceWorkspaceSize(mluOpHandle_t handle, const mluOpTensorDescriptor_t x_desc, size_t *workspace_size); -// Group: SyncBatchnormBackwardReduce +// Group: SyncBatchNormBackwardReduce /*! * @brief Applies Synchronized Batch Normalization Reduce operator to backwardly compute grad * filters, grad bias, sum_dy and sum_dy_xmu on each MLU device. @@ -11928,13 +11985,13 @@ mluOpGetSyncBatchnormBackwardReduceWorkspaceSize(mluOpHandle_t handle, * Batch Normalization is used in convolution network, including but not limited to * ResNet (Residual Network), Yolo (You Only Look Once) and R-CNN (Regions with CNN features). * - * Compared with ::mluOpSyncBatchnormBackwardReduce, this function allows you to allocate some extra + * Compared with ::mluOpSyncBatchNormBackwardReduce, this function allows you to allocate some extra * workspace as an input parameter. If you just set \b workspace to NULL and \b workspace_size to 0, - * this function will perform as same as ::mluOpSyncBatchnormBackwardReduce. 
+ * this function will perform as same as ::mluOpSyncBatchNormBackwardReduce. * * @param[in] handle * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and queues in - * ::mluOpSyncBatchnormBackwardReduce_v2 operation. For detailed information, see ::mluOpHandle_t. + * ::mluOpSyncBatchNormBackwardReduce_v2 operation. For detailed information, see ::mluOpHandle_t. * @param[in] desc_dz * The descriptor of the input tensor \b dz. For detailed information, see * ::mluOpTensorDescriptor_t. @@ -11958,11 +12015,11 @@ mluOpGetSyncBatchnormBackwardReduceWorkspaceSize(mluOpHandle_t handle, * standard deviation of input \b x. * @param[in] workspace * Pointer to the MLU memory that is used as an extra workspace for - * ::mluOpSyncBatchnormBackwardReduce_v2. + * ::mluOpSyncBatchNormBackwardReduce_v2. * @param[in] workspace_size * The size of the extra workspace in bytes that needs to be used in - * the ::mluOpSyncBatchnormBackwardReduce_v2. You can get the size of the workspace with - * the ::mluOpGetSyncBatchnormBackwardReduceWorkspaceSize function. + * the ::mluOpSyncBatchNormBackwardReduce_v2. You can get the size of the workspace with + * the ::mluOpGetSyncBatchNormBackwardReduceWorkspaceSize function. * @param[out] desc_dfilter * The descriptor of \b dfilters tensor. For detailed information, see ::mluOpTensorDescriptor_t. * @param[out] dfilter @@ -12030,8 +12087,8 @@ mluOpGetSyncBatchnormBackwardReduceWorkspaceSize(mluOpHandle_t handle, * - None. * * @par API Dependency - * - Before calling this function to perform ::mluOpSyncBatchnormBackwardReduce_v2, you need to get - * the size of workspace by ::mluOpGetSyncBatchnormBackwardReduceWorkspaceSize. + * - Before calling this function to perform ::mluOpSyncBatchNormBackwardReduce_v2, you need to get + * the size of workspace by ::mluOpGetSyncBatchNormBackwardReduceWorkspaceSize. 
* * @par note * - The \b mean, \b invstd, \b dfilter, \b bias, \b sum_dy and \b sum_dy_xmu must be 1D tensors @@ -12040,7 +12097,184 @@ mluOpGetSyncBatchnormBackwardReduceWorkspaceSize(mluOpHandle_t handle, * - The length of each dimension of \b x and \b dz must be the same. * * @par Example - * - The example of ::mluOpSyncBatchnormBackwardReduce_v2 operation is as follows: + * - The example of ::mluOpSyncBatchNormBackwardReduce_v2 operation is as follows: + @verbatim + input four arrays by 1 * 2 * 3 * 2, 2, 2, 2 and 2 + --> dz: [[[[6.0, 6.0],[6.0, 6.0],[6.0, 6.0]], + [[6.0, 6.0],[6.0, 6.0],[6.0, 6.0]]]] + + --> x: [[[[3.0, 3.0],[3.0, 3.0],[3.0, 3.0]], + [[3.0, 3.0],[3.0, 3.0],[3.0, 3.0]]]] + + --> mean: [1, 1] + + --> invstd: [0.8, 0.8] + + output array by 2 + --> dfilter: [57.6, 57.6] + + --> dbias: [36.0, 36.0] + + --> sum_dy: [36.0, 36.0] + + --> sum_dy_xmu: [72.0, 72.0] + @endverbatim + * + * @par Reference + * - Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift, + * Sergey Ioffe, 2015. + * + */ +mluOpStatus_t MLUOP_WIN_API +mluOpSyncBatchNormBackwardReduce_v2(mluOpHandle_t handle, + const mluOpTensorDescriptor_t desc_dz, + const void *dz, + const mluOpTensorDescriptor_t desc_x, + const void *x, + const mluOpTensorDescriptor_t desc_mean, + const void *mean, + const mluOpTensorDescriptor_t desc_invstd, + const void *invstd, + void *workspace, + size_t workspace_size, + const mluOpTensorDescriptor_t desc_dfilter, + void *dfilter, + const mluOpTensorDescriptor_t desc_dbias, + void *dbias, + const mluOpTensorDescriptor_t desc_sum_dy, + void *sum_dy, + const mluOpTensorDescriptor_t desc_sum_dy_xmu, + void *sum_dy_xmu, + const bool needs_input_grad0, + const bool needs_input_grad1, + const bool needs_input_grad2); + +// Group: Deprecated APIs +/*! + * @brief Applies Synchronized Batch Normalization Reduce operator to backwardly compute grad + * filters, grad bias, sum_dy and sum_dy_xmu on each MLU device. 
+ * + * Batch Normalization is used in convolution network, including but not limited to + * ResNet (Residual Network), Yolo (You Only Look Once) and R-CNN (Regions with CNN features). + * + * Compared with ::mluOpSyncBatchNormBackwardReduce, this function allows you to allocate some extra + * workspace as an input parameter. If you just set \b workspace to NULL and \b workspace_size to 0, + * this function will perform as same as ::mluOpSyncBatchNormBackwardReduce. + * + * @par Deprecated + * - ::mluOpSyncBatchnormBackwardReduce_v2 is deprecated and will be + * removed in the future release. It is recommended to use + * ::mluOpSyncBatchNormBackwardReduce_v2 instead. + * + * @param[in] handle + * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and queues in + * ::mluOpSyncBatchNormBackwardReduce_v2 operation. For detailed information, see ::mluOpHandle_t. + * @param[in] desc_dz + * The descriptor of the input tensor \b dz. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] dz + * Pointer to the MLU memory that stores the tensor \b dz, which denotes the partial + * derivative of batch normalization forward output. + * @param[in] desc_x + * The descriptor of the input tensor \b x. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] x + * Pointer to the MLU memory that stores the input tensor \b x. + * @param[in] desc_mean + * The descriptor of \b mean tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] mean + * Pointer to the MLU memory that stores the tensor \b mean, which denotes the average + * result of input \b x. + * @param[in] desc_invstd + * The descriptor of \b invstd tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] invstd + * Pointer to the MLU memory that stores the tensor \b invstd, which denotes the inversed + * standard deviation of input \b x. 
+ * @param[in] workspace + * Pointer to the MLU memory that is used as an extra workspace for + * ::mluOpSyncBatchNormBackwardReduce_v2. + * @param[in] workspace_size + * The size of the extra workspace in bytes that needs to be used in + * the ::mluOpSyncBatchNormBackwardReduce_v2. You can get the size of the workspace with + * the ::mluOpGetSyncBatchNormBackwardReduceWorkspaceSize function. + * @param[out] desc_dfilter + * The descriptor of \b dfilters tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * @param[out] dfilter + * Pointer to the MLU memory that stores the input tensor \b dfilters, which denotes + * partial derivative of filter in sync batch normalization forward training. It will be computed + * only if boolean variable \b needs_input_grad1 is true. + * @param[out] desc_dbias + * The descriptor of the sync batch normalization output tensor \b dbias. For detailed + * information, see ::mluOpTensorDescriptor_t. + * @param[out] dbias + * Pointer to the MLU memory that stores the output tensor \b dbias, which denotes partial + * derivative of bias in sync batch normalization forward training. It will be computed + * only if \b needs_input_grad2 is true. + * @param[out] desc_sum_dy + * The descriptor of the sync batch normalization output tensor \b sum_dy. For detailed + * information, see ::mluOpTensorDescriptor_t. + * @param[out] sum_dy + * Pointer to the MLU memory that stores the output tensor \b sum_dy, which denotes the + * summation of dz and is also an intermediate variable to compute the partial derivative of + * input x. Moreover, it will be computed only if boolean variable \b needs_input_grad0 is true. + * @param[out] desc_sum_dy_xmu + * The descriptor of the sync batch normalization output tensor \b sum_dy_xmu. For detailed + * information, see ::mluOpTensorDescriptor_t. + * @param[out] sum_dy_xmu + * Pointer to the MLU memory that stores the output tensor \b sum_dy_xmu, which denotes + * sum{dz(x-mean)}. 
It is also an intermediate variable to compute the partial derivative of + * input \b x. Moreover, it will be computed only if boolean variable \b needs_input_grad0 is + * true. + * @param[in] needs_input_grad0 + * A boolean variable that determines whether to compute \b sum_dy and \b sum_dy_xmu. + * When \b needs_input_grad0 is true, \b sum_dy and \b sum_dy_xmu will be computed. + * When \b needs_input_grad0 is false, \b sum_dy and \b sum_dy_xmu will be NULL. + * @param[in] needs_input_grad1 + * A boolean variable that determines whether to compute \b dfilters. + * When \b needs_input_grad1 is true, \b dfilters will be computed. + * When \b needs_input_grad1 is false, \b dfilter will be NULL. + * @param[in] needs_input_grad2 + * A boolean variable that determines whether to compute \b dbias. + * When \b needs_input_grad2 is true, \b dbias will be computed. + * When \b needs_input_grad2 is false, \b dbias will be NULL. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ARCH_MISMATCH, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - The supported combinations of data types are shown below with the following order: + * - dz_tensor - x_tensor - mean_tensor - invstd_tensor - dfilter_tensor - dbias_tensor - + * sum_dy_tensor - sum_dy_xmu_tensor + * - float - float - float - float - float - float - float - float. + * - half - half - float - float - float - float - float - float. + * + * @par Data Layout + * - The supported data layout of \b dz, \b x, \b mean, \b invstd, \b dfilter, \b dbias, \b sum_dy + * and \b sum_dy_xmu is as follows: + * - dz tensor: \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NLC, \p MLUOP_LAYOUT_NC. + * - x tensor: \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NLC, \p MLUOP_LAYOUT_NC. + * - mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - invstd tensor: \p MLUOP_LAYOUT_ARRAY. + * - dfilter tensor: \p MLUOP_LAYOUT_ARRAY. + * - dbias tensor: \p MLUOP_LAYOUT_ARRAY. + * - sum_dy tensor: \p MLUOP_LAYOUT_ARRAY. 
+ * - sum_dy_xmu tensor: \p MLUOP_LAYOUT_ARRAY. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - Before calling this function to perform ::mluOpSyncBatchNormBackwardReduce_v2, you need to get + * the size of workspace by ::mluOpGetSyncBatchNormBackwardReduceWorkspaceSize. + * + * @par note + * - The \b mean, \b invstd, \b dfilter, \b bias, \b sum_dy and \b sum_dy_xmu must be 1D tensors + * and the length of the dimensions of these tensors should be the same as the length of + * the lowest dimension of \b x. + * - The length of each dimension of \b x and \b dz must be the same. + * + * @par Example + * - The example of ::mluOpSyncBatchNormBackwardReduce_v2 operation is as follows: @verbatim input four arrays by 1 * 2 * 3 * 2, 2, 2, 2 and 2 --> dz: [[[[6.0, 6.0],[6.0, 6.0],[6.0, 6.0]], @@ -12092,7 +12326,162 @@ mluOpSyncBatchnormBackwardReduce_v2(mluOpHandle_t handle, const bool needs_input_grad1, const bool needs_input_grad2); -// Group: SyncBatchnormBackwardReduce +// Group: SyncBatchNormBackwardReduce +/*! + * @brief Applies Synchronized Batch Normalization Reduce operator to backwardly compute grad filters, + * grad bias, sum_dy and sum_dy_xmu on each MLU device. + * + * Batch Normalization is used in CNN, including but not limited to + * ResNet (Residual Network), Yolo (You Only Look Once) and R-CNN (Regions with CNN features). + * + * @param[in] handle + * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and queues in the + * ::mluOpSyncBatchNormBackwardReduce operation. For detailed information, see ::mluOpHandle_t. + * @param[in] desc_dz + * The descriptor of the input tensor \b dz. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] dz + * Pointer to the MLU memory that stores the tensor \b dz, which denotes the partial derivative of + * batch normalization forward output. + * @param[in] desc_x + * The descriptor of the input tensor \b x. 
For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[in] x + * Pointer to the MLU memory that stores the input tensor \b x. + * @param[in] desc_mean + * The descriptor of \b mean tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] mean + * Pointer to the MLU memory that stores the tensor \b mean, which denotes the average result of + * input \b x. + * @param[in] desc_invstd + * The descriptor of \b invstd tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * @param[in] invstd + * Pointer to the MLU memory that stores the tensor \b invstd, which denotes the inversed standard deviation + * of input \b x. + * @param[out] desc_dfilter + * The descriptor of \b dfilter tensor. For detailed information, see ::mluOpTensorDescriptor_t. + * @param[out] dfilter + * Pointer to the MLU memory that stores the input tensor \b dfilter, which denotes partial derivative + * of filter in sync batch normalization forward training. It will be computed only if boolean variable + * \b needs_input_grad1 is true. + * @param[out] desc_dbias + * The descriptor of the sync batch normalization output tensor \b dbias. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] dbias + * Pointer to the MLU memory that stores the output tensor \b dbias, which denotes partial derivative of + * bias in sync batch normalization forward training. It will be computed only if \b needs_input_grad2 is true. + * @param[out] desc_sum_dy + * The descriptor of the sync batch normalization output tensor \b sum_dy. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] sum_dy + * Pointer to the MLU memory that stores the output tensor \b sum_dy, which denotes the summation of dz + * and is also an intermediate variable to compute the partial derivative of input x. Moreover, it will be + * computed only if boolean variable \b needs_input_grad0 is true. 
+ * @param[out] desc_sum_dy_xmu + * The descriptor of the sync batch normalization output tensor \b sum_dy_xmu. For detailed information, see + * ::mluOpTensorDescriptor_t. + * @param[out] sum_dy_xmu + * Pointer to the MLU memory that stores the output tensor \b sum_dy_xmu, which denotes sum{dz(x-mean)}. + * It is also an intermediate variable to compute the partial derivative of + * input \b x. Moreover, it will be computed only if boolean variable \b needs_input_grad0 is true. + * @param[in] needs_input_grad0 + * A boolean variable that determines whether to compute \b sum_dy and \b sum_dy_xmu. + * When \b needs_input_grad0 is true, \b sum_dy and \b sum_dy_xmu will be computed. + * When \b needs_input_grad0 is false, \b sum_dy and \b sum_dy_xmu will be NULL. + * @param[in] needs_input_grad1 + * A boolean variable that determines whether to compute \b dfilters. + * When \b needs_input_grad1 is true, \b dfilters will be computed. + * When \b needs_input_grad1 is false, \b dfilter will be NULL. + * @param[in] needs_input_grad2 + * A boolean variable that determines whether to compute \b dbias. + * When \b needs_input_grad2 is true, \b dbias will be computed. + * When \b needs_input_grad2 is false, \b dbias will be NULL. + * + * @par Return + * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_ARCH_MISMATCH, ::MLUOP_STATUS_BAD_PARAM + * + * @par Data Type + * - The supported combinations of data types are shown below with the following order: + * - dz_tensor - x_tensor - mean_tensor - invstd_tensor - dfilter_tensor - dbias_tensor - sum_dy_tensor + * - sum_dy_xmu_tensor + * - float - float - float - float - float - float - float - float. + * - half - half - float - float - float - float - float - float. + * + * @par Data Layout + * - The supported data layout of \b dz, \b x, \b mean, \b invstd, \b dfilter, \b dbias, \b sum_dy and + * \b sum_dy_xmu is as follows: + * - dz tensor: \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NLC, \p MLUOP_LAYOUT_NC. 
+ * - x tensor: \p MLUOP_LAYOUT_NDHWC, \p MLUOP_LAYOUT_NHWC, \p MLUOP_LAYOUT_NLC, \p MLUOP_LAYOUT_NC. + * - mean tensor: \p MLUOP_LAYOUT_ARRAY. + * - invstd tensor: \p MLUOP_LAYOUT_ARRAY. + * - dfilter tensor: \p MLUOP_LAYOUT_ARRAY. + * - dbias tensor: \p MLUOP_LAYOUT_ARRAY. + * - sum_dy tensor: \p MLUOP_LAYOUT_ARRAY. + * - sum_dy_xmu tensor: \p MLUOP_LAYOUT_ARRAY. + * + * @par Scale Limitation + * - None. + * + * @par API Dependency + * - None. + * + * @par note + * - The \b mean, \b invstd, \b dfilter, \b bias, \b sum_dy and \b sum_dy_xmu must be 1D tensors and the + * length of the dimensions of these tensors should be the same as the length of the lowest dimension of \b x. + * - The length of each dimension of \b x and \b dz must be the same. + * + * @par Example + * - The example of ::mluOpSyncBatchNormBackwardReduce operation is as follows: + @verbatim + input four arrays by 1 * 2 * 3 * 2, 2, 2, 2 and 2 + --> dz: [[[[6.0, 6.0],[6.0, 6.0],[6.0, 6.0]], + [[6.0, 6.0],[6.0, 6.0],[6.0, 6.0]]]] + + --> x: [[[[3.0, 3.0],[3.0, 3.0],[3.0, 3.0]], + [[3.0, 3.0],[3.0, 3.0],[3.0, 3.0]]]] + + --> mean: [1, 1] + + --> invstd: [0.8, 0.8] + + output array by 2 + --> dfilter: [57.6, 57.6] + + --> dbias: [36.0, 36.0] + + --> sum_dy: [36.0, 36.0] + + --> sum_dy_xmu: [72.0, 72.0] + @endverbatim + * + * @par Reference + * - Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift, + * Sergey Ioffe, 2015. 
+ * + */ +mluOpStatus_t MLUOP_WIN_API +mluOpSyncBatchNormBackwardReduce(mluOpHandle_t handle, + const mluOpTensorDescriptor_t desc_dz, + const void *dz, + const mluOpTensorDescriptor_t desc_x, + const void *x, + const mluOpTensorDescriptor_t desc_mean, + const void *mean, + const mluOpTensorDescriptor_t desc_invstd, + const void *invstd, + const mluOpTensorDescriptor_t desc_dfilter, + void *dfilter, + const mluOpTensorDescriptor_t desc_dbias, + void *dbias, + const mluOpTensorDescriptor_t desc_sum_dy, + void *sum_dy, + const mluOpTensorDescriptor_t desc_sum_dy_xmu, + void *sum_dy_xmu, + const bool needs_input_grad0, + const bool needs_input_grad1, + const bool needs_input_grad2); + +// Group: Deprecated APIs /*! * @brief Applies Synchronized Batch Normalization Reduce operator to backwardly compute grad filters, * grad bias, sum_dy and sum_dy_xmu on each MLU device. @@ -12100,9 +12489,14 @@ mluOpSyncBatchnormBackwardReduce_v2(mluOpHandle_t handle, * Batch Normalization is used in CNN, including but not limited to * ResNet (Residual Network), Yolo (You Only Look Once) and R-CNN (Regions with CNN features). * + * @par Deprecated + * - ::mluOpSyncBatchnormBackwardReduce is deprecated and will be + * removed in the future release. It is recommended to use + * ::mluOpSyncBatchNormBackwardReduce instead. + * * @param[in] handle * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and queues in the - * ::mluOpSyncBatchnormBackwardReduce operation. For detailed information, see ::mluOpHandle_t. + * ::mluOpSyncBatchNormBackwardReduce operation. For detailed information, see ::mluOpHandle_t. * @param[in] desc_dz * The descriptor of the input tensor \b dz. For detailed information, see * ::mluOpTensorDescriptor_t. @@ -12197,7 +12591,7 @@ mluOpSyncBatchnormBackwardReduce_v2(mluOpHandle_t handle, * - The length of each dimension of \b x and \b dz must be the same. 
* * @par Example - * - The example of ::mluOpSyncBatchnormBackwardReduce operation is as follows: + * - The example of ::mluOpSyncBatchNormBackwardReduce operation is as follows: @verbatim input four arrays by 1 * 2 * 3 * 2, 2, 2, 2 and 2 --> dz: [[[[6.0, 6.0],[6.0, 6.0],[6.0, 6.0]], diff --git a/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp b/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp index 3b0fc7216..b0b415fd2 100644 --- a/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp +++ b/test/mlu_op_gtest/pb_gtest/src/zoo/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp @@ -33,7 +33,7 @@ void SyncBatchnormBackwardReduceExecutor::workspaceMalloc() { auto tensor_x = tensor_desc_[1].tensor; void *tmp = nullptr; // allocate extra nram space for deletion of CDMA - MLUOP_CHECK(mluOpGetSyncBatchnormBackwardReduceWorkspaceSize( + MLUOP_CHECK(mluOpGetSyncBatchNormBackwardReduceWorkspaceSize( handle_, tensor_x, &workspace_size_)); if (workspace_size_ > 0) { VLOG(4) << "Malloc workspace space for deletion of CDMA."; @@ -160,15 +160,15 @@ void SyncBatchnormBackwardReduceExecutor::compute() { VLOG(4) << "Start to run mluOpSyncBatchNormBackwardReduce()."; interface_timer_.start(); #if 1 - VLOG(4) << "launch mluOpSyncBatchnormBackwardReduce_v2."; - MLUOP_CHECK(mluOpSyncBatchnormBackwardReduce_v2( + VLOG(4) << "launch mluOpSyncBatchNormBackwardReduce_v2."; + MLUOP_CHECK(mluOpSyncBatchNormBackwardReduce_v2( handle_, desc_dz, dev_dz, desc_x, dev_x, desc_mean, dev_mean, desc_invstd, dev_invstd, workspace_[0], workspace_size_, desc_dweight, dev_dweight, desc_dbias, dev_dbias, desc_sum_dy, dev_sum_dy, desc_sum_dy_xmu, dev_sum_dy_xmu, needs_input_grad0, needs_input_grad1, needs_input_grad2)); #else - VLOG(4) << "launch mluOpSyncBatchnormBackwardReduce."; - MLUOP_CHECK(mluOpSyncBatchnormBackwardReduce( + VLOG(4) << 
"launch mluOpSyncBatchNormBackwardReduce."; + MLUOP_CHECK(mluOpSyncBatchNormBackwardReduce( handle_, desc_dz, dev_dz, desc_x, dev_x, desc_mean, dev_mean, desc_invstd, dev_invstd, desc_dweight, dev_dweight, desc_dbias, dev_dbias, desc_sum_dy, dev_sum_dy, desc_sum_dy_xmu, dev_sum_dy_xmu, needs_input_grad0,