From 10143abcf5ffe9d5e0a4b20b43f7c30eeadf193a Mon Sep 17 00:00:00 2001 From: mahxn0 <1262384588@qq.com> Date: Thu, 25 Jan 2024 11:50:14 +0800 Subject: [PATCH] [Feature](mlu-ops):modify cmake and kernels content. --- CMakeLists.txt | 27 +- kernel_depends.toml | 5 +- .../border_align_backward.cpp | 161 -- .../border_align_backward.h | 34 - .../border_align_backward_union1.mlu | 311 ---- .../border_align_forward.cpp | 136 -- .../border_align_forward.h | 37 - .../border_align_forward_union1.mlu | 413 ------ .../dcn_backward_data/dcn_backward_data.cpp | 136 -- .../dcn_backward_weight.cpp | 109 -- kernels/dcn_forward/dcn_common.h | 69 - kernels/dcn_forward/dcn_forward.cpp | 103 -- .../dynamic_point_to_voxel_backward.cpp | 377 ----- .../dynamic_point_to_voxel_backward.h | 37 - ...dynamic_point_to_voxel_backward_union1.mlu | 201 --- .../dynamic_point_to_voxel_forward.cpp | 337 ----- .../dynamic_point_to_voxel_forward.h | 39 - .../dynamic_point_to_voxel_forward_union1.mlu | 338 ----- .../dynamic_point_to_voxel_mask_block.mlu | 175 --- kernels/get_indice_pairs/get_indice_pairs.cpp | 252 ---- .../get_indice_pairs_block.mlu | 558 ------- .../get_indice_pairs_structs.cpp | 133 -- .../get_indice_pairs_structs.h | 100 -- .../get_indice_pairs/get_indice_pairs_utils.h | 348 ----- .../normal_get_indice_pairs.cpp | 1299 ----------------- .../normal_get_indice_pairs.h | 89 -- .../indice_convolution_backward_data.cpp | 904 ------------ .../indice_convolution_backward_data.h | 36 - .../indice_convolution_backward_filter.cpp | 605 -------- .../indice_convolution_forward.cpp | 636 -------- .../masked_col2im_forward.cpp | 346 ----- .../masked_col2im_forward.h | 35 - .../masked_col2im_forward_union1.mlu | 121 -- .../masked_im2col_forward.cpp | 373 ----- .../masked_im2col_forward.h | 36 - .../masked_im2col_forward_union1.mlu | 100 -- .../moe_dispatch_backward_data.cpp | 220 --- .../moe_dispatch_backward_data.h | 42 - .../moe_dispatch_backward_data_union1.mlu | 339 ----- .../moe_dispatch_backward_gate.cpp | 260 ---- .../moe_dispatch_backward_gate.h | 42 - .../moe_dispatch_backward_gate_union1.mlu | 387 ----- .../moe_dispatch_forward.cpp | 200 --- .../moe_dispatch_forward.h | 35 - .../moe_dispatch_forward_block.mlu | 155 -- .../ms_deform_attn_backward.cpp | 446 ------ .../ms_deform_attn_backward.h | 61 - .../ms_deform_attn_backward_fast_union1.mlu | 626 -------- ...rm_attn_backward_small_channels_union1.mlu | 1012 ------------- .../ms_deform_attn_backward_union1.mlu | 296 ---- .../ms_deform_attn_forward.h | 59 - .../ms_deform_attn_forward.mlu | 340 ----- .../ms_deform_attn_utils.h | 398 ----- .../msda_forward_fast_union1.mlu | 1280 ---------------- .../msda_forward_small_channel_union1.mlu | 557 ------- .../msda_forward_union1_default.mlu | 484 ------ .../mutual_information_backward.cpp | 863 ----------- .../mutual_information_backward.h | 45 - ...l_information_backward_3pipeline_block.mlu | 289 ---- ...ual_information_backward_default_block.mlu | 455 ------ .../mutual_information_backward_utils.h | 49 - .../mutual_information_forward.cpp | 741 ---------- .../mutual_information_forward.h | 41 - ...al_information_forward_3pipeline_block.mlu | 227 --- ...tual_information_forward_default_block.mlu | 307 ---- .../mutual_information_forward_utils.h | 73 - .../roi_align_backward/roi_align_backward.cpp | 109 -- .../sync_batchnorm_backward_reduce.cpp | 160 -- .../roi_pooling_backward.cpp | 65 - .../roi_pooling_forward.cpp | 59 - kernels/roialign_forward/roialign_forward.cpp | 102 -- .../sync_batch_norm_backward_elemt.cpp | 0 
.../sync_batchnorm_backward_elemt_v2.cpp | 0 .../sync_batchnorm_backward_reduce.cpp | 0 .../sync_batchnorm_elemt.cpp | 0 ...ync_batchnorm_gather_stats_with_counts.cpp | 0 .../sync_batchnorm_stats.cpp | 0 .../sync_batch_norm_backward_elemt.cpp | 83 -- .../sync_batchnorm_backward_elemt_v2.cpp | 88 -- .../sync_batchnorm_elemt.cpp | 73 - ...ync_batchnorm_gather_stats_with_counts.cpp | 89 -- .../sync_batchnorm_stats.cpp | 112 -- mlu_op.h | 96 +- 83 files changed, 71 insertions(+), 20310 deletions(-) delete mode 100644 kernels/border_align_backward/border_align_backward.cpp delete mode 100644 kernels/border_align_backward/border_align_backward.h delete mode 100644 kernels/border_align_backward/border_align_backward_union1.mlu delete mode 100644 kernels/border_align_forward/border_align_forward.cpp delete mode 100644 kernels/border_align_forward/border_align_forward.h delete mode 100644 kernels/border_align_forward/border_align_forward_union1.mlu delete mode 100755 kernels/dcn_backward_data/dcn_backward_data.cpp delete mode 100644 kernels/dcn_backward_weight/dcn_backward_weight.cpp delete mode 100644 kernels/dcn_forward/dcn_common.h delete mode 100644 kernels/dcn_forward/dcn_forward.cpp delete mode 100644 kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp delete mode 100644 kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.h delete mode 100644 kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_union1.mlu delete mode 100644 kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp delete mode 100644 kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.h delete mode 100644 kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward_union1.mlu delete mode 100644 kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_mask_block.mlu delete mode 100644 kernels/get_indice_pairs/get_indice_pairs.cpp delete mode 100644 kernels/get_indice_pairs/get_indice_pairs_block.mlu delete mode 100644 kernels/get_indice_pairs/get_indice_pairs_structs.cpp delete mode 100644 kernels/get_indice_pairs/get_indice_pairs_structs.h delete mode 100644 kernels/get_indice_pairs/get_indice_pairs_utils.h delete mode 100644 kernels/get_indice_pairs/normal_get_indice_pairs.cpp delete mode 100644 kernels/get_indice_pairs/normal_get_indice_pairs.h delete mode 100644 kernels/indice_convolution_backward_data/indice_convolution_backward_data.cpp delete mode 100644 kernels/indice_convolution_backward_data/indice_convolution_backward_data.h delete mode 100644 kernels/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp delete mode 100644 kernels/indice_convolution_forward/indice_convolution_forward.cpp delete mode 100644 kernels/masked_col2im_forward/masked_col2im_forward.cpp delete mode 100644 kernels/masked_col2im_forward/masked_col2im_forward.h delete mode 100644 kernels/masked_col2im_forward/masked_col2im_forward_union1.mlu delete mode 100644 kernels/masked_im2col_forward/masked_im2col_forward.cpp delete mode 100644 kernels/masked_im2col_forward/masked_im2col_forward.h delete mode 100644 kernels/masked_im2col_forward/masked_im2col_forward_union1.mlu delete mode 100644 kernels/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp delete mode 100644 kernels/moe_dispatch_backward_data/moe_dispatch_backward_data.h delete mode 100644 kernels/moe_dispatch_backward_data/moe_dispatch_backward_data_union1.mlu delete mode 100644 kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp delete mode 
100644 kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate.h delete mode 100644 kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu delete mode 100644 kernels/moe_dispatch_forward/moe_dispatch_forward.cpp delete mode 100644 kernels/moe_dispatch_forward/moe_dispatch_forward.h delete mode 100644 kernels/moe_dispatch_forward/moe_dispatch_forward_block.mlu delete mode 100644 kernels/ms_deform_attn_backward/ms_deform_attn_backward.cpp delete mode 100644 kernels/ms_deform_attn_backward/ms_deform_attn_backward.h delete mode 100644 kernels/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu delete mode 100644 kernels/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu delete mode 100644 kernels/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu delete mode 100644 kernels/ms_deform_attn_forward/ms_deform_attn_forward.h delete mode 100644 kernels/ms_deform_attn_forward/ms_deform_attn_forward.mlu delete mode 100644 kernels/ms_deform_attn_forward/ms_deform_attn_utils.h delete mode 100644 kernels/ms_deform_attn_forward/msda_forward_fast_union1.mlu delete mode 100644 kernels/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu delete mode 100644 kernels/ms_deform_attn_forward/msda_forward_union1_default.mlu delete mode 100644 kernels/mutual_information_backward/mutual_information_backward.cpp delete mode 100644 kernels/mutual_information_backward/mutual_information_backward.h delete mode 100644 kernels/mutual_information_backward/mutual_information_backward_3pipeline_block.mlu delete mode 100644 kernels/mutual_information_backward/mutual_information_backward_default_block.mlu delete mode 100644 kernels/mutual_information_backward/mutual_information_backward_utils.h delete mode 100644 kernels/mutual_information_forward/mutual_information_forward.cpp delete mode 100644 kernels/mutual_information_forward/mutual_information_forward.h delete mode 100644 kernels/mutual_information_forward/mutual_information_forward_3pipeline_block.mlu delete mode 100644 kernels/mutual_information_forward/mutual_information_forward_default_block.mlu delete mode 100644 kernels/mutual_information_forward/mutual_information_forward_utils.h delete mode 100644 kernels/roi_align_backward/roi_align_backward.cpp delete mode 100644 kernels/roi_pooling/sync_batchnorm/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp delete mode 100644 kernels/roi_pooling_backward/roi_pooling_backward.cpp delete mode 100644 kernels/roi_pooling_forward/roi_pooling_forward.cpp delete mode 100644 kernels/roialign_forward/roialign_forward.cpp rename kernels/{roi_pooling/sync_batchnorm => sync_batch_norm}/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp (100%) rename kernels/{roi_pooling/sync_batchnorm => sync_batch_norm}/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp (100%) rename kernels/{ => sync_batch_norm}/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp (100%) rename kernels/{roi_pooling/sync_batchnorm => sync_batch_norm}/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp (100%) rename kernels/{roi_pooling/sync_batchnorm => sync_batch_norm}/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp (100%) rename kernels/{roi_pooling/sync_batchnorm => sync_batch_norm}/sync_batchnorm_stats/sync_batchnorm_stats.cpp (100%) delete mode 100644 kernels/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp delete mode 100644 
kernels/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp delete mode 100644 kernels/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp delete mode 100644 kernels/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp delete mode 100644 kernels/sync_batchnorm_stats/sync_batchnorm_stats.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index c105f802a..2755c5f2b 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -130,17 +130,30 @@ endif() list(SORT build_kernel) message(STATUS "build_kernel:[${build_kernel}]") - -foreach(kernel ${build_kernel}) - if (NOT IS_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/kernels/${kernel}") + +file(GLOB all_kernels "${CMAKE_CURRENT_LIST_DIR}/kernels/*") +foreach(kernel ${build_kernel}) + set(kernel_parent_dir "") + foreach (o ${all_kernels}) + if (IS_DIRECTORY ${o}) + get_filename_component(kernelname ${o} NAME) + if(IS_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/kernels/${kernel} OR + IS_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/kernels/${kernelname}/${kernel}) + set(kernel_parent_dir ${kernelname}) + endif() + endif() + endforeach () + if (IS_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/kernels/${kernel}") + file(GLOB_RECURSE src_files ${src_files} "${CMAKE_CURRENT_SOURCE_DIR}/kernels/${kernel}/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/${kernel}/*.mlu") + elseif(IS_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/kernels/${kernel_parent_dir}/${kernel}") + file(GLOB_RECURSE src_files ${src_files} "${CMAKE_CURRENT_SOURCE_DIR}/kernels/${kernel_parent_dir}/${kernel}/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/${kernel_parent_dir}/${kernel}/*.mlu") + else() message(WARNING "kernel/${kernel} is not a directory, ${kernel} is an alias") continue() endif() - file(GLOB_RECURSE src_files ${src_files} "${CMAKE_CURRENT_SOURCE_DIR}/kernels/${kernel}/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/${kernel}/*.mlu") - file(GLOB_RECURSE src_helper_files ${src_helper_files} "${CMAKE_CURRENT_SOURCE_DIR}/kernels/utils/cnnl_helper.cpp") - file(GLOB_RECURSE arch_binary_files ${arch_binary_files} "${CMAKE_CURRENT_SOURCE_DIR}/kernels/${kernel}/${MLUOP_TARGET_CPU_ARCH}/*.o") endforeach() - + +file(GLOB_RECURSE src_helper_files ${src_helper_files} "${CMAKE_CURRENT_SOURCE_DIR}/kernels/utils/cnnl_helper.cpp") file(GLOB_RECURSE core_src_files ${core_src_files} "${CMAKE_CURRENT_SOURCE_DIR}/core/*.cpp") # set(src_files ${src_files} "${CMAKE_CURRENT_SOURCE_DIR}/test/main.cpp") diff --git a/kernel_depends.toml b/kernel_depends.toml index 7dc5a0441..a325f0c76 100755 --- a/kernel_depends.toml +++ b/kernel_depends.toml @@ -41,5 +41,6 @@ deform_roi_pool_forward = ["deform_roi_pool"] deform_roi_pool_backward = ["deform_roi_pool"] carafe_forward = ["carafe"] carafe_backward = ["carafe"] -dcn_backward_weight = ["dcn_forward"] -dcn_backward_data = ["dcn_forward"] +dcn_backward_weight = ["dcn_common"] +dcn_backward_data = ["dcn_common"] +dcn_forward = ["dcn_common"] diff --git a/kernels/border_align_backward/border_align_backward.cpp b/kernels/border_align_backward/border_align_backward.cpp deleted file mode 100644 index 1dadcedd0..000000000 --- a/kernels/border_align_backward/border_align_backward.cpp +++ /dev/null @@ -1,161 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc.
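[Aside, not part of the patch: the CMakeLists.txt hunk above makes each build_kernel entry resolve to either a top-level kernels/<kernel> directory or a nested kernels/<parent>/<kernel> directory. A minimal C++ sketch of that two-level lookup follows; resolve_kernel_dir and kernels_root are invented names, and the real build performs this in CMake, not C++.]

#include <filesystem>
#include <optional>
#include <string>

namespace fs = std::filesystem;

// Mirrors the foreach loop in the hunk above: first try kernels/<kernel>,
// then kernels/<parent>/<kernel> for every parent directory found by GLOB.
std::optional<fs::path> resolve_kernel_dir(const fs::path &kernels_root,
                                           const std::string &kernel) {
  if (fs::is_directory(kernels_root / kernel)) {
    return kernels_root / kernel;  // the if(IS_DIRECTORY ...) branch
  }
  for (const auto &entry : fs::directory_iterator(kernels_root)) {
    if (entry.is_directory() && fs::is_directory(entry.path() / kernel)) {
      return entry.path() / kernel;  // the elseif(kernel_parent_dir) branch
    }
  }
  return std::nullopt;  // the else() branch: the name is only an alias
}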
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "border_align_backward.h" - -#include <string> - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" -#include "core/tool.h" -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" - -static void policyFunc(mluOpHandle_t handle, cnrtDim3_t *k_dim, - cnrtFunctionType_t *k_type) { - *k_type = CNRT_FUNC_TYPE_UNION1; - k_dim->x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - k_dim->y = mluop::runtime::getClusterLimitCapability(handle); - k_dim->z = 1; -} - -mluOpStatus_t mluOpBorderAlignBackward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t grad_output_desc, - const void *grad_output, const mluOpTensorDescriptor_t boxes_desc, - const void *boxes, const mluOpTensorDescriptor_t argmax_idx_desc, - const void *argmax_idx, const int32_t pool_size, - const mluOpTensorDescriptor_t grad_input_desc, void *grad_input) { - const std::string API = "[mluOpBorderAlignBackward]"; - // params check - PARAM_CHECK(API, handle != nullptr); - PARAM_CHECK(API, grad_output_desc != nullptr); - PARAM_CHECK(API, boxes_desc != nullptr); - PARAM_CHECK(API, argmax_idx_desc != nullptr); - PARAM_CHECK(API, grad_input_desc != nullptr); - - PARAM_CHECK(API, grad_output_desc->dim == 4); - PARAM_CHECK(API, boxes_desc->dim == 3); - PARAM_CHECK(API, argmax_idx_desc->dim == 4); - PARAM_CHECK(API, grad_input_desc->dim == 4); - - const int32_t border_num = 4; - const int32_t coord_num = 4; - const int32_t origin_n = grad_input_desc->dims[0]; - const int32_t origin_h = grad_input_desc->dims[1]; - const int32_t origin_w = grad_input_desc->dims[2]; - const int32_t origin_c = grad_input_desc->dims[3] / border_num; - const int32_t origin_k = boxes_desc->dims[1]; - - PARAM_CHECK(API, grad_output_desc->dtype == MLUOP_DTYPE_FLOAT || - grad_output_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK(API, argmax_idx_desc->dtype == MLUOP_DTYPE_INT32); - PARAM_CHECK(API, boxes_desc->dtype == grad_output_desc->dtype); - PARAM_CHECK(API, grad_input_desc->dtype == grad_output_desc->dtype); - - PARAM_CHECK(API, grad_output_desc->layout == MLUOP_LAYOUT_NHWC); - PARAM_CHECK(API, argmax_idx_desc->layout == MLUOP_LAYOUT_NHWC); - PARAM_CHECK(API, grad_input_desc->layout == MLUOP_LAYOUT_NHWC); - - PARAM_CHECK(API, grad_input_desc->dims[3] % border_num == 0); -
PARAM_CHECK_NE(API, origin_n, 0); - PARAM_CHECK_NE(API, origin_c, 0); - PARAM_CHECK_NE(API, origin_h, 0); - PARAM_CHECK_NE(API, origin_w, 0); - PARAM_CHECK(API, origin_h * origin_w == origin_k); - PARAM_CHECK(API, boxes_desc->dim == 3); - PARAM_CHECK(API, boxes_desc->dims[2] == coord_num); - PARAM_CHECK_NE(API, origin_k, 0); - PARAM_CHECK_GT(API, pool_size, 0); - - PARAM_CHECK_EQ(API, grad_output_desc->dims[0], origin_n); - PARAM_CHECK_EQ(API, grad_output_desc->dims[1], origin_k); - PARAM_CHECK_EQ(API, grad_output_desc->dims[2], border_num); - PARAM_CHECK_EQ(API, grad_output_desc->dims[3], origin_c); - - PARAM_CHECK_EQ(API, boxes_desc->dims[0], origin_n); - PARAM_CHECK_EQ(API, boxes_desc->dims[1], origin_k); - PARAM_CHECK_EQ(API, boxes_desc->dims[2], coord_num); - - PARAM_CHECK_EQ(API, argmax_idx_desc->dims[0], origin_n); - PARAM_CHECK_EQ(API, argmax_idx_desc->dims[1], origin_k); - PARAM_CHECK_EQ(API, argmax_idx_desc->dims[2], border_num); - PARAM_CHECK_EQ(API, argmax_idx_desc->dims[3], origin_c); - - TENSOR_NUM_CHECK(API, mluOpGetTensorElementNum(grad_output_desc), - LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(API, mluOpGetTensorElementNum(boxes_desc), LARGE_TENSOR_NUM, - ""); - TENSOR_NUM_CHECK(API, mluOpGetTensorElementNum(grad_input_desc), - LARGE_TENSOR_NUM, ""); - - PARAM_CHECK(API, grad_output != nullptr); - PARAM_CHECK(API, boxes != nullptr); - PARAM_CHECK(API, argmax_idx != nullptr); - PARAM_CHECK(API, grad_input != nullptr); - - // generate case prototxt - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("border_align_backward"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA(true, "input1", grad_output, grad_output_desc, 100, 0); - GEN_CASE_DATA_REAL(true, "input2", boxes, boxes_desc); - GEN_CASE_DATA_REAL(true, "input3", argmax_idx, argmax_idx_desc); - GEN_CASE_DATA(false, "output1", grad_input, grad_input_desc, 0, 0); - GEN_CASE_OP_PARAM_SINGLE(0, "border_align_backward", "pool_size", - pool_size); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); - } - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - policyFunc(handle, &k_dim, &k_type); - - VLOG(5) << "[mluOpBorderAlignBackward] cnnlFill_v3 start."; - uint64_t fill_value = 0x0; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_input_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, grad_input)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - VLOG(5) << "[mluOpBorderAlignBackward] cnnlFill_v3 end."; - mluOpDataType_t input_dtype = grad_output_desc->dtype; - - VLOG(5) << "Launch Kernel KernelBorderAlignBackward<<>>"; - CHECK_RETURN( - API, KernelBorderAlignBackward( - k_dim, k_type, handle->queue, input_dtype, (void *)grad_output, - (void *)boxes, (int32_t *)argmax_idx, pool_size, origin_n, - origin_h, origin_w, origin_c, origin_k, (void *)grad_input)); - - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/border_align_backward/border_align_backward.h b/kernels/border_align_backward/border_align_backward.h deleted file mode 100644 index 316e20f82..000000000 --- a/kernels/border_align_backward/border_align_backward.h +++ /dev/null @@ -1,34 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
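[Aside, not part of the patch: the parameter checks in mluOpBorderAlignBackward above pin down one consistent shape family. A self-contained illustration with invented dimension values:]

#include <cassert>

int main() {
  const int border_num = 4;                      // [Top, Left, Bottom, Right]
  const int grad_input_dims[4] = {2, 4, 4, 32};  // NHWC; C packs all 4 borders
  const int boxes_dims[3] = {2, 16, 4};          // [N, K, coord_num]
  const int origin_c = grad_input_dims[3] / border_num;
  assert(grad_input_dims[3] % border_num == 0);
  assert(origin_c == 8);  // 8 channels per border
  // origin_h * origin_w == origin_k: one box per input pixel.
  assert(grad_input_dims[1] * grad_input_dims[2] == boxes_dims[1]);
  // grad_output and argmax_idx must then both be [N, K, border_num, origin_c],
  // i.e. [2, 16, 4, 8] for this example.
  return 0;
}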
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#ifndef KERNELS_BORDER_ALIGN_BACKWARD_BORDER_ALIGN_BACKWARD_H_ -#define KERNELS_BORDER_ALIGN_BACKWARD_BORDER_ALIGN_BACKWARD_H_ - -#include "mlu_op.h" - -mluOpStatus_t MLUOP_WIN_API KernelBorderAlignBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t d_type, const void *grad_output, const void *boxes, - const int32_t *argmax_idx, const int32_t pool_size, const int32_t origin_n, - const int32_t origin_h, const int32_t origin_w, const int32_t origin_c, - const int32_t origin_k, void *grad_input); -#endif // KERNELS_BORDER_ALIGN_BACKWARD_BORDER_ALIGN_BACKWARD_H_ diff --git a/kernels/border_align_backward/border_align_backward_union1.mlu b/kernels/border_align_backward/border_align_backward_union1.mlu deleted file mode 100644 index 61c23ad01..000000000 --- a/kernels/border_align_backward/border_align_backward_union1.mlu +++ /dev/null @@ -1,311 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "border_align_backward.h" - -#include "core/logging.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -#define BORDER_NUM 4 -#define CALCULATE_GRAD_INPUT(w, x, y) \ - const int32_t offset_##w = n * origin_h * origin_w * origin_c * BORDER_NUM + \ - y * origin_w * origin_c * BORDER_NUM + \ - x * origin_c * BORDER_NUM + border * origin_c + \ - c; \ - __bang_mul_scalar(nram_grad_input, nram_grad_output, w, deal_num_align); \ - __bang_band((char *)nram_grad_input, (char *)nram_grad_input, (char *)mask, \ - sizeof(T) * deal_num_align); \ - __bang_atomic_reduce_add(grad_input + offset_##w, nram_grad_input, deal_num); - -template <typename T> -__mlu_func__ void computeGradInput( - T *nram_grad_input, T *nram_grad_output, T *grad_input, T *mask, const T w1, - const T w2, const T w3, const T w4, const int32_t x_low, - const int32_t y_low, const int32_t x_high, const int32_t y_high, - const int32_t origin_c, const int32_t c, const int32_t origin_w, - const int32_t n, const int32_t origin_h, const int32_t border, - const int32_t deal_num, const int32_t deal_num_align) { - /* bilinear-interpolation: - * v1 = input_HW[y_low, x_low] - * v2 = input_HW[y_low, x_high] - * v3 = input_HW[y_high, x_low] - * v4 = input_HW[y_high, x_high] - * - * forward: - * output_value = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4 - * backward: - * v1.atomicAdd(grad_output_value * w1) - * ... - * v4.atomicAdd(grad_output_value * w4) - */ - CALCULATE_GRAD_INPUT(w1, x_low, y_low); - CALCULATE_GRAD_INPUT(w2, x_high, y_low); - CALCULATE_GRAD_INPUT(w3, x_low, y_high); - CALCULATE_GRAD_INPUT(w4, x_high, y_high); -} - -template <typename T> -__mlu_func__ void bilinearInterpolate(const int32_t input_height, - const int32_t input_width, T y, T x, - T *w1, T *w2, T *w3, T *w4, - int32_t *x_low, int32_t *x_high, - int32_t *y_low, int32_t *y_high, - bool *empty) { - // deal with case that the point is out of feature map boundary - if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) { - *empty = true; - *w1 = *w2 = *w3 = *w4 = 0; - *x_low = *x_high = *y_low = *y_high = -1; - return; - } - *empty = false; - if (y <= 0) y = (T)0; - if (x <= 0) x = (T)0; - - *y_low = int32_t(y); - *x_low = int32_t(x); - - if (*y_low >= input_height - 1) { - *y_high = *y_low = input_height - 1; - y = (T)(*y_low); - } else { - *y_high = *y_low + 1; - } - - if (*x_low >= input_width - 1) { - *x_high = *x_low = input_width - 1; - x = T(*x_low); - } else { - *x_high = *x_low + 1; - } - T ly = y - *y_low; - T lx = x - *x_low; - T hy = 1.0 - ly; - T hx = 1.0 - lx; - *w1 = hy * hx; - *w2 = hy * lx; - *w3 = ly * hx; - *w4 = ly * lx; -} - -template <typename T> -__mlu_func__ void computeImpl(T *nram_grad_output, const T *grad_output, - int32_t *nram_argmax_idx, - const int32_t *argmax_idx, T *grad_input, - T *nram_grad_input, const T *nram_boxes, - const int32_t n, const int32_t c, const int32_t k, - const int32_t border, const int32_t origin_k, - const int32_t origin_n, const int32_t origin_c, - const int32_t origin_h, const int32_t origin_w, - const int32_t pool_size, const int32_t deal_num, - const int32_t deal_num_align) { - // argmax_idx, grad_output offset num - const int32_t src_offset = n * origin_k * origin_c * BORDER_NUM + - k * origin_c * BORDER_NUM + border * origin_c + c; - - // bilinear_interpolate params - int32_t x_low = 0, x_high = 0; - int32_t y_low = 0, y_high = 0; - bool empty = false; - T w1 = 0, w2 =
0, w3 = 0, w4 = 0; - - const T x_start = *(nram_boxes + border / 2 * 2); - const T y_start = *(nram_boxes + 1 + border / 2 * 2); - const T box_width = *((T *)nram_boxes + 2) - *(T *)nram_boxes; - const T box_height = *((T *)nram_boxes + 3) - *((T *)nram_boxes + 1); - T x_stride = 0; - T y_stride = 0; - switch (border) { - case 0: { // Top - x_stride = box_width / pool_size; - y_stride = 0; - } break; - case 1: { // Left - x_stride = 0; - y_stride = box_height / pool_size; - } break; - case 2: { // Bottom - x_stride = -box_width / pool_size; - y_stride = 0; - } break; - case 3: { // Right - x_stride = 0; - y_stride = -box_height / pool_size; - } break; - } - - // layer 2: loop over range[0, pool_size] - for (int32_t i = 0; i < pool_size + 1; ++i) { - const T x = x_start + x_stride * i; - const T y = y_start + y_stride * i; - bilinearInterpolate(origin_h, origin_w, y, x, &w1, &w2, &w3, &w4, &x_low, - &x_high, &y_low, &y_high, &empty); - if (!empty) { - // load argmax, - __memcpy(nram_argmax_idx, argmax_idx + src_offset, - deal_num * sizeof(int32_t), GDRAM2NRAM); // NOLINT - - /* Create mask, mask.shape([1, deal_num]) is the same as argmax_idx - * mask[1, j] = (T)1 if (argmax_idx[1, j] == pool_idx) - * = (T)0 otherwise - */ - __bang_write_value(nram_grad_output, deal_num_align, int32_t(i)); - __bang_eq(nram_argmax_idx, nram_argmax_idx, (int32_t *)nram_grad_output, - deal_num_align); // NOLINT - if (__mluop_is_float<T>()) { - __nram__ int32_t table[COMPUTE_COUNT_ALIGN] = {0, (int32_t)0xffffffff}; - __bang_lut_s32((int32_t *)nram_argmax_idx, (int32_t *)nram_argmax_idx, - table, deal_num_align, COMPUTE_COUNT_ALIGN); // NOLINT - } else { - __nram__ int16_t table[COMPUTE_COUNT_ALIGN] = {0, (int16_t)0xffff}; - __bang_int322int16((int16_t *)nram_argmax_idx, - (int32_t *)nram_argmax_idx, deal_num_align, 0, - 0); // NOLINT - __bang_lut_s16((int16_t *)nram_argmax_idx, (int16_t *)nram_argmax_idx, - table, deal_num_align, COMPUTE_COUNT_ALIGN); // NOLINT - } - - // load grad_output, and calculate grad_input - __memcpy(nram_grad_output, grad_output + src_offset, deal_num * sizeof(T), - GDRAM2NRAM); // NOLINT - computeGradInput(nram_grad_input, nram_grad_output, grad_input, - (T *)nram_argmax_idx, w1, w2, w3, w4, x_low, y_low, - x_high, y_high, origin_c, c, origin_w, n, origin_h, - border, deal_num, deal_num_align); - } - } -} - -template <typename T> -__mlu_global__ void MLUKernelBorderAlignBackward( - const T *grad_output, const T *boxes, const int32_t *argmax_idx, - const int32_t pool_size, const int32_t origin_n, const int32_t origin_h, - const int32_t origin_w, const int32_t origin_c, const int32_t origin_k, - T *grad_input) { - // unused MPU - if (__is_mpu()) { - return; - } - - /* - * NRAM partition - * |=============|=======================| - * | Semantics | Size | - * |=============|=======================| - * | grad_output | deal_num * sizeof(T) | - * |-------------|-----------------------| - * | grad_input | deal_num * sizeof(T) | - * |-------------|-----------------------| - * | argmax_idx | deal_num * sizeof(int)| - * |-------------|-----------------------| - * | boxes | 128byte | - * |-------------|-----------------------| - */ - const int32_t deal_num = PAD_DOWN( - (MAX_NRAM_SIZE - NFU_ALIGN_SIZE) / (2 * sizeof(T) + 1 * sizeof(int32_t)), - NFU_ALIGN_SIZE); - T *nram_boxes = (T *)nram_buffer; - T *nram_grad_output = (T *)((char *)nram_buffer + NFU_ALIGN_SIZE); - T *nram_grad_input = (T *)nram_grad_output + deal_num; - int32_t *nram_argmax_idx = (int32_t *)((T *)nram_grad_input + deal_num); - - /* - *
grad_output.shape = [origin_n, origin_k, border_num, origin_c] - * boxes.shape = [origin_n, origin_k, coord_num] - * argmax_idx.shape = [origin_n, origin_k, border_num, origin_c] - * coord_num = 4; - * border_num = 4; [0:Top, 1:Left, 2:Bottom, 3:Right] - * - * Partition output: - * Split the num of boxes(origin_n * origin_k * border_num) among taskDim, - * Multiple cores load different parts of the output - * in each loop. - * - * Calculation process: - * layer 0: 0 ~ origin_n * origin_k * border_num - * layer 1: 0 ~ origin_c - * layer 2: 0 ~ pool_size - */ - const int32_t coord_num = 4; - const int32_t total_num = origin_n * origin_k * BORDER_NUM; - const int32_t num_per_core = - total_num / taskDim + int32_t((total_num % taskDim) > taskId); - - // layer 0: loop over range[0, origin_n * origin_k * border_num) - for (int32_t i = 0; i < num_per_core; ++i) { - const int32_t idx = taskId + i * taskDim; - const int32_t n = idx / origin_k / BORDER_NUM; - const int32_t k = idx / BORDER_NUM % origin_k; - const int32_t border_idx = idx % BORDER_NUM; - - /* load boxes: - * boxes[n,k,0:4] indicates the information on the bottom left - * and top right points: [lb_x, lb_y, rt_x, rt_y] - */ - __memcpy(nram_boxes, (T *)boxes + n * origin_k * coord_num + k * coord_num, - coord_num * sizeof(T), GDRAM2NRAM); - - // layer 1: loop over range[0, origin_c) - const int32_t c_repeat = origin_c / deal_num; - const int32_t c_rem = origin_c % deal_num; - for (int32_t c_seg_idx = 0; c_seg_idx < c_repeat; ++c_seg_idx) { - computeImpl((T *)nram_grad_output, (T *)grad_output, - (int32_t *)nram_argmax_idx, (int32_t *)argmax_idx, - (T *)grad_input, (T *)nram_grad_input, nram_boxes, n, - c_seg_idx * deal_num, k, border_idx, origin_k, origin_n, - origin_c, origin_h, origin_w, pool_size, deal_num, deal_num); - } - if (c_rem != 0) { - const int32_t c_rem_align = PAD_UP(c_rem, NFU_ALIGN_SIZE); - computeImpl((T *)nram_grad_output, (T *)grad_output, - (int32_t *)nram_argmax_idx, (int32_t *)argmax_idx, - (T *)grad_input, (T *)nram_grad_input, nram_boxes, n, - origin_c - c_rem, k, border_idx, origin_k, origin_n, origin_c, - origin_h, origin_w, pool_size, c_rem, c_rem_align); - } - } -} - -mluOpStatus_t MLUOP_WIN_API KernelBorderAlignBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t data_type, const void *grad_output, const void *boxes, - const int32_t *argmax_idx, const int32_t pool_size, const int32_t origin_n, - const int32_t origin_h, const int32_t origin_w, const int32_t origin_c, - const int32_t origin_k, void *grad_input) { - // launch kernel - if (data_type == mluOpDataType_t::MLUOP_DTYPE_FLOAT) { - KERNEL_CHECK(MLUKernelBorderAlignBackward<<<k_dim, k_type, queue>>>( - (float *)grad_output, (float *)boxes, (int32_t *)argmax_idx, pool_size, - origin_n, origin_h, origin_w, origin_c, origin_k, (float *)grad_input)); - - } else { - // half - KERNEL_CHECK(MLUKernelBorderAlignBackward<<<k_dim, k_type, queue>>>( - (half *)grad_output, (half *)boxes, (int32_t *)argmax_idx, pool_size, - origin_n, origin_h, origin_w, origin_c, origin_k, (half *)grad_input)); - } - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/border_align_forward/border_align_forward.cpp b/kernels/border_align_forward/border_align_forward.cpp deleted file mode 100644 index 86af97dca..000000000 --- a/kernels/border_align_forward/border_align_forward.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/******************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc.
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *******************************************************************************/ -#include "border_align_forward.h" - -#include <string> - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" - -// policyFunc -static void policyFunc(mluOpHandle_t handle, cnrtDim3_t *k_dim, - cnrtFunctionType_t *k_type) { - *k_type = CNRT_FUNC_TYPE_UNION1; - k_dim->x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - k_dim->y = mluop::runtime::getClusterLimitCapability(handle); - k_dim->z = 1; -} - -mluOpStatus_t mluOpBorderAlignForward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, - const void *input, const mluOpTensorDescriptor_t boxes_desc, - const void *boxes, const int32_t pool_size, - const mluOpTensorDescriptor_t output_desc, void *output, - const mluOpTensorDescriptor_t argmax_idx_desc, void *argmax_idx) { - const std::string API = "[mluOpBorderAlignForward]"; - PARAM_CHECK(API, handle != nullptr); - PARAM_CHECK(API, input_desc != nullptr); - PARAM_CHECK(API, boxes_desc != nullptr); - PARAM_CHECK(API, output_desc != nullptr); - PARAM_CHECK(API, argmax_idx_desc != nullptr); - - PARAM_CHECK(API, input_desc->dim == 4); - PARAM_CHECK(API, boxes_desc->dim == 3); - PARAM_CHECK(API, output_desc->dim == 4); - PARAM_CHECK(API, argmax_idx_desc->dim == 4); - - const int32_t border_num = 4; - const int32_t coord_num = 4; - const int32_t origin_n = input_desc->dims[0]; - const int32_t origin_h = input_desc->dims[1]; - const int32_t origin_w = input_desc->dims[2]; - const int32_t origin_c = input_desc->dims[3] / border_num; - const int32_t origin_k = boxes_desc->dims[1]; - - PARAM_CHECK(API, input_desc->dtype == boxes_desc->dtype); - PARAM_CHECK(API, input_desc->dtype == MLUOP_DTYPE_FLOAT || - input_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK(API, boxes_desc->dtype == MLUOP_DTYPE_FLOAT || - boxes_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK(API, output_desc->dtype == input_desc->dtype); - PARAM_CHECK(API, argmax_idx_desc->dtype == MLUOP_DTYPE_INT32); - - PARAM_CHECK(API, input_desc->layout == MLUOP_LAYOUT_NHWC); - PARAM_CHECK(API, output_desc->layout == MLUOP_LAYOUT_NHWC); - PARAM_CHECK(API, argmax_idx_desc->layout == MLUOP_LAYOUT_NHWC); - - PARAM_CHECK(API, input_desc->dims[3] % border_num == 0); - PARAM_CHECK_NE(API, origin_n, 0); - PARAM_CHECK_NE(API, origin_c, 0); - PARAM_CHECK_NE(API,
origin_h, 0); - PARAM_CHECK_NE(API, origin_w, 0); - PARAM_CHECK_NE(API, origin_k, 0); - PARAM_CHECK(API, boxes_desc->dim == 3); - PARAM_CHECK(API, boxes_desc->dims[2] == coord_num); - - PARAM_CHECK(API, origin_n == boxes_desc->dims[0]); - PARAM_CHECK(API, origin_h * origin_w == origin_k); - PARAM_CHECK_EQ(API, output_desc->dims[0], origin_n); - PARAM_CHECK_EQ(API, output_desc->dims[1], origin_k); - PARAM_CHECK_EQ(API, output_desc->dims[2], border_num); - PARAM_CHECK_EQ(API, output_desc->dims[3], origin_c); - PARAM_CHECK_EQ(API, argmax_idx_desc->dims[0], origin_n); - PARAM_CHECK_EQ(API, argmax_idx_desc->dims[1], origin_k); - PARAM_CHECK_EQ(API, argmax_idx_desc->dims[2], border_num); - PARAM_CHECK_EQ(API, argmax_idx_desc->dims[3], origin_c); - - const size_t input_num = mluOpGetTensorElementNum(input_desc); - const size_t boxes_num = mluOpGetTensorElementNum(boxes_desc); - const size_t output_num = mluOpGetTensorElementNum(output_desc); - TENSOR_NUM_CHECK(API, input_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(API, boxes_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(API, output_num, LARGE_TENSOR_NUM, ""); - - PARAM_CHECK(API, input != nullptr); - PARAM_CHECK(API, boxes != nullptr); - PARAM_CHECK(API, output != nullptr); - PARAM_CHECK(API, argmax_idx != nullptr); - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("border_align_forward"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA(true, "input1", input, input_desc, 100, 0); - GEN_CASE_DATA_REAL(true, "input2", boxes, boxes_desc); - GEN_CASE_DATA(false, "output1", output, output_desc, 0, 0); - GEN_CASE_DATA(false, "output2", argmax_idx, argmax_idx_desc, 0, 0); - GEN_CASE_OP_PARAM_SINGLE(0, "border_align_forward", "pool_size", pool_size); - GEN_CASE_TEST_PARAM_NEW(false, false, true, 0.003, 0, 0); - } - - cnrtFunctionType_t k_type; - cnrtDim3_t k_dim; - policyFunc(handle, &k_dim, &k_type); - - VLOG(5) << "Launch Kernel KernelBorderAlignForward<<>>"; - CHECK_RETURN(API, KernelBorderAlignForward( - k_dim, k_type, handle->queue, input_desc->dtype, input, - boxes, pool_size, origin_n, origin_h, origin_w, - origin_c, origin_k, output, (int32_t *)argmax_idx)); - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/border_align_forward/border_align_forward.h b/kernels/border_align_forward/border_align_forward.h deleted file mode 100644 index a7e146dd3..000000000 --- a/kernels/border_align_forward/border_align_forward.h +++ /dev/null @@ -1,37 +0,0 @@ -/******************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *******************************************************************************/ -#ifndef KERNELS_BORDER_ALIGN_FORWARD_BORDER_ALIGN_FORWARD_H_ -#define KERNELS_BORDER_ALIGN_FORWARD_BORDER_ALIGN_FORWARD_H_ - -#include "mlu_op.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" - -mluOpStatus_t MLUOP_WIN_API KernelBorderAlignForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t d_type, const void *input, const void *boxes, - const int32_t pool_size, const int32_t origin_n, const int32_t origin_h, - const int32_t origin_w, const int32_t origin_c, const int32_t origin_k, - void *output, int32_t *argmax_idx_nram); - -#endif // KERNELS_BORDER_ALIGN_FORWARD_BORDER_ALIGN_FORWARD_H_ diff --git a/kernels/border_align_forward/border_align_forward_union1.mlu b/kernels/border_align_forward/border_align_forward_union1.mlu deleted file mode 100644 index b18449b6f..000000000 --- a/kernels/border_align_forward/border_align_forward_union1.mlu +++ /dev/null @@ -1,413 +0,0 @@ -/******************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
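[Aside, not part of the patch: the bilinearInterpolate device routine deleted above clamps the sample point and produces four corner weights; a scalar host-side rendering may be easier to follow. bilinear_sample and hw are invented names; hw points at one H x W channel plane.]

// Scalar reference for the vectorized sampling above: returns
// w1*v1 + w2*v2 + w3*v3 + w4*v4 for the four neighbors of (y, x),
// or 0 when the point falls outside the feature map (the "empty" case).
float bilinear_sample(const float *hw, int height, int width, float y, float x) {
  if (y < -1.0f || y > height || x < -1.0f || x > width) return 0.0f;
  if (y <= 0.0f) y = 0.0f;
  if (x <= 0.0f) x = 0.0f;
  int y_low = static_cast<int>(y), x_low = static_cast<int>(x), y_high, x_high;
  if (y_low >= height - 1) { y_high = y_low = height - 1; y = (float)y_low; }
  else { y_high = y_low + 1; }
  if (x_low >= width - 1) { x_high = x_low = width - 1; x = (float)x_low; }
  else { x_high = x_low + 1; }
  const float ly = y - y_low, lx = x - x_low, hy = 1.0f - ly, hx = 1.0f - lx;
  const float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
  return w1 * hw[y_low * width + x_low] + w2 * hw[y_low * width + x_high] +
         w3 * hw[y_high * width + x_low] + w4 * hw[y_high * width + x_high];
}

In the backward kernel above, the same four weights scatter rather than gather: each corner receives an atomic add of grad_output * w_i, masked by whether argmax_idx recorded the current pool step.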
- *******************************************************************************/ -#include "border_align_forward.h" - -#include "core/logging.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -#define BORDER_NUM 4 - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -template <typename T> -__mlu_func__ void bilinearInterpolate(const int32_t input_height, - const int32_t input_width, T x, T y, - T *w1, T *w2, T *w3, T *w4, - int32_t *x_low, int32_t *x_high, - int32_t *y_low, int32_t *y_high, - bool *empty) { - // deal with case that the point is out of feature map boundary - // https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp#L29 - if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) { - *empty = true; - return; - } - *empty = false; - if (y <= 0) y = (T)0; - if (x <= 0) x = (T)0; - - *y_low = int32_t(y); - *x_low = int32_t(x); - - if (*y_low >= input_height - 1) { - *y_high = *y_low = input_height - 1; - y = (T)(*y_low); - } else { - *y_high = *y_low + 1; - } - - if (*x_low >= input_width - 1) { - *x_high = *x_low = input_width - 1; - x = T(*x_low); - } else { - *x_high = *x_low + 1; - } - T ly = y - *y_low; - T lx = x - *x_low; - T hy = 1. - ly; - T hx = 1. - lx; - *w1 = hy * hx; - *w2 = hy * lx; - *w3 = ly * hx; - *w4 = ly * lx; -} - -template <typename T> -__mlu_func__ void getBilinearInterpolateResult(T *input_ping_nram, const T &w1, - const T &w2, const T &w3, - const T &w4, - const int32_t &deal_num) { - /* do bilinear interpolation: - * value = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4 - * s.t. v1 = HW[y_low, x_low] - * v2 = HW[y_low, x_high] - * v3 = HW[y_high, x_low] - * v4 = HW[y_high, x_high] - */ - T *v1 = input_ping_nram; - T *v2 = input_ping_nram + 1 * deal_num; - T *v3 = input_ping_nram + 2 * deal_num; - T *v4 = input_ping_nram + 3 * deal_num; - - __bang_mul_scalar(v1, v1, w1, deal_num); - __bang_fusion(FUSION_FMA, v2, v2, w2, v1, deal_num, deal_num); - __bang_fusion(FUSION_FMA, v3, v3, w3, v2, deal_num, deal_num); - __bang_fusion(FUSION_FMA, v1, v4, w4, v3, deal_num, deal_num); -} - -template <typename T> -__mlu_func__ void computeMaxPoolAndArgmaxIdx(int32_t *argmax_idx_nram, - T *output_nram, T *input_ping_nram, - const int32_t &pool_idx, - const int32_t &deal_num) { - if (pool_idx == 0) { - __bang_move(output_nram, input_ping_nram, deal_num * sizeof(T)); - return; - } - int32_t *temp = (int32_t *)input_ping_nram; - int32_t *temp1 = temp + deal_num; - __bang_lt((T *)temp1, output_nram, input_ping_nram, deal_num); - - // 1. output = max(value, output) - __bang_maxequal(output_nram, output_nram, input_ping_nram, deal_num); - - // 2.
update argmax_idx - // 2.1 argmax_idx *= (output >= value) - // 2.2 argmax_idx += pool_idx * (output < value) - if (__mluop_is_float<T>()) { - __bang_float2int32_rd(temp, (float *)temp1, deal_num, 0); - } else { - __bang_half2int32_rd(temp, (half *)temp1, deal_num, 0); - } - - __bang_not(temp1, temp, deal_num); // 2.1 - __bang_mul(argmax_idx_nram, argmax_idx_nram, temp1, deal_num); - __bang_mul_scalar(temp, temp, pool_idx, deal_num); // 2.2 - __bang_add(argmax_idx_nram, argmax_idx_nram, temp, deal_num); -} - -template <typename T> -__mlu_func__ void pipeline(T *input_ping_nram, const T *input, T *boxes_nram, - int32_t *argmax_idx_nram, T *base_output, - int32_t *base_argmax_idx, T *output_nram, - const int32_t n, const int32_t c_offset, - const int32_t origin_k, const int32_t origin_h, - const int32_t origin_w, const int32_t origin_c, - const int32_t pool_size, T x, T y, const T x_stride, - const T y_stride, const int32_t border, - const int32_t pingpong_gap, const int32_t deal_num) { - // init params of bilinear-interpolate - int32_t x_low = 0, x_high = 0; - int32_t y_low = 0, y_high = 0; - T w1 = 0, w2 = 0, w3 = 0, w4 = 0; - bool empty = false; - bilinearInterpolate(origin_h, origin_w, x, y, &w1, &w2, &w3, &w4, &x_low, - &x_high, &y_low, &y_high, &empty); - - /* - * Pipeline: - * The pipeline is processed in three stages: Load, Compute, - * Store. The allocated memory space of NRAM is divided into - * two parts: PING and PONG. In one time step, PING and PONG - * work on different streams built into the chip. For example, while - * PING is loading data from GDRAM, PONG is computing data - * from the last time step, or the other way around. Both of them - * are processed synchronously until finished. - * - * diagram of PINGPONG: - * |------|-----------------------------------------------------| - * | | space | - * |------|-----------------------------------------------------| - * | time | Ping | Pong | Ping | ... | Pong | - * |------|-----------------------------------------------------| - * | 0 | L0 | | | | | - * | 1 | C0 | L1 | | | | - * | 2 | | C1 | L2 | | | - * | 3 | | | C2 | ... | | - * | . | | | | ... | | - * | . | | | | ... | L_end | - * | . | | | | | C_end | - * | . 
| | | | | S | - * |------|-----------------------------------------------------| - */ - -#define LOAD_INPUT(dst, src, h, w, idx) \ - const int32_t src_offset_##idx = \ - ((n * origin_h + h) * origin_w + w) * BORDER_NUM * origin_c + \ - border * origin_c + c_offset; \ - __memcpy_async(dst + idx * deal_num_align, src + src_offset_##idx, \ - deal_num * sizeof(T), GDRAM2NRAM); - - // L0 - const int32_t deal_num_align = PAD_UP(deal_num, NFU_ALIGN_SIZE); - __bang_write_value(argmax_idx_nram, deal_num_align, (int32_t)0); - if (!empty) { - LOAD_INPUT((T *)input_ping_nram, (T *)input, y_low, x_low, 0); - LOAD_INPUT((T *)input_ping_nram, (T *)input, y_low, x_high, 1); - LOAD_INPUT((T *)input_ping_nram, (T *)input, y_high, x_low, 2); - LOAD_INPUT((T *)input_ping_nram, (T *)input, y_high, x_high, 3); - } else { - __memset_nram(input_ping_nram, pingpong_gap, (T)0); - } - __sync(); - - T w1_previous = w1; - T w2_previous = w2; - T w3_previous = w3; - T w4_previous = w4; - bool empty_previous = empty; - - x += x_stride; - y += y_stride; - bilinearInterpolate(origin_h, origin_w, x, y, &w1, &w2, &w3, &w4, &x_low, - &x_high, &y_low, &y_high, &empty); - - // layer 3: loop over range[0, pool_size) - for (int32_t i = 0; i < pool_size; ++i) { - /**** Load ****/ - T *input_nram_load = input_ping_nram + int32_t((i + 1) % 2) * pingpong_gap; - if (!empty) { - LOAD_INPUT((T *)input_nram_load, (T *)input, y_low, x_low, 0); - LOAD_INPUT((T *)input_nram_load, (T *)input, y_low, x_high, 1); - LOAD_INPUT((T *)input_nram_load, (T *)input, y_high, x_low, 2); - LOAD_INPUT((T *)input_nram_load, (T *)input, y_high, x_high, 3); - } - - /**** Compute ****/ - T *input_nram_compute = input_ping_nram + int32_t(i % 2) * pingpong_gap; - if (!empty_previous) { - // value = 0 if the point is outside of the box - // = sum(w[j] * v[j]), j=1,2,3,4 otherwise - getBilinearInterpolateResult(input_nram_compute, w1_previous, w2_previous, - w3_previous, w4_previous, deal_num_align); - } else { - __bang_write_value(input_nram_compute, deal_num_align, (T)0); - } - computeMaxPoolAndArgmaxIdx(argmax_idx_nram, output_nram, input_nram_compute, - i, deal_num_align); - { - // update x,y and store previous-value - w1_previous = w1; - w2_previous = w2; - w3_previous = w3; - w4_previous = w4; - empty_previous = empty; - - x += x_stride; - y += y_stride; - bilinearInterpolate(origin_h, origin_w, x, y, &w1, &w2, &w3, &w4, &x_low, - &x_high, &y_low, &y_high, &empty); - } - __sync(); - } - - // C_end - if (!empty_previous) { - getBilinearInterpolateResult( - input_ping_nram + int32_t((pool_size) % 2) * pingpong_gap, w1_previous, - w2_previous, w3_previous, w4_previous, deal_num_align); - } else { - __bang_write_value(input_ping_nram + int32_t(pool_size % 2) * pingpong_gap, - deal_num_align, (T)0); - } - computeMaxPoolAndArgmaxIdx( - argmax_idx_nram, output_nram, - input_ping_nram + int32_t(pool_size % 2) * pingpong_gap, pool_size, - deal_num_align); - - // S - __memcpy(base_output + c_offset, output_nram, deal_num * sizeof(T), - NRAM2GDRAM); // NOLINT - __memcpy(base_argmax_idx + c_offset, argmax_idx_nram, - deal_num * sizeof(int32_t), NRAM2GDRAM); // NOLINT -} - -template <typename T> -__mlu_global__ void MLUKernelBorderAlignForward( - const T *input, const T *boxes, const int32_t pool_size, - const int32_t origin_n, const int32_t origin_h, const int32_t origin_w, - const int32_t origin_c, const int32_t origin_k, T *output, - int32_t *argmax_idx) { - // unused MPU - if (__is_mpu()) { - return; - } - - /* - * NRAM partition - *
|--------------------------------------------------------| - * | Semantics | NRAM | - * |------------|-------------------------------------------| - * | PING | input_lt | input_lb | input_rt | input_rb | - * |------------|----------|----------|----------|----------| - * | PONG | input_lt | input_lb | input_rt | input_rb | - * |------------|----------|----------|----------|----------| - * | Other | output |argmax_idx| boxes | - * |---------------------------------------------| - * - * MAX_NRAM_SIZE = - * PING {4 * deal_num * sizeof(T)} + - * PONG {4 * deal_num * sizeof(T)} + - * Other{ deal_num * sizeof(T) + - * deal_num * sizeof(int32_t) + 128byte} - */ - const int32_t pingpong_split_num = 4 + 4; - const int32_t deal_num = - PAD_DOWN(((MAX_NRAM_SIZE - NFU_ALIGN_SIZE) / - (pingpong_split_num * sizeof(T) + sizeof(T) + sizeof(int32_t))), - NFU_ALIGN_SIZE); - const int32_t pingpong_gap = 4 * deal_num; - - T *input_ping_nram = (T *)nram_buffer; - T *output_nram = input_ping_nram + pingpong_split_num * deal_num; - T *boxes_nram = output_nram + deal_num; - int32_t *argmax_idx_nram = (int32_t *)((char *)boxes_nram + NFU_ALIGN_SIZE); - - /* - * input.shape = [origin_n, origin_h, origin_w, border_num * origin_c] - * boxes.shape = [origin_n, origin_k, coord_num] - * output.shape = [origin_n, origin_k, border_num, origin_c] - * argmax_idx.shape = [origin_n, origin_k, border_num, origin_c] - * coord_num = 4; - * border_num = 4; - * - * Partition output: - * Split the num of boxes(origin_n * origin_k) among taskDim, multiple - * cores load different parts of the output in each loop. - * - * Calculation process: - * |—— layer 0: 0 ~ origin_n * origin_k - * |————— layer 1: 0 ~ border_num - * |———————— layer 2: 0 ~ origin_c - * |——————————— layer 3: 0 ~ pool_size - */ - const int32_t coord_num = 4; - const int32_t boxes_num = origin_n * origin_k; - const int32_t boxes_num_per_core = - boxes_num / taskDim + int32_t((boxes_num % taskDim) > taskId); - - // layer 0: loop over range[0, boxes_num_per_core) - for (int32_t i = 0; i < boxes_num_per_core; ++i) { - /* load boxes: - * boxes[n,k,0:4] indicates the information on the bottom left - * and top right points: [lb_x, lb_y, rt_x, rt_y] - */ - const int32_t nk_offset = taskId + i * taskDim; - __memcpy(boxes_nram, (T *)boxes + nk_offset * coord_num, - coord_num * sizeof(T), GDRAM2NRAM); - const T box_width = boxes_nram[2] - boxes_nram[0]; - const T box_height = boxes_nram[3] - boxes_nram[1]; - T x_stride = 0; - T y_stride = 0; - - // layer 1: loop over [0:Top, 1:Left, 2:Bottom, 3:Right] - for (int32_t border = 0; border < BORDER_NUM; ++border) { - switch (border) { - case 0: { // Top - x_stride = box_width / pool_size; - y_stride = 0; - } break; - case 1: { // Left - x_stride = 0; - y_stride = box_height / pool_size; - } break; - case 2: { // Bottom - x_stride = -box_width / pool_size; - y_stride = 0; - } break; - case 3: { // Right - x_stride = 0; - y_stride = -box_height / pool_size; - } break; - } - T x = *(boxes_nram + border / 2 * 2); - T y = *(boxes_nram + border / 2 * 2 + 1); - - // gdram_ptr of output, argmax_idx - T *base_output = - output + nk_offset * BORDER_NUM * origin_c + border * origin_c; - int32_t *base_argmax_idx = - argmax_idx + nk_offset * BORDER_NUM * origin_c + border * origin_c; - - // layer 2: loop over range[0, origin_c) - const int32_t c_repeat = origin_c / deal_num; - const int32_t c_rem = origin_c % deal_num; - for (int32_t c_seg_idx = 0; c_seg_idx < c_repeat; ++c_seg_idx) { - pipeline(input_ping_nram, input, boxes_nram, 
argmax_idx_nram, - base_output, base_argmax_idx, output_nram, - nk_offset / origin_k, c_seg_idx * deal_num, origin_k, - origin_h, origin_w, origin_c, pool_size, x, y, x_stride, - y_stride, border, pingpong_gap, deal_num); - } - if (c_rem != 0) { - pipeline(input_ping_nram, input, boxes_nram, argmax_idx_nram, - base_output, base_argmax_idx, output_nram, - nk_offset / origin_k, origin_c - c_rem, origin_k, origin_h, - origin_w, origin_c, pool_size, x, y, x_stride, y_stride, - border, pingpong_gap, c_rem); - } - } - } -} - -mluOpStatus_t MLUOP_WIN_API KernelBorderAlignForward( - const cnrtDim3_t k_dim, const cnrtFunctionType_t k_type, - const cnrtQueue_t queue, mluOpDataType_t data_type, const void *input, - const void *boxes, const int32_t pool_size, const int32_t origin_n, - const int32_t origin_h, const int32_t origin_w, const int32_t origin_c, - const int32_t origin_k, void *output, int32_t *argmax_idx_nram) { - // launch kernel - if (data_type == mluOpDataType_t::MLUOP_DTYPE_FLOAT) { - KERNEL_CHECK(MLUKernelBorderAlignForward<<<k_dim, k_type, queue>>>( - (float *)input, (float *)boxes, pool_size, origin_n, origin_h, origin_w, - origin_c, origin_k, (float *)output, (int32_t *)argmax_idx_nram)); - } else { - // half - KERNEL_CHECK(MLUKernelBorderAlignForward<<<k_dim, k_type, queue>>>( - (half *)input, (half *)boxes, pool_size, origin_n, origin_h, origin_w, - origin_c, origin_k, (half *)output, (int32_t *)argmax_idx_nram)); - } - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/dcn_backward_data/dcn_backward_data.cpp b/kernels/dcn_backward_data/dcn_backward_data.cpp deleted file mode 100755 index aa20bb224..000000000 --- a/kernels/dcn_backward_data/dcn_backward_data.cpp +++ /dev/null @@ -1,136 +0,0 @@ -/************************************************************************* - * Copyright (C) [2024] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
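[Aside, not part of the patch: the PING/PONG scheme described in the pipeline() comment above is ordinary double buffering. A host-side sketch with invented names; std::memcpy stands in for the asynchronous GDRAM-to-NRAM copies that the MLU overlaps with compute, and acc must be pre-initialized by the caller.]

#include <cstring>

void pingpong_pipeline(const float *src, float *acc, int steps, int chunk) {
  float buf[2][128];                                // PING = buf[0], PONG = buf[1]
  // Assumes 1 <= chunk <= 128 and steps >= 1 for this sketch.
  std::memcpy(buf[0], src, chunk * sizeof(float));  // L0
  for (int i = 0; i < steps; ++i) {
    if (i + 1 < steps) {                            // L(i+1): prefetch next chunk
      std::memcpy(buf[(i + 1) % 2], src + (i + 1) * chunk,
                  chunk * sizeof(float));
    }
    for (int c = 0; c < chunk; ++c) {               // C(i): consume current chunk
      acc[c] = acc[c] > buf[i % 2][c] ? acc[c] : buf[i % 2][c];
    }
  }
  // S: acc now holds the running max, mirroring the kernel's final
  // NRAM2GDRAM copy of output_nram / argmax_idx_nram.
}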
- *************************************************************************/ -#include -#include -#include - -#include "kernels/utils/cnnl_helper.h" - -#define DCNBPDATA_API "mluOpDCNBackwardData" - -mluOpStatus_t MLUOP_WIN_API mluOpGetDCNBakcwardDataWorkspaceSize( - mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc, - const mluOpTensorDescriptor_t input_desc, - const mluOpTensorDescriptor_t offset_desc, - const mluOpTensorDescriptor_t mask_desc, - const mluOpTensorDescriptor_t filter_desc, - const mluOpTensorDescriptor_t grad_output_desc, - const mluOpTensorDescriptor_t grad_input_desc, - const mluOpTensorDescriptor_t grad_offset_desc, - const mluOpTensorDescriptor_t grad_mask_desc, size_t *workspace_size) { - PARAM_CHECK(DCNBPDATA_API, handle != NULL); - PARAM_CHECK(DCNBPDATA_API, dcn_desc != NULL); - PARAM_CHECK(DCNBPDATA_API, input_desc != NULL); - PARAM_CHECK(DCNBPDATA_API, offset_desc != NULL); - PARAM_CHECK(DCNBPDATA_API, filter_desc != NULL); - PARAM_CHECK(DCNBPDATA_API, dcn_desc != NULL); - PARAM_CHECK(DCNBPDATA_API, grad_output_desc != NULL); - PARAM_CHECK(DCNBPDATA_API, grad_input_desc != NULL); - PARAM_CHECK(DCNBPDATA_API, grad_offset_desc != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, cnnl_offset_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, cnnl_mask_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_desc, cnnl_filter_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_output_desc, - cnnl_grad_output_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_input_desc, - cnnl_grad_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_offset_desc, - cnnl_grad_offset_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_mask_desc, - cnnl_grad_mask_desc); - - CHECK_FUNC_RETURN( - cnnlGetDCNBakcwardDataWorkspaceSize( - cnnl_handle, dcn_desc, cnnl_input_desc, cnnl_offset_desc, - cnnl_mask_desc, cnnl_filter_desc, cnnl_grad_output_desc, - cnnl_grad_input_desc, cnnl_grad_offset_desc, cnnl_grad_mask_desc, - workspace_size), - CNNL_STATUS_SUCCESS, - "[mluOpGetDCNBakcwardDataWorkspaceSize] Internal error accured in " - "cnnlGetDCNBakcwardDataWorkspaceSize.", - MLUOP_STATUS_INTERNAL_ERROR); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_offset_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mask_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_filter_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_output_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_offset_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_mask_desc); - - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpDCNBackwardData( - mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc, - const mluOpTensorDescriptor_t input_desc, const void *input, - const mluOpTensorDescriptor_t offset_desc, const void *offset, - const mluOpTensorDescriptor_t mask_desc, const void *mask, - const mluOpTensorDescriptor_t filter_desc, const void *filter, - const mluOpTensorDescriptor_t grad_output_desc, const void *grad_output, - void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t grad_input_desc, void *grad_input, - const mluOpTensorDescriptor_t grad_offset_desc, void *grad_offset, - const mluOpTensorDescriptor_t grad_mask_desc, void *grad_mask) { - 
PARAM_CHECK(DCNBPDATA_API, handle != NULL); - if (workspace_size > 0) { - PARAM_CHECK(DCNBPDATA_API, workspace != NULL); - } - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, cnnl_offset_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, cnnl_mask_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_desc, cnnl_filter_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_output_desc, - cnnl_grad_output_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_input_desc, - cnnl_grad_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_offset_desc, - cnnl_grad_offset_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_mask_desc, - cnnl_grad_mask_desc); - CHECK_FUNC_RETURN( - cnnlDCNBackwardData( - cnnl_handle, dcn_desc, cnnl_input_desc, input, cnnl_offset_desc, - offset, cnnl_mask_desc, mask, cnnl_filter_desc, filter, - cnnl_grad_output_desc, grad_output, workspace, workspace_size, - cnnl_grad_input_desc, grad_input, cnnl_grad_offset_desc, grad_offset, - cnnl_grad_mask_desc, grad_mask), - CNNL_STATUS_SUCCESS, - "[mluOpDcnBackwardData] Internal error accured in cnnlDCNBackwardData.", - MLUOP_STATUS_INTERNAL_ERROR); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_offset_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mask_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_filter_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_output_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_offset_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_mask_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/dcn_backward_weight/dcn_backward_weight.cpp b/kernels/dcn_backward_weight/dcn_backward_weight.cpp deleted file mode 100644 index 0f9bcb094..000000000 --- a/kernels/dcn_backward_weight/dcn_backward_weight.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/************************************************************************* - * Copyright (C) [2024] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
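// Editor's note: a hedged sketch of how a caller would drive the two-step
// API deleted above: query the workspace size, allocate it, run the op,
// then free. The handle, descriptors, and device pointers are assumed to
// already exist, and error handling is elided. The API spelling really is
// "Bakcward" in this file, so the sketch keeps it.

size_t workspace_size = 0;
mluOpGetDCNBakcwardDataWorkspaceSize(
    handle, dcn_desc, input_desc, offset_desc, mask_desc, filter_desc,
    grad_output_desc, grad_input_desc, grad_offset_desc, grad_mask_desc,
    &workspace_size);

void *workspace = nullptr;
if (workspace_size > 0) {
  cnrtMalloc(&workspace, workspace_size);  // device-side scratch buffer
}

mluOpDCNBackwardData(handle, dcn_desc, input_desc, input, offset_desc, offset,
                     mask_desc, mask, filter_desc, filter, grad_output_desc,
                     grad_output, workspace, workspace_size, grad_input_desc,
                     grad_input, grad_offset_desc, grad_offset, grad_mask_desc,
                     grad_mask);

if (workspace != nullptr) {
  cnrtFree(workspace);
}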
- *************************************************************************/ -#include -#include -#include - -#include "kernels/utils/cnnl_helper.h" - -#define DCNBACKWARDWEIGHT_API "mluOpDCNBackwardWeight" - -mluOpStatus_t MLUOP_WIN_API mluOpGetDCNBackwardWeightWorkspaceSize( - mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc, - const mluOpTensorDescriptor_t input_desc, - const mluOpTensorDescriptor_t offset_desc, - const mluOpTensorDescriptor_t mask_desc, - const mluOpTensorDescriptor_t grad_output_desc, - const mluOpTensorDescriptor_t grad_filter_desc, - const mluOpTensorDescriptor_t grad_bias_desc, size_t *size) { - PARAM_CHECK("mluOpDCNBackwardWeight", handle != NULL); - PARAM_CHECK("mluOpDCNBackwardWeight", dcn_desc != NULL); - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, _handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, _input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, _offset_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, _mask_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_output_desc, - _grad_output_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_filter_desc, - _grad_filter_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_bias_desc, _grad_bias_desc); - CHECK_FUNC_RETURN( - cnnlGetDCNBackwardWeightWorkspaceSize( - _handle, dcn_desc, _input_desc, _offset_desc, _mask_desc, - _grad_output_desc, _grad_filter_desc, _grad_bias_desc, size), - CNNL_STATUS_SUCCESS, - "[mluOpDCNBackwardWeight] Internal error accured in " - "mluOpGetDCNBackwardWeightWorkspaceSize.", // NOLINT - MLUOP_STATUS_INTERNAL_ERROR); - DESTROY_CNNL_TENSOR_DESCRIPTOR(_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(_offset_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(_mask_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(_grad_output_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(_grad_filter_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(_grad_bias_desc); - DESTROY_CNNL_HANDLE(_handle); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpDCNBackwardWeight( - mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc, - const mluOpTensorDescriptor_t input_desc, const void *input, - const mluOpTensorDescriptor_t offset_desc, const void *offset, - const mluOpTensorDescriptor_t mask_desc, const void *mask, - const mluOpTensorDescriptor_t grad_output_desc, const void *grad_output, - void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t grad_filter_desc, void *grad_filter, - const mluOpTensorDescriptor_t grad_bias_desc, void *grad_bias) { - PARAM_CHECK(DCNBACKWARDWEIGHT_API, handle != NULL); - if (workspace_size > 0) { - PARAM_CHECK(DCNBACKWARDWEIGHT_API, workspace != NULL); - } - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, cnnl_offset_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, cnnl_mask_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_output_desc, - cnnl_grad_output_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_filter_desc, - cnnl_grad_filter_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_bias_desc, - cnnl_grad_bias_desc); - CHECK_FUNC_RETURN( - cnnlDCNBackwardWeight(cnnl_handle, dcn_desc, cnnl_input_desc, input, - cnnl_offset_desc, offset, cnnl_mask_desc, mask, - cnnl_grad_output_desc, grad_output, workspace, - workspace_size, cnnl_grad_filter_desc, grad_filter, - cnnl_grad_bias_desc, grad_bias), - 
-      CNNL_STATUS_SUCCESS,
-      "[mluOpDcnBackwardWeight] Internal error occurred in "
-      "mluOpDcnBackwardWeight.",
-      MLUOP_STATUS_INTERNAL_ERROR);
-  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc);
-  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_offset_desc);
-  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mask_desc);
-  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_output_desc);
-  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_filter_desc);
-  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grad_bias_desc);
-  DESTROY_CNNL_HANDLE(cnnl_handle);
-  return MLUOP_STATUS_SUCCESS;
-}
diff --git a/kernels/dcn_forward/dcn_common.h b/kernels/dcn_forward/dcn_common.h
deleted file mode 100644
index 59acab57a..000000000
--- a/kernels/dcn_forward/dcn_common.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*************************************************************************
- * Copyright (C) [2022] by Cambricon, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
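// Editor's note: a sketch of the error-mapping convention used throughout
// these wrappers. CHECK_FUNC_RETURN runs a CNNL call and, on any status
// other than the expected one, logs the message and returns the given
// mluOp status. The expansion below is illustrative only, not the
// project's actual macro definition.

#define CHECK_FUNC_RETURN_SKETCH(call, expect, msg, err_ret) \
  do {                                                       \
    if ((call) != (expect)) {                                \
      LOG(ERROR) << (msg);                                   \
      return (err_ret);                                      \
    }                                                        \
  } while (0)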
- *************************************************************************/
-#ifndef KERNELS_DCN_COMMON_DCN_COMMON_H
-#define KERNELS_DCN_COMMON_DCN_COMMON_H
-#include
-#include
-#include
-
-#include "kernels/utils/cnnl_helper.h"
-
-#define DCN_API "mluOpDCN"
-
-mluOpStatus_t MLUOP_WIN_API
-mluOpCreateDCNDescriptor(mluOpDCNDescriptor_t *dcn_desc) {
-  PARAM_CHECK(DCN_API, dcn_desc != NULL);
-  CHECK_FUNC_RETURN(cnnlCreateDCNDescriptor(dcn_desc), CNNL_STATUS_SUCCESS,
-                    "[mluOpDcn] Internal error occurred in "
-                    "mluOpCreateDCNDescriptor.",
-                    MLUOP_STATUS_INTERNAL_ERROR);
-  return MLUOP_STATUS_SUCCESS;
-}
-
-mluOpStatus_t MLUOP_WIN_API
-mluOpDestroyDCNDescriptor(mluOpDCNDescriptor_t dcn_desc) {
-  PARAM_CHECK(DCN_API, dcn_desc != NULL);
-  CHECK_FUNC_RETURN(cnnlDestroyDCNDescriptor(dcn_desc), CNNL_STATUS_SUCCESS,
-                    "[mluOpDcn] Internal error occurred in "
-                    "mluOpDestroyDCNDescriptor.",
-                    MLUOP_STATUS_INTERNAL_ERROR);
-  return MLUOP_STATUS_SUCCESS;
-}
-
-mluOpStatus_t MLUOP_WIN_API mluOpSetDCNDescriptor(
-    mluOpDCNDescriptor_t dcn_desc, int dimNb, const int pad[],
-    const int stride[], const int dilation[], int deformable_group,
-    int conv_group, int im2col_step, const mluOpDataType_t compute_type) {
-  PARAM_CHECK(DCN_API, dcn_desc != NULL);
-  CHECK_FUNC_RETURN(
-      cnnlSetDCNDescriptor(dcn_desc, dimNb, pad, stride, dilation,
-                           deformable_group, conv_group, im2col_step,
-                           cnnlDataType_t(compute_type)),
-      CNNL_STATUS_SUCCESS,
-      "[mluOpDcn] Internal error occurred in "
-      "mluOpSetDCNDescriptor.",
-      MLUOP_STATUS_INTERNAL_ERROR);
-  return MLUOP_STATUS_SUCCESS;
-}
-
-#endif  // KERNELS_DCN_COMMON_DCN_COMMON_H
diff --git a/kernels/dcn_forward/dcn_forward.cpp b/kernels/dcn_forward/dcn_forward.cpp
deleted file mode 100644
index c746f8971..000000000
--- a/kernels/dcn_forward/dcn_forward.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-/*************************************************************************
- * Copyright (C) [2024] by Cambricon, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
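// Editor's note: an illustration of the descriptor lifecycle defined in
// dcn_common.h above: create, configure, use, destroy. The geometry values
// (3x3 deformable conv, unit stride/dilation) and the 4-entry pad order are
// assumptions made for the sketch, not values taken from the patch.

#include "mlu_op.h"

void ExampleDcnDescriptorLifecycle() {
  mluOpDCNDescriptor_t dcn_desc;
  mluOpCreateDCNDescriptor(&dcn_desc);

  const int pad[4] = {1, 1, 1, 1};  // assumed order: top, bottom, left, right
  const int stride[2] = {1, 1};
  const int dilation[2] = {1, 1};
  mluOpSetDCNDescriptor(dcn_desc, /*dimNb=*/4, pad, stride, dilation,
                        /*deformable_group=*/1, /*conv_group=*/1,
                        /*im2col_step=*/1, MLUOP_DTYPE_FLOAT);

  // ... pass dcn_desc to mluOpDCNForward / mluOpDCNBackward* here ...

  mluOpDestroyDCNDescriptor(dcn_desc);
}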
- *************************************************************************/ -#include "kernels/dcn_forward/dcn_common.h" - -#define DCNFORWARD_API "mluOpDCNForward" - -mluOpStatus_t MLUOP_WIN_API mluOpGetDCNForwardWorkspaceSize( - mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc, - const mluOpTensorDescriptor_t input_desc, - const mluOpTensorDescriptor_t offset_desc, - const mluOpTensorDescriptor_t mask_desc, - const mluOpTensorDescriptor_t filter_desc, - const mluOpTensorDescriptor_t bias_desc, - const mluOpTensorDescriptor_t output_desc, size_t *size) { - PARAM_CHECK("mluOpDCNForward", handle != NULL); - PARAM_CHECK("mluOpDCNForward", dcn_desc != NULL); - PARAM_CHECK("mluOpDCNForward", input_desc != NULL); - PARAM_CHECK("mluOpDCNForward", offset_desc != NULL); - PARAM_CHECK("mluOpDCNForward", filter_desc != NULL); - PARAM_CHECK("mluOpDCNForward", output_desc != NULL); - PARAM_CHECK("mluOpDCNForward", size != NULL); - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, cnnl_offset_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, cnnl_mask_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_desc, cnnl_filter_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(bias_desc, cnnl_bias_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_desc, cnnl_output_desc); - CHECK_FUNC_RETURN(cnnlGetDCNForwardWorkspaceSize( - cnnl_handle, dcn_desc, cnnl_input_desc, - cnnl_offset_desc, cnnl_mask_desc, cnnl_filter_desc, - cnnl_bias_desc, cnnl_output_desc, size), - CNNL_STATUS_SUCCESS, - "[mluOpDCNForward] Internal error accured in " - "mluOpGetDCNForwardWorkspaceSize.", // NOLINT - MLUOP_STATUS_INTERNAL_ERROR); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_offset_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mask_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_filter_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_bias_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API -mluOpDCNForward(mluOpHandle_t handle, const mluOpDCNDescriptor_t dcn_desc, - const mluOpTensorDescriptor_t input_desc, const void *input, - const mluOpTensorDescriptor_t offset_desc, const void *offset, - const mluOpTensorDescriptor_t mask_desc, const void *mask, - const mluOpTensorDescriptor_t filter_desc, const void *filter, - const mluOpTensorDescriptor_t bias_desc, const void *bias, - void *workspace, size_t workspace_size, - const mluOpTensorDescriptor_t output_desc, void *output) { - PARAM_CHECK(DCNFORWARD_API, handle != NULL); - if (workspace_size > 0) { - PARAM_CHECK(DCNFORWARD_API, workspace != NULL); - } - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(offset_desc, cnnl_offset_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mask_desc, cnnl_mask_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_desc, cnnl_filter_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(bias_desc, cnnl_bias_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_desc, cnnl_output_desc); - CHECK_FUNC_RETURN( - cnnlDCNForward(cnnl_handle, dcn_desc, cnnl_input_desc, input, - cnnl_offset_desc, offset, cnnl_mask_desc, mask, - cnnl_filter_desc, filter, cnnl_bias_desc, bias, workspace, 
- workspace_size, cnnl_output_desc, output), - CNNL_STATUS_SUCCESS, - "[mluOpDcnForward] Internal error accured in mluOpDcnForward.", - MLUOP_STATUS_INTERNAL_ERROR); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_offset_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mask_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_filter_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_bias_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp b/kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp deleted file mode 100644 index 5b316f2f1..000000000 --- a/kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.cpp +++ /dev/null @@ -1,377 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "dynamic_point_to_voxel_backward.h" - -#include // std::min -#include - -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" // mluop::getSizeOfDataType -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" - -static mluOpStatus_t DynamicPointToVoxelBackwardParamCheck( - const char *interface_name, const mluOpHandle_t handle, - const mluOpReduceMode_t reduce_type, - const mluOpTensorDescriptor_t grad_voxel_feats_desc, - const void *grad_voxel_feats, const mluOpTensorDescriptor_t feats_desc, - const void *feats, const mluOpTensorDescriptor_t voxel_feats_desc, - const void *voxel_feats, const mluOpTensorDescriptor_t point2voxel_map_desc, - const void *point2voxel_map, - const mluOpTensorDescriptor_t voxel_points_count_desc, - const void *voxel_points_count, - const mluOpTensorDescriptor_t voxel_num_desc, const void *voxel_num, - void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t grad_feats_desc, void *grad_feats, - bool &zero_element) { - // check handle - PARAM_CHECK(interface_name, handle != NULL); - // platform check - if (handle->arch < MLUOP_MLU370) { - LOG(ERROR) << interface_name - << "Only mlu300 and above devices are supported. 
" - << "Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - // check desc - PARAM_CHECK(interface_name, grad_voxel_feats_desc != NULL); - PARAM_CHECK(interface_name, feats_desc != NULL); - PARAM_CHECK(interface_name, voxel_feats_desc != NULL); - PARAM_CHECK(interface_name, point2voxel_map_desc != NULL); - PARAM_CHECK(interface_name, voxel_points_count_desc != NULL); - PARAM_CHECK(interface_name, voxel_num_desc != NULL); - PARAM_CHECK(interface_name, grad_feats_desc != NULL); - - // check data type - PARAM_CHECK(interface_name, - grad_voxel_feats_desc->dtype == MLUOP_DTYPE_FLOAT); - PARAM_CHECK(interface_name, feats_desc->dtype == MLUOP_DTYPE_FLOAT); - PARAM_CHECK(interface_name, voxel_feats_desc->dtype == MLUOP_DTYPE_FLOAT); - PARAM_CHECK(interface_name, grad_feats_desc->dtype == MLUOP_DTYPE_FLOAT); - - PARAM_CHECK(interface_name, point2voxel_map_desc->dtype == MLUOP_DTYPE_INT32); - PARAM_CHECK(interface_name, - voxel_points_count_desc->dtype == MLUOP_DTYPE_INT32); - PARAM_CHECK(interface_name, voxel_num_desc->dtype == MLUOP_DTYPE_INT32); - - // check shape - PARAM_CHECK(interface_name, grad_voxel_feats_desc->dim == 2); - PARAM_CHECK(interface_name, feats_desc->dim == 2); - PARAM_CHECK(interface_name, voxel_feats_desc->dim == 2); - PARAM_CHECK(interface_name, point2voxel_map_desc->dim == 1); - PARAM_CHECK(interface_name, voxel_points_count_desc->dim == 1); - PARAM_CHECK(interface_name, voxel_num_desc->dim == 1); - PARAM_CHECK(interface_name, grad_feats_desc->dim == 2); - - PARAM_CHECK(interface_name, - feats_desc->dims[1] == grad_voxel_feats_desc->dims[1]); - PARAM_CHECK(interface_name, - voxel_feats_desc->dims[0] == grad_voxel_feats_desc->dims[0]); - PARAM_CHECK(interface_name, - voxel_feats_desc->dims[1] == grad_voxel_feats_desc->dims[1]); - PARAM_CHECK(interface_name, - point2voxel_map_desc->dims[0] == feats_desc->dims[0]); - PARAM_CHECK(interface_name, voxel_points_count_desc->dims[0] == - grad_voxel_feats_desc->dims[0]); - PARAM_CHECK(interface_name, voxel_num_desc->dims[0] == 1); - PARAM_CHECK(interface_name, grad_feats_desc->dims[0] == feats_desc->dims[0]); - PARAM_CHECK(interface_name, - grad_feats_desc->dims[1] == grad_voxel_feats_desc->dims[1]); - PARAM_CHECK(interface_name, - feats_desc->dims[0] >= grad_voxel_feats_desc->dims[0]); - - // param check - if (reduce_type != MLUOP_REDUCE_DMAX) { - LOG(ERROR) << interface_name - << " only supports max reduce in current version. "; - return MLUOP_STATUS_BAD_PARAM; - } - - // large tensor - const uint64_t grad_voxel_feats_element_num = - mluOpGetTensorElementNum(grad_voxel_feats_desc); - const uint64_t feats_element_num = mluOpGetTensorElementNum(feats_desc); - TENSOR_NUM_CHECK(interface_name, grad_voxel_feats_element_num, - LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(interface_name, feats_element_num, LARGE_TENSOR_NUM, ""); - - // kernel size check - const int N = feats_desc->dims[0]; - const int C = feats_desc->dims[1]; - const size_t dtype_bytes = mluop::getSizeOfDataType(feats_desc->dtype); - const size_t idx_dtype_bytes = - mluop::getSizeOfDataType(point2voxel_map_desc->dtype); - if (N * (idx_dtype_bytes + 1) + C * (2 * dtype_bytes + 3 * idx_dtype_bytes) + - idx_dtype_bytes > - handle->nram_size) { - // float + int - LOG(ERROR) - << interface_name - << " The feats dtype is float, point2voxel_map dtype is int. 
The feats " - "shape is [" - << N << ", " << C << "]" - << ", should meet constraint : " - "5*feats_desc->dims[0]+20*feats_desc->dims[1]+sizeof(int) <= " - << handle->nram_size; - return MLUOP_STATUS_BAD_PARAM; - } - - // 0-element check, after dim and shape check - if (mluOpGetTensorElementNum(grad_feats_desc) == 0) { - zero_element = true; - return MLUOP_STATUS_SUCCESS; - } - if (grad_voxel_feats_element_num != 0) { - PARAM_CHECK(interface_name, grad_voxel_feats != NULL); - } - PARAM_CHECK(interface_name, feats != NULL); - if (mluOpGetTensorElementNum(voxel_feats_desc) != 0) { - PARAM_CHECK(interface_name, voxel_feats != NULL); - } - PARAM_CHECK(interface_name, point2voxel_map != NULL); - if (mluOpGetTensorElementNum(voxel_points_count_desc) != 0) { - PARAM_CHECK(interface_name, voxel_points_count != NULL); - } - PARAM_CHECK(interface_name, voxel_num != NULL); - PARAM_CHECK(interface_name, grad_feats != NULL); - if (workspace_size != 0) { - PARAM_CHECK(interface_name, workspace != NULL); - } - return MLUOP_STATUS_SUCCESS; -} - -static void policyFunc(const mluOpHandle_t handle, cnrtDim3_t *k_dim, - cnrtFunctionType_t *k_type, int N) { - int max_core_num = mluop::runtime::getCoreNumOfJobLimitCapability(handle); - size_t core_num = handle->core_num_per_cluster; - if (N > max_core_num) { - k_dim->x = max_core_num; - *k_type = mluop::runtime::getJobLimitCapabilityCnrtFuncType(handle); - } else { - if (N <= 4) { - k_dim->x = core_num * 1; - *k_type = CNRT_FUNC_TYPE_UNION1; - } else if (N <= 8) { - k_dim->x = core_num * 2; - *k_type = CNRT_FUNC_TYPE_UNION2; - } else if (N <= 16) { - k_dim->x = core_num * 4; - *k_type = CNRT_FUNC_TYPE_UNION4; - } else if (N <= 32) { - k_dim->x = core_num * 8; - *k_type = CNRT_FUNC_TYPE_UNION8; - } else if (N <= 64) { - k_dim->x = core_num * 16; - *k_type = CNRT_FUNC_TYPE_UNION16; - } else { - LOG(ERROR) - << "[mluOpDynamicPointToVoxelBackward]: failed to choose kernel " - "to launch"; - return; - } - } - k_dim->y = 1; - k_dim->z = 1; - VLOG(5) << "Launch Kernel MLUKernelDynamicPointToVoxelBackward in UNION" - << *k_type / 4 << " type"; -} - -mluOpStatus_t MLUOP_WIN_API mluOpDynamicPointToVoxelBackward( - const mluOpHandle_t handle, const mluOpReduceMode_t reduce_type, - const mluOpTensorDescriptor_t grad_voxel_feats_desc, - const void *grad_voxel_feats, const mluOpTensorDescriptor_t feats_desc, - const void *feats, const mluOpTensorDescriptor_t voxel_feats_desc, - const void *voxel_feats, const mluOpTensorDescriptor_t point2voxel_map_desc, - const void *point2voxel_map, - const mluOpTensorDescriptor_t voxel_points_count_desc, - const void *voxel_points_count, - const mluOpTensorDescriptor_t voxel_num_desc, const void *voxel_num, - void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t grad_feats_desc, void *grad_feats) { - const char *interface_name = "[mluOpDynamicPointToVoxelBackward]"; - bool zero_element = false; - mluOpStatus_t param_check = DynamicPointToVoxelBackwardParamCheck( - interface_name, handle, reduce_type, grad_voxel_feats_desc, - grad_voxel_feats, feats_desc, feats, voxel_feats_desc, voxel_feats, - point2voxel_map_desc, point2voxel_map, voxel_points_count_desc, - voxel_points_count, voxel_num_desc, voxel_num, workspace, workspace_size, - grad_feats_desc, grad_feats, zero_element); - if (param_check != MLUOP_STATUS_SUCCESS) { - return param_check; - } - if (zero_element) { - VLOG(5) << interface_name << " Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - - // generator - if (MLUOP_GEN_CASE_ON_NEW) { - 
GEN_CASE_START("dynamic_point_to_voxel_backward"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA_REAL(true, "grad_voxel_feats", grad_voxel_feats, - grad_voxel_feats_desc); - GEN_CASE_DATA_REAL(true, "feats", feats, feats_desc); - GEN_CASE_DATA_REAL(true, "voxel_feats", voxel_feats, voxel_feats_desc); - GEN_CASE_DATA_REAL(true, "point2voxel_map", point2voxel_map, - point2voxel_map_desc); - GEN_CASE_DATA_REAL(true, "voxel_points_count", voxel_points_count, - voxel_points_count_desc); - GEN_CASE_DATA_REAL(true, "voxel_num", voxel_num, voxel_num_desc); - GEN_CASE_DATA(false, "grad_feats", grad_feats, grad_feats_desc, 0, 0); - GEN_CASE_OP_PARAM_SINGLE(0, "dynamic_point_to_voxel_backward", - "reduce_type", reduce_type); - GEN_CASE_TEST_PARAM_NEW(false, false, true, 0.003, 0.003, 0); - } - - const int N = feats_desc->dims[0]; - const int C = feats_desc->dims[1]; - const auto grad_voxel_feats_element_num = - mluOpGetTensorElementNum(grad_voxel_feats_desc); - const auto grad_feats_element_num = mluOpGetTensorElementNum(grad_feats_desc); - VLOG(5) << interface_name << " N = " << N << ", C = " << C - << ", grad_voxel_feats_element_num=" << grad_voxel_feats_element_num - << ", grad_feats_element_num=" << grad_feats_element_num; - // 1. init output - uint64_t fill_0 = 0x0; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_feats_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_0, - cnnl_output_desc, grad_feats)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - policyFunc(handle, &k_dim, &k_type, N); - if (grad_voxel_feats_element_num != 0) { - // 2. init workspace - mluOpTensorDescriptor_t indices_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&indices_desc)); - int indices_dims[2] = {(int)grad_voxel_feats_element_num, 1}; - INTERNAL_CHECK(interface_name, MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor( - indices_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, 2, indices_dims)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(indices_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, - &grad_feats_element_num, cnnl_output_desc, - workspace)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - // 3. get scatter indices - CHECK_RETURN("[mluOpDynamicPointToVoxelBackward]", - KernelDynamicPointToVoxelBackward( - k_dim, k_type, handle->queue, feats, voxel_feats, - grad_feats, workspace, point2voxel_map, voxel_num, N, C)); - // 4. 
scatter - cnnlScatterNdMode_t scatter_mode = CNNL_SCATTERND_ADD; - mluOpTensorDescriptor_t updates_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&updates_desc)); - int updates_dims[1] = {(int)grad_voxel_feats_element_num}; - INTERNAL_CHECK(interface_name, MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor( - updates_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_FLOAT, 1, updates_dims)); - mluOpTensorDescriptor_t output_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&output_desc)); - int output_dims[1] = {(int)grad_feats_element_num}; - INTERNAL_CHECK(interface_name, MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor( - output_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_FLOAT, 1, output_dims)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(indices_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(updates_desc, - cnnl_updates_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_desc, - cnnl_output_desc); - - CALL_CNNL(cnnlScatterNd_v2(cnnl_handle, scatter_mode, cnnl_indices_desc, - workspace, cnnl_updates_desc, grad_voxel_feats, - NULL, NULL, cnnl_output_desc, grad_feats)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_updates_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(updates_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(output_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(indices_desc)); - } - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpGetDynamicPointToVoxelBackwardWorkspaceSize( - const mluOpHandle_t handle, const mluOpReduceMode_t reduce_type, - const mluOpTensorDescriptor_t grad_voxel_feats_desc, - const mluOpTensorDescriptor_t feats_desc, - const mluOpTensorDescriptor_t voxel_feats_desc, - const mluOpTensorDescriptor_t point2voxel_map_desc, - const mluOpTensorDescriptor_t voxel_points_count_desc, - const mluOpTensorDescriptor_t voxel_num_desc, size_t *workspace_size) { - const char *interface_name = - "[mluOpGetDynamicPointToVoxelBackwardWorkspaceSize]"; - PARAM_CHECK(interface_name, handle != NULL); - if (handle->arch < MLUOP_MLU370) { - LOG(ERROR) << interface_name - << "Only mlu300 and above devices are supported. 
" - << "Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - PARAM_CHECK(interface_name, grad_voxel_feats_desc != NULL); - PARAM_CHECK(interface_name, feats_desc != NULL); - PARAM_CHECK(interface_name, voxel_feats_desc != NULL); - PARAM_CHECK(interface_name, point2voxel_map_desc != NULL); - PARAM_CHECK(interface_name, voxel_points_count_desc != NULL); - PARAM_CHECK(interface_name, voxel_num_desc != NULL); - PARAM_CHECK(interface_name, workspace_size != NULL); - const int N = feats_desc->dims[0]; - const int C = feats_desc->dims[1]; - *workspace_size = N * C * sizeof(int); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.h b/kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.h deleted file mode 100644 index 2a4df91b4..000000000 --- a/kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward.h +++ /dev/null @@ -1,37 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#ifndef KERNELS_DYNAMIC_POINT_TO_VOXEL_BACKWARD_\ -DYNAMIC_POINT_TO_VOXEL_BACKWARD_H -#define KERNELS_DYNAMIC_POINT_TO_VOXEL_BACKWARD_\ -DYNAMIC_POINT_TO_VOXEL_BACKWARD_H - -#include "mlu_op.h" - -mluOpStatus_t MLUOP_WIN_API KernelDynamicPointToVoxelBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const void *feats, const void *voxel_feats, void *grad_feats, - void *voxel_from, const void *point2voxel_map, const void *voxel_num, - const int N, const int C); - -#endif // KERNELS_DYNAMIC_POINT_TO_VOXEL_BACKWARD_ - // DYNAMIC_POINT_TO_VOXEL_FORWARD_H diff --git a/kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_union1.mlu b/kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_union1.mlu deleted file mode 100644 index 9e7d137b9..000000000 --- a/kernels/dynamic_point_to_voxel_backward/dynamic_point_to_voxel_backward_union1.mlu +++ /dev/null @@ -1,201 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *************************************************************************/
-#include "dynamic_point_to_voxel_backward.h"
-
-#include "core/logging.h"
-#include "kernels/debug.h"
-#include "kernels/kernel.h"
-#include "kernels/utils/common.h"
-
-__nram__ char nram_buffer[MAX_NRAM_SIZE];
-
-template <typename T>
-__mlu_func__ void loadAsync(T *feats_nram, T *voxel_feats_nram,
-                            int *index_mask_nram, int *voxel_from_nram,
-                            int *point2voxel_map_real_nram,
-                            const int *point2voxel_map_nram,
-                            const int *index_col_nram, const T *feats,
-                            const T *voxel_feats, const int *voxel_from, int &x,
-                            int &n_real, const int n_limit, const int N,
-                            const int C) {
-  int invalid_index = -1;
-  int size_feats = C * sizeof(T);
-  int size_feats_idx = C * sizeof(int);
-  n_real = 0;
-  for (; x < N && n_real < n_limit; x++) {
-    int point_to = point2voxel_map_nram[x];
-    int input_offset = x * C;
-    int input_real_offset = n_real * C;
-    if (taskId == point_to % taskDim) {
-      if (point_to == invalid_index) {
-        continue;
-      }
-      int reduced_offset = point_to * C;
-      // load valid data to feats_nram
-      __memcpy_async(feats_nram + input_real_offset, feats + input_offset,
-                     size_feats, GDRAM2NRAM);
-      // broadcast voxel_feats data to voxel_feats_nram via the same "point_to"
-      __memcpy_async(voxel_feats_nram + input_real_offset,
-                     voxel_feats + reduced_offset, size_feats, GDRAM2NRAM);
-      // broadcast voxel_from data to voxel_from_nram via the same "point_to"
-      __memcpy_async(voxel_from_nram + input_real_offset,
-                     voxel_from + reduced_offset, size_feats_idx, GDRAM2NRAM);
-      // record valid index of x in index_mask_nram
-      __bang_write_value(index_mask_nram + input_real_offset, C, x * C);
-      // point2voxel_map removed invalid data
-      point2voxel_map_real_nram[n_real] = point_to;
-      ++n_real;
-    }
-  }
-  if (n_real > 0) {
-    __bang_cycle_add(index_mask_nram, index_mask_nram, index_col_nram,
-                     n_real * C, C);
-  }
-}
-
-template <typename T>
-__mlu_func__ void compute(T *feats_nram, T *voxel_feats_nram,
-                          int *index_mask_nram, int *voxel_from_nram,
-                          const int n_real, const int N, const int C) {
-  if (n_real > 0) {
-    // view [n_real, C] as [n_real * C]
-    int deal_num = n_real * C;
-    // if (feats[i] == voxel_feats[i]) {mask[i] = 1} else {mask[i] = 0}
-    __bang_eq(feats_nram, voxel_feats_nram, feats_nram, deal_num);
-    // change mask1's dtype to int32
-    __bang_float2int32_tz((int *)feats_nram, feats_nram, deal_num, 0);
-    // mask2 = NOT mask1
-    __bang_not((int *)voxel_feats_nram, (int *)feats_nram, deal_num);
-    // choose index of "feats[i] == voxel_feats[i]"
-    __bang_mul((int *)feats_nram, (int *)feats_nram, index_mask_nram, deal_num);
-    // mask2 *= N * C
-    __bang_mul_scalar((int *)voxel_feats_nram, (int *)voxel_feats_nram, N * C,
-                      deal_num);
-    // mix chosen index and 'N * C'
-    __bang_add(index_mask_nram, (int *)voxel_feats_nram, (int *)feats_nram,
-               deal_num);
-    // choose the min index
-    __bang_minequal(voxel_from_nram, voxel_from_nram, index_mask_nram,
-                    deal_num);
-  }
-}
-
-__mlu_func__ void storeAsync(int *voxel_from, const int *voxel_from_nram,
-                             const int *point2voxel_map_real_nram,
-                             bool *voxel_from_flag_nram, int *index_mask_nram,
-                             const int n_real, const int N, const int C) {
-  int size_feats_idx = C * sizeof(int);
-  for (int i = 0; i < n_real; i++) {
-    int offset_real = point2voxel_map_real_nram[i];
-    // 1) use atomicmin, too slow
-    // __bang_atomic_reduce_min(voxel_from + offset_real * C,
-    //                          voxel_from_nram + i * C, C);
-    // 2) compare one by one, use voxel_from_flag_nram as flags to record
-    // whether dst idx has appeared
-    if (voxel_from_flag_nram[offset_real] == false) {
-      // if number of grad idx on offset_real == 1, use the idx value directly
-      __memcpy_async(voxel_from + offset_real * C, voxel_from_nram + i * C,
-                     size_feats_idx, NRAM2GDRAM);
-      // set voxel_from_flag to true
-      voxel_from_flag_nram[offset_real] = true;
-    } else {
-      __sync_io();
-      // load the idx appeared
-      __memcpy(index_mask_nram, voxel_from + offset_real * C, size_feats_idx,
-               GDRAM2NRAM);
-      // if number of grad idx on offset_real > 1, pick the min idx value
-      __bang_minequal(index_mask_nram, index_mask_nram, voxel_from_nram + i * C,
-                      C);
-      // store the new idx
-      __memcpy(voxel_from + offset_real * C, index_mask_nram, size_feats_idx,
-               NRAM2GDRAM);
-    }
-  }
-}
-
-template <typename T>
-__mlu_global__ void MLUKernelMaxReduceTracebackScatterIdx(
-    const T *feats, const T *voxel_feats, T *grad_feats, int *voxel_from,
-    const int *point2voxel_map, const int *voxel_num, const int N,
-    const int C) {
-  const int M = *voxel_num;
-  if (M == 0) {
-    return;
-  }
-  int size_input = N * sizeof(int);
-  int size_reduced_flag = M * sizeof(bool);
-  int size_feats = C * sizeof(T);
-  int size_feats_idx = C * sizeof(int);
-
-  int nram_size = MAX_NRAM_SIZE;
-  int n_limit = (nram_size - size_input - size_reduced_flag - size_feats_idx) /
-                (2 * size_feats + 2 * size_feats_idx + sizeof(int));
-  int feats_limit = n_limit * C;
-
-  T *feats_nram = (T *)nram_buffer;                            // [n_limit, C]
-  T *voxel_feats_nram = feats_nram + feats_limit;              // [n_limit, C]
-  int *index_mask_nram =
-      (int *)(voxel_feats_nram + feats_limit);                 // [n_limit, C]
-  int *voxel_from_nram = index_mask_nram + feats_limit;        // [n_limit, C]
-  int *point2voxel_map_nram = voxel_from_nram + feats_limit;   // [N]
-  int *point2voxel_map_real_nram = point2voxel_map_nram + N;   // [n_limit]
-  bool *voxel_from_flag_nram =
-      (bool *)(point2voxel_map_real_nram + n_limit);           // [M]
-  int *index_col_nram = (int *)(voxel_from_flag_nram + M);     // [C]
-
-  __sync_all();
-
-  // broadcast point2voxel_map to nram
-  __memcpy(point2voxel_map_nram, point2voxel_map, size_input, GDRAM2NRAM);
-  // initialize voxel_from_flag to false
-  __memset_nram(voxel_from_flag_nram, M, (char)false);
-  for (int i = 0; i < C; i++) {
-    index_col_nram[i] = i;
-  }
-  for (int x = 0, n_real = 0; x < N;) {
-    // load data, get x and n_real
-    loadAsync(feats_nram, voxel_feats_nram, index_mask_nram, voxel_from_nram,
-              point2voxel_map_real_nram, point2voxel_map_nram, index_col_nram,
-              feats, voxel_feats, voxel_from, x,
-              n_real, n_limit, N, C);
-    __sync();
-    // compute
-    compute(feats_nram, voxel_feats_nram, index_mask_nram, voxel_from_nram,
-            n_real, N, C);
-    // store
-    storeAsync(voxel_from, voxel_from_nram, point2voxel_map_real_nram,
-               voxel_from_flag_nram, index_mask_nram, n_real, N, C);
-    __sync();
-  }
-}
-
-mluOpStatus_t MLUOP_WIN_API KernelDynamicPointToVoxelBackward(
-    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-    const void *feats, const void *voxel_feats, void *grad_feats,
-    void *voxel_from, const void *point2voxel_map, const void *voxel_num,
-    const int N, const int C) {
-  KERNEL_CHECK(MLUKernelMaxReduceTracebackScatterIdx<<<k_dim, k_type, queue>>>(
-      (const float *)feats, (const float *)voxel_feats, (float *)grad_feats,
-      (int *)voxel_from, (const int *)point2voxel_map, (const int *)voxel_num,
-      N, C));
-  return MLUOP_STATUS_SUCCESS;
-}
diff --git a/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp b/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp
deleted file mode 100644
index 1a42bcc1d..000000000
--- a/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.cpp
+++ /dev/null
@@ -1,337 +0,0 @@
-/*************************************************************************
- * Copyright (C) [2023] by Cambricon, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
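// Editor's note: a scalar C++ reference (assumed semantics) for the
// vectorized traceback implemented by loadAsync/compute/storeAsync above.
// For each point x mapped to voxel v, voxel_from[v * C + c] becomes the
// smallest flat index x * C + c whose feature equals the reduced max
// voxel_feats[v * C + c]; non-matching lanes contribute the sentinel N * C.

#include <algorithm>
#include <vector>

void MaxReduceTraceback(const std::vector<float> &feats,          // [N, C]
                        const std::vector<float> &voxel_feats,    // [M, C]
                        const std::vector<int> &point2voxel_map,  // [N]
                        int N, int C,
                        std::vector<int> &voxel_from) {  // [M, C], init N * C
  for (int x = 0; x < N; ++x) {
    const int v = point2voxel_map[x];
    if (v == -1) continue;  // -1 marks points dropped by the forward pass
    for (int c = 0; c < C; ++c) {
      if (feats[x * C + c] == voxel_feats[v * C + c]) {
        voxel_from[v * C + c] = std::min(voxel_from[v * C + c], x * C + c);
      }
    }
  }
}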
- *************************************************************************/ -#include "dynamic_point_to_voxel_forward.h" - -#include - -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" - -// policy function -static void policyFuncDynamicPointToVoxelForward(const mluOpHandle_t handle, - cnrtDim3_t *k_dim, - cnrtFunctionType_t *k_type, - const int nums) { - int max_core_num = mluop::runtime::getCoreNumOfJobLimitCapability(handle); - size_t core_num = handle->core_num_per_cluster; - if (nums > max_core_num) { - k_dim->x = max_core_num; - *k_type = mluop::runtime::getJobLimitCapabilityCnrtFuncType(handle); - } else { - if (nums == 1) { - k_dim->x = 1; - *k_type = CNRT_FUNC_TYPE_BLOCK; - } else if (nums <= 4) { - k_dim->x = core_num * 1; - *k_type = CNRT_FUNC_TYPE_UNION1; - } else if (nums <= 8) { - k_dim->x = core_num * 2; - *k_type = CNRT_FUNC_TYPE_UNION2; - } else if (nums <= 16) { - k_dim->x = core_num * 4; - *k_type = CNRT_FUNC_TYPE_UNION4; - } else if (nums <= 32) { - k_dim->x = core_num * 8; - *k_type = CNRT_FUNC_TYPE_UNION8; - } else if (nums <= 64) { - k_dim->x = core_num * 16; - *k_type = CNRT_FUNC_TYPE_UNION16; - } - } - k_dim->y = 1; - k_dim->z = 1; - return; -} - -static mluOpStatus_t DynamicPointToVoxelForwardParamCheck( - const std::string &api, const mluOpHandle_t handle, - const mluOpReduceMode_t reduce_type, const void *feats, const void *coors, - const void *voxel_feats, const void *voxel_coors, - const void *point2voxel_map, const void *voxel_points_count, - const void *voxel_num, void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t feats_desc, - const mluOpTensorDescriptor_t coors_desc, - const mluOpTensorDescriptor_t voxel_feats_desc, - const mluOpTensorDescriptor_t voxel_coors_desc, - const mluOpTensorDescriptor_t point2voxel_map_desc, - const mluOpTensorDescriptor_t voxel_points_count_desc, - const mluOpTensorDescriptor_t voxel_num_desc, bool *zero_element) { - // check descriptor - PARAM_CHECK(api, handle != NULL); - // platform check - if (handle->arch < MLUOP_MLU370) { - LOG(ERROR) << api << "Only mlu300 and above devices are supported. 
" - << "Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - PARAM_CHECK(api, feats_desc != NULL); - PARAM_CHECK(api, coors_desc != NULL); - PARAM_CHECK(api, voxel_feats_desc != NULL); - PARAM_CHECK(api, voxel_coors_desc != NULL); - PARAM_CHECK(api, point2voxel_map_desc != NULL); - PARAM_CHECK(api, voxel_points_count_desc != NULL); - PARAM_CHECK(api, voxel_num_desc != NULL); - // check shape - PARAM_CHECK(api, feats_desc->dim == 2); - PARAM_CHECK(api, coors_desc->dim == 2); - PARAM_CHECK(api, voxel_feats_desc->dim == 2); - PARAM_CHECK(api, voxel_coors_desc->dim == 2); - PARAM_CHECK(api, point2voxel_map_desc->dim == 1); - PARAM_CHECK(api, voxel_points_count_desc->dim == 1); - PARAM_CHECK(api, voxel_num_desc->dim == 1); - - // check data type - PARAM_CHECK_V2(api, (feats_desc->dtype == MLUOP_DTYPE_FLOAT), - "Only float are supported in feats tensor, but the data " - "type of tensor is " - << mluOpGetNameOfDataType(feats_desc->dtype) << "."); - PARAM_CHECK_V2(api, (coors_desc->dtype == MLUOP_DTYPE_INT32), - "Only int32 are supported in coors tensor, but the data " - "type of tensor is " - << mluOpGetNameOfDataType(coors_desc->dtype) << "."); - PARAM_CHECK_V2( - api, (point2voxel_map_desc->dtype == MLUOP_DTYPE_INT32), - "Only int32 are supported in point2voxel_map tensor, but the data " - "type of tensor is " - << mluOpGetNameOfDataType(point2voxel_map_desc->dtype) << "."); - - PARAM_CHECK(api, voxel_feats_desc->dtype == feats_desc->dtype); - PARAM_CHECK(api, voxel_coors_desc->dtype == coors_desc->dtype); - PARAM_CHECK(api, - voxel_points_count_desc->dtype == point2voxel_map_desc->dtype); - PARAM_CHECK(api, voxel_num_desc->dtype == point2voxel_map_desc->dtype); - - if (reduce_type != MLUOP_REDUCE_DMAX && reduce_type != MLUOP_REDUCE_DMEAN) { - LOG(ERROR) << api << "Only support max and mean. 
" - << "Please check reduce_type!"; - return MLUOP_STATUS_BAD_PARAM; - } - - // check dim - PARAM_CHECK(api, feats_desc->dims[0] == coors_desc->dims[0]); - PARAM_CHECK(api, feats_desc->dims[0] == point2voxel_map_desc->dims[0]); - PARAM_CHECK(api, voxel_feats_desc->dims[0] == voxel_coors_desc->dims[0]); - PARAM_CHECK(api, - voxel_feats_desc->dims[0] == voxel_points_count_desc->dims[0]); - PARAM_CHECK(api, voxel_num_desc->dims[0] == 1); - PARAM_CHECK(api, feats_desc->dims[1] == voxel_feats_desc->dims[1]); - PARAM_CHECK(api, coors_desc->dims[1] == voxel_coors_desc->dims[1]); - PARAM_CHECK(api, coors_desc->dims[1] == 3); - PARAM_CHECK(api, feats_desc->dims[0] >= voxel_feats_desc->dims[0]); - - // check large tensor - const size_t feats_element_num = mluOpGetTensorElementNum(feats_desc); - const size_t coors_element_num = mluOpGetTensorElementNum(coors_desc); - TENSOR_NUM_CHECK(api, feats_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(api, coors_element_num, LARGE_TENSOR_NUM, ""); - - // check element num zero - if (feats_element_num == 0 || coors_element_num == 0) { - *zero_element = true; - return MLUOP_STATUS_SUCCESS; - } - - // check workspace ptr - if (workspace_size > 0) { - PARAM_CHECK(api, workspace != NULL); - } - // input and output ptr check null - PARAM_CHECK(api, feats != NULL); - PARAM_CHECK(api, coors != NULL); - PARAM_CHECK(api, voxel_feats != NULL); - PARAM_CHECK(api, voxel_coors != NULL); - PARAM_CHECK(api, point2voxel_map != NULL); - PARAM_CHECK(api, voxel_points_count != NULL); - PARAM_CHECK(api, voxel_num != NULL); - - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpGetDynamicPointToVoxelForwardWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t feats_desc, - const mluOpTensorDescriptor_t coors_desc, size_t *workspace_size) { - const std::string api = "[mluOpGetDynamicPointToVoxelForwardWorkspaceSize]"; - PARAM_CHECK(api, handle != NULL); - // platform check - if (handle->arch < MLUOP_MLU370) { - LOG(ERROR) << "[mluOpGetDynamicPointToVoxelForwardWorkspaceSize] Only " - "mlu300 and above " - "devices are supported. 
" - << "Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - - PARAM_CHECK(api, feats_desc != NULL); - PARAM_CHECK(api, coors_desc != NULL); - PARAM_CHECK(api, workspace_size != NULL); - - { - cnnlUniqueSort_t unique_mode = CNNL_SORT_ASCEND; - cnnlUniqueDescriptor_t unique_desc; - - CALL_CNNL(cnnlCreateUniqueDescriptor(&unique_desc)); - CALL_CNNL(cnnlSetUniqueDescriptor(unique_desc, unique_mode, 0, true, true)); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(coors_desc, cnnl_input_desc); - CALL_CNNL(cnnlGetUniqueWorkspaceSize(cnnl_handle, unique_desc, - cnnl_input_desc, workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - - CALL_CNNL(cnnlDestroyUniqueDescriptor(unique_desc)); - } - - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpDynamicPointToVoxelForward( - const mluOpHandle_t handle, const mluOpReduceMode_t reduce_type, - const mluOpTensorDescriptor_t feats_desc, const void *feats, - const mluOpTensorDescriptor_t coors_desc, void *coors, void *workspace, - const size_t workspace_size, const mluOpTensorDescriptor_t voxel_feats_desc, - void *voxel_feats, const mluOpTensorDescriptor_t voxel_coors_desc, - void *voxel_coors, const mluOpTensorDescriptor_t point2voxel_map_desc, - void *point2voxel_map, - const mluOpTensorDescriptor_t voxel_points_count_desc, - void *voxel_points_count, const mluOpTensorDescriptor_t voxel_num_desc, - void *voxel_num) { - const std::string api = "[mluOpDynamicPointToVoxelForward]"; - // check params - bool zero_element = false; - - mluOpStatus_t ret = DynamicPointToVoxelForwardParamCheck( - api, handle, reduce_type, feats, coors, voxel_feats, voxel_coors, - point2voxel_map, voxel_points_count, voxel_num, workspace, workspace_size, - feats_desc, coors_desc, voxel_feats_desc, voxel_coors_desc, - point2voxel_map_desc, voxel_points_count_desc, voxel_num_desc, - &zero_element); - - if (ret != MLUOP_STATUS_SUCCESS) { - LOG(ERROR) << api - << " Error found during element verification, please check."; - return ret; - } - // check zero element - if (zero_element) { - VLOG(5) << "[mluOpDynamicPointToVoxelForward] Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - // generator - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("dynamic_point_to_voxel_forward"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA(true, "feats", feats, feats_desc, -100, 100); - GEN_CASE_DATA_REAL(true, "coors", coors, coors_desc); - GEN_CASE_DATA(false, "voxel_feats", voxel_feats, voxel_feats_desc, 0, 0); - GEN_CASE_DATA(false, "voxel_coors", voxel_coors, voxel_coors_desc, 0, 0); - GEN_CASE_DATA(false, "point2voxel_map", point2voxel_map, - point2voxel_map_desc, 0, 0); - GEN_CASE_DATA(false, "voxel_points_count", voxel_points_count, - voxel_points_count_desc, 0, 0); - GEN_CASE_DATA(false, "voxel_num", voxel_num, voxel_num_desc, 0, 0); - GEN_CASE_OP_PARAM_SINGLE(0, "dynamic_point_to_voxel_forward", "reduce_type", - reduce_type); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); - } - - const int num_points = feats_desc->dims[0]; - const int num_feats = feats_desc->dims[1]; - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - policyFuncDynamicPointToVoxelForward(handle, &k_dim, &k_type, num_points); - VLOG(5) << api << " Launch [" << k_type << ", " << k_dim.x << ", " << k_dim.y - << ", " << k_dim.z << "]."; - // 1. 
mask_fill coors - VLOG(5) << api << " launch KernelMaskFillCoorsForward start."; - CHECK_RETURN("[MaskFillCoorsForward]", - KernelMaskFillCoorsForward(k_dim, k_type, handle->queue, - num_points, coors)); - VLOG(5) << api << " launch KernelMaskFillCoorsForward end."; - - // 2. unique op - { - cnnlUniqueSort_t unique_mode = CNNL_SORT_ASCEND; - cnnlUniqueDescriptor_t unique_desc; - - CALL_CNNL(cnnlCreateUniqueDescriptor(&unique_desc)); - CALL_CNNL(cnnlSetUniqueDescriptor(unique_desc, unique_mode, 0, true, true)); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(coors_desc, cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(voxel_coors_desc, - cnnl_output_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(point2voxel_map_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(voxel_points_count_desc, - cnnl_counts_desc); - - CALL_CNNL(cnnlUnique_v2(cnnl_handle, unique_desc, cnnl_input_desc, coors, - workspace, workspace_size, (int *)voxel_num, - cnnl_output_desc, voxel_coors, cnnl_indices_desc, - point2voxel_map, cnnl_counts_desc, - voxel_points_count)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_counts_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - - CALL_CNNL(cnnlDestroyUniqueDescriptor(unique_desc)); - } - - // 3. reduce - // fill -inf or zero - VLOG(5) << "cnnlFill_v3 min value start."; - float inf_value = 0x0; - if (reduce_type == MLUOP_REDUCE_DMAX) { - inf_value = -INFINITY; - } - const float fill_value = inf_value; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(voxel_feats_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, voxel_feats)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - VLOG(5) << "cnnlFill_v3 min value end."; - - VLOG(5) << api << " launch KernelDynamicPointToVoxelForward start."; - CHECK_RETURN("[mluOpDynamicPointToVoxelForward]", - KernelDynamicPointToVoxelForward( - k_dim, k_type, handle->queue, reduce_type, feats, num_points, - num_feats, voxel_coors, voxel_num, point2voxel_map, - voxel_points_count, voxel_feats)); - VLOG(5) << api << " launch KernelDynamicPointToVoxelForward end."; - - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.h b/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.h deleted file mode 100644 index a2a64c866..000000000 --- a/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward.h +++ /dev/null @@ -1,39 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
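For reference while reviewing this deletion, here is a minimal CPU sketch of what the forward entry point above computes, assuming point2voxel_map has already been produced by the mask-fill and unique steps (the function name, std::vector signature, and reduce_max flag are illustrative, not part of the mlu-ops API):

#include <cstdint>
#include <limits>
#include <vector>

// Reference semantics only: scatter-reduce feats[num_points][num_feats]
// into voxel_feats[num_voxels][num_feats] by point2voxel_map, as the
// device kernels do with atomic max/add plus a final mean divide.
void dynamicPointToVoxelRef(bool reduce_max, const std::vector<float> &feats,
                            const std::vector<int32_t> &point2voxel_map,
                            int32_t num_points, int32_t num_feats,
                            int32_t num_voxels, std::vector<float> &voxel_feats,
                            std::vector<int32_t> &voxel_points_count) {
  // -inf for max, zero for mean: mirrors the cnnlFill_v3 call above.
  const float init =
      reduce_max ? -std::numeric_limits<float>::infinity() : 0.0f;
  voxel_feats.assign(num_voxels * num_feats, init);
  voxel_points_count.assign(num_voxels, 0);
  for (int32_t i = 0; i < num_points; ++i) {
    const int32_t v = point2voxel_map[i];
    if (v == -1) continue;  // point had a negative coordinate; masked out
    ++voxel_points_count[v];
    for (int32_t c = 0; c < num_feats; ++c) {
      float &out = voxel_feats[v * num_feats + c];
      const float x = feats[i * num_feats + c];
      out = reduce_max ? (x > out ? x : out) : out + x;
    }
  }
  if (!reduce_max) {  // MLUOP_REDUCE_DMEAN: divide by per-voxel count
    for (int32_t v = 0; v < num_voxels; ++v) {
      if (voxel_points_count[v] == 0) continue;
      for (int32_t c = 0; c < num_feats; ++c) {
        voxel_feats[v * num_feats + c] /= voxel_points_count[v];
      }
    }
  }
}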
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#ifndef KERNELS_DYNAMIC_POINT_TO_VOXEL_FORWARD_DYNAMIC_POINT_TO_VOXEL_FORWARD_H -#define KERNELS_DYNAMIC_POINT_TO_VOXEL_FORWARD_DYNAMIC_POINT_TO_VOXEL_FORWARD_H - -#include "mlu_op.h" - -mluOpStatus_t MLUOP_WIN_API -KernelMaskFillCoorsForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, - cnrtQueue_t queue, int32_t num_points, void *coors); - -mluOpStatus_t MLUOP_WIN_API KernelDynamicPointToVoxelForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpReduceMode_t reduce_mode, const void *feats, int32_t num_points, - int32_t num_voxel, void *voxel_coors, void *voxel_num, - void *point2voxel_map, void *voxel_points_count, void *voxel_feats); - -#endif // KERNELS_DYNAMIC_POINT_TO_VOXEL_FORWARD_DYNAMIC_ - // POINT_TO_VOXEL_FORWARD_H diff --git a/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward_union1.mlu b/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward_union1.mlu deleted file mode 100644 index b0c7d8711..000000000 --- a/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_forward_union1.mlu +++ /dev/null @@ -1,338 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "dynamic_point_to_voxel_forward.h" - -#include "core/logging.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -#define COORS_IDX 1 -#define COORS_XYZ 3 - -__mlu_func__ void load(const float *input_addr, float *nram_input, - const int deal_num, const int pi) { - int offset = (pi % 2) * 2 * deal_num; - float *nram_input_p = nram_input + offset; - __memcpy_async(nram_input_p, input_addr, deal_num * sizeof(float), - GDRAM2NRAM); -} - -__mlu_func__ void compute(float *nram_input, int *nram_points_count, - const int deal_num, const int pi) { - int offset = (pi % 2) * 2 * deal_num; - float *nram_input_p = nram_input + offset; - float *nram_output_p = nram_input + offset + deal_num; -#if (__BANG_ARCH__ >= 322) && (__BANG_ARCH__ != 372) - __bang_div(nram_output_p, nram_input_p, (float)(nram_points_count[pi]), - deal_num); -#else - __bang_mul_scalar(nram_output_p, nram_input_p, - 1.0 / (float)nram_points_count[pi], deal_num); -#endif -} - -__mlu_func__ void store(float *output_addr, float *nram_output, - const int deal_num, const int pi) { - int offset = (pi % 2) * 2 * deal_num; - float *nram_output_p = nram_output + offset + deal_num; - __memcpy_async(output_addr, nram_output_p, deal_num * sizeof(float), - NRAM2GDRAM); -} - -__mlu_func__ void lcsFunc(float *base_input_addr, int *base_points_count, - float *nram_input, const int repeat_num, - const int rem_num, const int deal_h) { - float *input_addr = NULL; - float *output_addr = NULL; - if (repeat_num > 0) { - input_addr = base_input_addr; - load(input_addr, nram_input, deal_h, 0); - __sync(); - } - - if (repeat_num > 1) { - // L(vi=1) - input_addr = base_input_addr + deal_h; - load(input_addr, nram_input, deal_h, 1); - // C(vi=0) - compute(nram_input, base_points_count, deal_h, 0); - __sync(); - } - - for (int v_iter = 0; v_iter < repeat_num - 2; v_iter++) { - // S(vi) - output_addr = base_input_addr + v_iter * deal_h; - store(output_addr, nram_input, deal_h, v_iter); - // C(vi+1) - compute(nram_input, base_points_count, deal_h, v_iter + 1); - // L(vi+2) - input_addr = base_input_addr + (v_iter + 2) * deal_h; - load(input_addr, nram_input, deal_h, v_iter + 2); - __sync_io_move_compute(); - } - - if (repeat_num > 1) { - // S(vi = repeat_num - 2) - output_addr = base_input_addr + (repeat_num - 2) * deal_h; - store(output_addr, nram_input, deal_h, repeat_num - 2); - } - if (rem_num > 0) { - // L[repeat_num] - input_addr = base_input_addr + repeat_num * deal_h; - load(input_addr, nram_input, rem_num, repeat_num); - } - if (repeat_num > 0) { - // C[repeat_num - 1] - compute(nram_input, base_points_count, deal_h, repeat_num - 1); - } - __sync(); - if (repeat_num > 0) { - // S[repeat_num - 1] - output_addr = base_input_addr + (repeat_num - 1) * deal_h; - store(output_addr, nram_input, deal_h, repeat_num - 1); - } - if (rem_num > 0) { - // C[repeat_num] - compute(nram_input, base_points_count, rem_num, repeat_num); - __sync(); - // S[repeat_num] - output_addr = base_input_addr + repeat_num * deal_h; - store(output_addr, nram_input, deal_h, repeat_num); - } -} - -__mlu_global__ void MLUKernelDynamicPointToVoxelForward( - mluOpReduceMode_t reduce_mode, const float *feats, int32_t num_points, - int32_t num_feats, int32_t *voxel_coors, int32_t *voxel_num, - int *point2voxel_map, int32_t *voxel_points_count, float *voxel_feats) { -#if __BANG_ARCH__ >= 372 - if (__is_mpu()) { - 
return; - } - bool reduce_map = false; - if (voxel_coors[0] == -1) { - reduce_map = true; - } - __sync_all_ipu(); - if (voxel_coors[0] == -1) { - if (taskId == 0) { - int32_t num_voxel = voxel_num[0] - 1; - __gdramset(voxel_num, 1, num_voxel); - __memcpy_async(voxel_coors, voxel_coors + COORS_XYZ, - (num_voxel + 1) * COORS_XYZ * sizeof(int32_t), - GDRAM2GDRAM); - __memcpy_async(voxel_points_count, voxel_points_count + COORS_IDX, - (num_voxel + 1) * COORS_IDX * sizeof(int32_t), - GDRAM2GDRAM); - __sync(); - } - } - __sync_all_ipu(); - - const int remainder = num_points % taskDim; - const int points_per_core = num_points / taskDim + (int)(taskId < remainder); - // offset of the point that core processes - const int points_offset = taskId * (num_points / taskDim) + - (taskId < remainder ? taskId : remainder); - // nram space - // |feats| - const int max_deal_h = ((MAX_NRAM_SIZE - sizeof(int32_t)) / sizeof(float)); - int deal_h = 0; - int deal_p = 0; - if (num_feats > max_deal_h) { - deal_p = 1; - deal_h = max_deal_h; - } else { - deal_h = num_feats; - deal_p = (MAX_NRAM_SIZE / (deal_h * sizeof(float) + sizeof(int))); - } - - float *nram_feats = (float *)nram_buffer; - int32_t *nram_map = (int32_t *)nram_feats + deal_p * deal_h; - const float *base_feats = feats + points_offset * num_feats; - int32_t *base_map = point2voxel_map + points_offset; - const int repeat_p = points_per_core / deal_p; - const int rem_p = points_per_core % deal_p; - const int repeat_h = num_feats / deal_h; - const int rem_h = num_feats % deal_h; - - for (int32_t p_iter = 0; p_iter <= repeat_p; p_iter++) { - int32_t deal_p_num = (p_iter < repeat_p) ? deal_p : rem_p; - if (deal_p_num == 0) { - break; - } - int32_t deal_p_num_offset = p_iter * deal_p * num_feats; - int32_t deal_map_offset = p_iter * deal_p * 1; - int32_t *base_map_addr = base_map + deal_map_offset; - // load map - __memcpy(nram_map, base_map_addr, deal_p_num * sizeof(int32_t), GDRAM2NRAM); - for (int32_t h_iter = 0; h_iter <= repeat_h; h_iter++) { - int32_t deal_h_num = (h_iter < repeat_h) ? deal_h : rem_h; - if (deal_h_num == 0) { - break; - } - int32_t deal_h_num_offset = deal_p_num_offset + h_iter * deal_h; - const float *base_feats_addr = base_feats + deal_h_num_offset; - // load - __memcpy_async(nram_feats, base_feats_addr, - deal_p_num * deal_h_num * sizeof(float), GDRAM2NRAM); - if (reduce_map) { - __bang_add_scalar(nram_map, nram_map, -1, deal_p_num); - } - __sync(); - // index and atomic - for (int32_t i = 0; i < deal_p_num; i++) { - int reduce_to = nram_map[i]; - if (reduce_to == -1) continue; - float *voxel_feats_offset = - voxel_feats + reduce_to * num_feats + h_iter * deal_h; - if (reduce_mode == MLUOP_REDUCE_DMAX) { - __bang_atomic_reduce_max(voxel_feats_offset, nram_feats + i * deal_h, - deal_h_num); - } else { - __bang_atomic_reduce_add(voxel_feats_offset, nram_feats + i * deal_h, - deal_h_num); - } - } - } - // store map - if (reduce_map) { - __memcpy(base_map_addr, nram_map, deal_p_num * sizeof(int32_t), - NRAM2GDRAM); - } - } - __sync_all_ipu(); - - int32_t num_voxel = voxel_num[0]; - if (reduce_mode == MLUOP_REDUCE_DMEAN) { - const int rem_voxel = num_voxel % taskDim; - const int voxel_per_core = num_voxel / taskDim + (int)(taskId < rem_voxel); - // offset of the point that core processes - const int voxel_offset = taskId * (num_voxel / taskDim) + - (taskId < rem_voxel ? 
taskId : rem_voxel);
-    // nram space
-    // |voxel_points_count|
-    // |voxel_feats_ping|voxel_feats_pong|
-    const int max_deal_h =
-        (MAX_NRAM_SIZE - sizeof(int32_t)) / (4 * sizeof(float));
-    int deal_h = 0;
-    int deal_v = 0;
-    if (num_feats > max_deal_h) {
-      deal_v = 1;
-      deal_h = max_deal_h;
-    } else {
-      deal_h = num_feats;
-      deal_v = (MAX_NRAM_SIZE - 4 * deal_h * sizeof(float)) / (sizeof(int32_t));
-    }
-
-    int real_deal_v = deal_v > voxel_per_core ? voxel_per_core : deal_v;
-
-    int *nram_points_count = (int *)nram_buffer;
-    float *voxel_feats_ping = (float *)(nram_points_count + real_deal_v);
-    int *base_points_count = (int *)voxel_points_count + voxel_offset;
-    float *base_voxel_feats = (float *)voxel_feats + voxel_offset * num_feats;
-    const int repeat_v = voxel_per_core / deal_v;
-    const int rem_v = voxel_per_core % deal_v;
-    const int repeat_h = num_feats / deal_h;
-    const int rem_h = num_feats % deal_h;
-    for (int v_iter = 0; v_iter <= repeat_v; v_iter++) {
-      int deal_v_num = (v_iter < repeat_v) ? deal_v : rem_v;
-      if (deal_v_num == 0) {
-        break;
-      }
-      float *base_voxel_feats_addr =
-          base_voxel_feats + v_iter * deal_v * num_feats;
-      int *base_points_count_addr = base_points_count + v_iter * deal_v;
-      __memcpy(nram_points_count, base_points_count_addr,
-               deal_v_num * sizeof(int), GDRAM2NRAM);
-      if (num_feats <= max_deal_h) {
-        // L(vi=0)
-        if (deal_v_num > 0) {
-          float *input_addr = base_voxel_feats_addr;
-          load(input_addr, voxel_feats_ping, deal_h, 0);
-          __sync();
-        }
-
-        if (deal_v_num > 1) {
-          // L(vi=1)
-          float *input_addr = base_voxel_feats_addr + deal_h;
-          load(input_addr, voxel_feats_ping, deal_h, 1);
-          // C(vi=0)
-          compute(voxel_feats_ping, nram_points_count, deal_h, 0);
-          __sync();
-        }
-
-        for (int vi = 0; vi < deal_v_num - 2; vi++) {
-          // S(vi)
-          float *output_addr = base_voxel_feats_addr + vi * deal_h;
-          store(output_addr, voxel_feats_ping, deal_h, vi);
-          // C(vi+1)
-          compute(voxel_feats_ping, nram_points_count, deal_h, vi + 1);
-          // L(vi+2)
-          float *input_addr = base_voxel_feats_addr + (vi + 2) * deal_h;
-          load(input_addr, voxel_feats_ping, deal_h, vi + 2);
-          __sync();
-        }
-
-        if (deal_v_num > 1) {
-          // S(vi = deal_v_num - 2)
-          float *output_addr =
-              base_voxel_feats_addr + (deal_v_num - 2) * deal_h;
-          store(output_addr, voxel_feats_ping, deal_h, deal_v_num - 2);
-          __sync();
-        }
-        if (deal_v_num > 0) {
-          // C[deal_v_num - 1]
-          compute(voxel_feats_ping, nram_points_count, deal_h, deal_v_num - 1);
-        }
-        __sync();
-        if (deal_v_num > 0) {
-          // S[deal_v_num - 1]
-          float *output_addr =
-              base_voxel_feats_addr + (deal_v_num - 1) * deal_h;
-          store(output_addr, voxel_feats_ping, deal_h, deal_v_num - 1);
-        }
-      } else {
-        // vi = points_offset + v_iter
-        lcsFunc(base_voxel_feats_addr, nram_points_count, voxel_feats_ping,
-                repeat_h, rem_h, deal_h);
-      }
-    }
-  }
-#endif
-}
-
-mluOpStatus_t MLUOP_WIN_API KernelDynamicPointToVoxelForward(
-    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-    mluOpReduceMode_t reduce_mode, const void *feats, int32_t num_points,
-    int32_t num_feats, void *voxel_coors, void *voxel_num,
-    void *point2voxel_map, void *voxel_points_count, void *voxel_feats) {
-  KERNEL_CHECK(MLUKernelDynamicPointToVoxelForward<<<k_dim, k_type, queue>>>(
-      reduce_mode, (float *)feats, num_points, num_feats,
-      (int32_t *)voxel_coors, (int32_t *)voxel_num, (int *)point2voxel_map,
-      (int32_t *)voxel_points_count, (float *)voxel_feats));
-  return MLUOP_STATUS_SUCCESS;
-}
diff --git a/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_mask_block.mlu
b/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_mask_block.mlu deleted file mode 100644 index 63f350b47..000000000 --- a/kernels/dynamic_point_to_voxel_forward/dynamic_point_to_voxel_mask_block.mlu +++ /dev/null @@ -1,175 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "dynamic_point_to_voxel_forward.h" - -#include "core/logging.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -#define COORS_XYZ 3 - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -__mlu_func__ void load(const int32_t *input_addr, int32_t *nram_input, - const int32_t pingpong, const int32_t deal_num, - const int32_t pi) { - int32_t offset = (pi % 2) * pingpong; - int32_t *nram_input_ptr = nram_input + offset; - __memcpy_async(nram_input_ptr, input_addr, deal_num * sizeof(int), - GDRAM2NRAM); -} - -__mlu_func__ void compute(int32_t *coors_ping_in, int32_t *mask_x, - const int32_t pingpong, int32_t deal_num, - const int32_t pi) { - int32_t offset = (pi % 2) * pingpong; - int32_t *coors_ping_in_p = coors_ping_in + offset; - int32_t *coors_ping_out_p = coors_ping_in_p + pingpong / 2; - int32_t N = deal_num / COORS_XYZ; - int32_t *mask_y = mask_x + N; - int32_t *mask_z = mask_x + 2 * N; - __bang_transpose(coors_ping_out_p, coors_ping_in_p, N, 3); - - __bang_int322float((float *)coors_ping_out_p, coors_ping_out_p, deal_num, 0); - __bang_int322float((float *)mask_x, mask_x, deal_num, 0); - - __bang_lt_scalar((float *)mask_x, (float *)coors_ping_out_p, 0, 3 * N); - __bang_float2int32((int32_t *)coors_ping_out_p, (float *)coors_ping_out_p, - deal_num, 0); - __bang_float2int32((int32_t *)mask_x, (float *)mask_x, deal_num, 0); - __bang_add((int *)mask_x, (int *)mask_x, (int *)mask_y, N); - __bang_add((int *)mask_x, (int *)mask_x, (int *)mask_z, N); - __bang_not((int *)mask_x, (int *)mask_x, N); - - __bang_cycle_mul((int *)coors_ping_out_p, (int *)coors_ping_out_p, - (int *)mask_x, deal_num, N); - __bang_add_scalar((int *)mask_x, (int *)mask_x, -1, N); - __bang_cycle_add((int *)coors_ping_out_p, (int *)coors_ping_out_p, - (int *)mask_x, deal_num, N); - __bang_transpose(coors_ping_in_p, coors_ping_out_p, 3, N); -} - -__mlu_func__ void store(int32_t *output_addr, int32_t *nram_output, - const int32_t pingpong, const int32_t deal_num, - 
const int32_t pi) {
-  int32_t offset = (pi % 2) * pingpong;
-  int32_t *nram_output_ptr = nram_output + offset;
-  __memcpy_async(output_addr, nram_output_ptr, deal_num * sizeof(int),
-                 NRAM2GDRAM);
-}
-
-__mlu_global__ void MLUKernelMaskFillCoorsForward(int32_t num_points,
-                                                  int32_t *coors) {
-#if __BANG_ARCH__ >= 372
-  if (__is_mpu()) {
-    return;
-  }
-  int32_t remainder = num_points % taskDim;
-  int32_t points_per_core =
-      num_points / taskDim + (int32_t)(taskId < remainder);
-  // offset of the point that core processes
-  int32_t points_offset = taskId * (num_points / taskDim) +
-                          (taskId < remainder ? taskId : remainder);
-  // nram space
-
-  // |coors_ping_in|coors_ping_out|coors_pong_in|coors_pong_out|mask_x|mask_y|mask_z|
-  int32_t max_deal_num =
-      FLOOR_ALIGN(MAX_NRAM_SIZE / 5, COORS_XYZ * sizeof(int32_t)) /
-      sizeof(int32_t);
-  int32_t coors_num = points_per_core * COORS_XYZ;
-  int32_t deal_num = max_deal_num > coors_num ? coors_num : max_deal_num;
-  int32_t repeat_n = coors_num / max_deal_num;
-  int32_t rem_num = coors_num % max_deal_num;
-
-  int32_t *coors_ping_in = (int32_t *)nram_buffer;
-  int32_t *mask_x = (int32_t *)coors_ping_in + 4 * deal_num;
-  int32_t pingpong = 2 * deal_num;
-  int32_t *base_coors = (int32_t *)coors + points_offset * COORS_XYZ;
-
-  if (repeat_n > 0) {
-    int32_t *input_addr = base_coors;
-    load(input_addr, coors_ping_in, pingpong, deal_num, 0);
-    __sync();
-  }
-
-  if (repeat_n > 1) {
-    // L(vi=1)
-    int32_t *input_addr = base_coors + deal_num;
-    load(input_addr, coors_ping_in, pingpong, deal_num, 1);
-    // C(vi=0)
-    compute(coors_ping_in, mask_x, pingpong, deal_num, 0);
-    __sync();
-  }
-
-  for (int32_t v_iter = 0; v_iter < repeat_n - 2; v_iter++) {
-    // S(vi)
-    int32_t *output_addr = base_coors + v_iter * deal_num;
-    store(output_addr, coors_ping_in, pingpong, deal_num, v_iter);
-    // C(vi+1)
-    compute(coors_ping_in, mask_x, pingpong, deal_num, v_iter + 1);
-    // L(vi+2)
-    int32_t *input_addr = base_coors + (v_iter + 2) * deal_num;
-    load(input_addr, coors_ping_in, pingpong, deal_num, v_iter + 2);
-    __sync();
-  }
-
-  if (repeat_n > 1) {
-    // S(vi = repeat_n - 2)
-    int32_t *output_addr = base_coors + (repeat_n - 2) * deal_num;
-    store(output_addr, coors_ping_in, pingpong, deal_num, repeat_n - 2);
-  }
-  if (rem_num > 0) {
-    // L[repeat_n]
-    int32_t *input_addr = base_coors + repeat_n * deal_num;
-    load(input_addr, coors_ping_in, pingpong, rem_num, repeat_n);
-  }
-  if (repeat_n > 0) {
-    // C[repeat_n - 1]
-    compute(coors_ping_in, mask_x, pingpong, deal_num, repeat_n - 1);
-  }
-  __sync();
-  if (repeat_n > 0) {
-    // S[repeat_n - 1]
-    int32_t *output_addr = base_coors + (repeat_n - 1) * deal_num;
-    store(output_addr, coors_ping_in, pingpong, deal_num, repeat_n - 1);
-  }
-  if (rem_num > 0) {
-    // C[repeat_n]
-    compute(coors_ping_in, mask_x, pingpong, rem_num, repeat_n);
-    __sync();
-    // S[repeat_n]
-    int32_t *output_addr = base_coors + repeat_n * deal_num;
-    store(output_addr, coors_ping_in, pingpong, rem_num, repeat_n);
-  }
-
-#endif
-}
-
-mluOpStatus_t MLUOP_WIN_API
-KernelMaskFillCoorsForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                           cnrtQueue_t queue, int32_t num_points, void *coors) {
-  KERNEL_CHECK(MLUKernelMaskFillCoorsForward<<<k_dim, k_type, queue>>>(
-      num_points, (int32_t *)coors));
-  return MLUOP_STATUS_SUCCESS;
-}
diff --git a/kernels/get_indice_pairs/get_indice_pairs.cpp b/kernels/get_indice_pairs/get_indice_pairs.cpp
deleted file mode 100644
index f70489dd2..000000000
--- a/kernels/get_indice_pairs/get_indice_pairs.cpp
+++ /dev/null
@@ -1,252 +0,0 @@
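The two device kernels deleted above (the mean-division path in dynamic_point_to_voxel_forward_union1.mlu and the coordinate mask-fill kernel just before this note) share one idiom: a load/compute/store (L/C/S) software pipeline over ping-pong NRAM buffers, where step i loads chunk i while chunk i-1 is computed and chunk i-2 is stored, so DMA overlaps compute. A CPU-side schematic of that schedule, written for this note (the buffer handling is simplified to one slot per chunk; this is not the mlu-ops code):

#include <cstring>
#include <vector>

// Schematic of the L(i)/C(i-1)/S(i-2) schedule, with a two-slot
// ping-pong buffer selected by (i % 2); synchronous memcpy stands in
// for __memcpy_async and doubling stands in for the real compute step.
void lcsPipeline(float *gdram, int chunks, int chunk_len) {
  std::vector<float> pingpong(2 * chunk_len);
  auto slot = [&](int i) { return pingpong.data() + (i % 2) * chunk_len; };
  for (int i = 0; i < chunks + 2; ++i) {
    if (i >= 2) {  // S(i-2): flush the chunk finished on the last step
      std::memcpy(gdram + (i - 2) * chunk_len, slot(i - 2),
                  chunk_len * sizeof(float));
    }
    if (i >= 1 && i <= chunks) {  // C(i-1): compute on the other slot
      for (int k = 0; k < chunk_len; ++k) slot(i - 1)[k] *= 2.0f;
    }
    if (i < chunks) {  // L(i): refill the slot that was just flushed
      std::memcpy(slot(i), gdram + i * chunk_len, chunk_len * sizeof(float));
    }
    // On device, a __sync() here lets the async copies of this step
    // retire before their slots are reused two steps later.
  }
}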
-/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/mlu_env.h" -#include "core/tensor.h" -#include "kernels/get_indice_pairs/get_indice_pairs_structs.h" -#include "kernels/get_indice_pairs/normal_get_indice_pairs.h" -#include "mlu_op.h" - -static void getIndicePairsGencase( - mluOpHandle_t handle, - const mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const mluOpTensorDescriptor_t indices_desc, const void *indices, - const mluOpTensorDescriptor_t indice_pairs_desc, void *indice_pairs, - const mluOpTensorDescriptor_t out_indices_desc, void *out_indices, - const mluOpTensorDescriptor_t indice_num_desc, void *indice_num) { - GEN_CASE_START("get_indice_pairs"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA_REAL(true, "indices", indices, indices_desc); - GEN_CASE_DATA_REAL(false, "out_indices", out_indices, out_indices_desc); - GEN_CASE_DATA_REAL(false, "indice_pairs", indice_pairs, indice_pairs_desc); - GEN_CASE_DATA_REAL(false, "indice_num", indice_num, indice_num_desc); - GEN_CASE_OP_PARAM_SINGLE(0, "get_indice_pairs", "dimnb", - sparse_conv_desc->dimNb); - GEN_CASE_OP_PARAM_SINGLE(0, "get_indice_pairs", "batch", - sparse_conv_desc->batch); - GEN_CASE_OP_PARAM_ARRAY(1, "get_indice_pairs", "pad", sparse_conv_desc->pad, - sparse_conv_desc->dimNb == 4 ? 2 : 3); - GEN_CASE_OP_PARAM_ARRAY(1, "get_indice_pairs", "stride", - sparse_conv_desc->stride, - sparse_conv_desc->dimNb == 4 ? 2 : 3); - GEN_CASE_OP_PARAM_ARRAY(1, "get_indice_pairs", "dilation", - sparse_conv_desc->dilation, - sparse_conv_desc->dimNb == 4 ? 2 : 3); - GEN_CASE_OP_PARAM_ARRAY(1, "get_indice_pairs", "input_space", - sparse_conv_desc->input_space, - sparse_conv_desc->dimNb == 4 ? 2 : 3); - GEN_CASE_OP_PARAM_ARRAY(1, "get_indice_pairs", "filter_space", - sparse_conv_desc->filter_space, - sparse_conv_desc->dimNb == 4 ? 2 : 3); - GEN_CASE_OP_PARAM_ARRAY(1, "get_indice_pairs", "output_space", - sparse_conv_desc->output_space, - sparse_conv_desc->dimNb == 4 ? 
2 : 3); - GEN_CASE_OP_PARAM_SINGLE(2, "get_indice_pairs", "sub_m", - sparse_conv_desc->sub_m); - GEN_CASE_OP_PARAM_SINGLE(2, "get_indice_pairs", "transpose", - sparse_conv_desc->transpose); - GEN_CASE_OP_PARAM_SINGLE(2, "get_indice_pairs", "inverse", - sparse_conv_desc->inverse); - GEN_CASE_HANDLE_PARAM(); - GEN_CASE_TEST_PARAM_NEW(false, false, true, 0.003, 0.003, 0); -} - -static mluOpStatus_t internalGetIndicePairs( - mluOpHandle_t handle, const std::string interface_name, - mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const mluOpTensorDescriptor_t indices_desc, const void *indices, - void *workspace, size_t workspace_size, - const mluOpTensorDescriptor_t indice_pairs_desc, void *indice_pairs, - const mluOpTensorDescriptor_t out_indices_desc, void *out_indices, - const mluOpTensorDescriptor_t indice_num_desc, void *indice_num, - const bool is_get_workspace, size_t *return_ws) { - PARAM_CHECK(interface_name, handle != NULL); - PARAM_CHECK(interface_name, sparse_conv_desc != NULL); - PARAM_CHECK(interface_name, indices_desc != NULL); - PARAM_CHECK(interface_name, indice_pairs_desc != NULL); - PARAM_CHECK(interface_name, out_indices_desc != NULL); - PARAM_CHECK(interface_name, indice_num_desc != NULL); - - // check platform - if (handle->arch < 372) { - LOG(ERROR) << interface_name - << " Only mlu300 and above devices are supported." - << " Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - - // sparse_conv_desc dimNb check - int sparse_conv_dimNb = sparse_conv_desc->dimNb; - - // indices indice_pairs out_indices indice_num - // tensor dim check - PARAM_CHECK(interface_name, indices_desc->dim == 2); - PARAM_CHECK(interface_name, indice_pairs_desc->dim == 3); - PARAM_CHECK(interface_name, out_indices_desc->dim == 2); - PARAM_CHECK(interface_name, indice_num_desc->dim == 1); - PARAM_CHECK(interface_name, indices_desc->dims[1] == 4); - PARAM_CHECK(interface_name, out_indices_desc->dims[1] == 4); - PARAM_CHECK(interface_name, indice_pairs_desc->dims[1] == 2); - - // check shape - PARAM_CHECK(interface_name, - indice_pairs_desc->dims[2] == indices_desc->dims[0]); - PARAM_CHECK(interface_name, - indice_pairs_desc->dims[0] == indice_num_desc->dims[0]); - int kernel_volume = 1; - for (int i = 0; i < sparse_conv_dimNb - 2; i++) { - kernel_volume *= sparse_conv_desc->filter_space[i]; - } - int output_spaces = sparse_conv_desc->batch; - int input_spaces = sparse_conv_desc->batch; - for (int i = 0; i < sparse_conv_dimNb - 2; i++) { - output_spaces *= sparse_conv_desc->output_space[i]; - input_spaces *= sparse_conv_desc->input_space[i]; - } - PARAM_CHECK_LE(interface_name, indices_desc->dims[0], input_spaces); - for (int i = 0; i < sparse_conv_dimNb - 2; i++) { - PARAM_CHECK_GE(interface_name, sparse_conv_desc->pad[i], 0); - PARAM_CHECK_GE(interface_name, sparse_conv_desc->dilation[i], 1); - PARAM_CHECK_GE(interface_name, sparse_conv_desc->stride[i], 1); - if (sparse_conv_desc->dilation[i] != 1 && - sparse_conv_desc->stride[i] != 1) { - return MLUOP_STATUS_BAD_PARAM; - } - } - PARAM_CHECK(interface_name, indice_pairs_desc->dims[0] == kernel_volume); - PARAM_CHECK_LE(interface_name, kernel_volume, 4096); - PARAM_CHECK_LE(interface_name, out_indices_desc->dims[0], output_spaces); - - // large tensor - PARAM_CHECK_LE(interface_name, indices_desc->dims[0], - INDICE_IN_LARGE_TENSOR_NUM); - if (mluOpGetTensorElementNum(indices_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(out_indices_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(indice_pairs_desc) >= 
LARGE_TENSOR_NUM ||
-      mluOpGetTensorElementNum(indice_num_desc) >= LARGE_TENSOR_NUM) {
-    LOG(ERROR) << interface_name << " Overflow max tensor num."
-               << " Currently, MLU-OPS supports tensor num smaller than 2^31.";
-    return MLUOP_STATUS_NOT_SUPPORTED;
-  }
-
-  // tensor datatype check
-  PARAM_CHECK_EQ(interface_name, indices_desc->dtype, MLUOP_DTYPE_INT32);
-  PARAM_CHECK_EQ(interface_name, indice_pairs_desc->dtype, MLUOP_DTYPE_INT32);
-  PARAM_CHECK_EQ(interface_name, out_indices_desc->dtype, MLUOP_DTYPE_INT32);
-  PARAM_CHECK_EQ(interface_name, indice_num_desc->dtype, MLUOP_DTYPE_INT32);
-  // special check
-  int sub_m = sparse_conv_desc->sub_m;
-  if (sub_m) {
-    for (int i = 0; i < sparse_conv_dimNb - 2; i++) {
-      PARAM_CHECK_EQ(interface_name, sparse_conv_desc->input_space[i],
-                     sparse_conv_desc->output_space[i]);
-      PARAM_CHECK_EQ(interface_name, sparse_conv_desc->stride[i], 1);
-      PARAM_CHECK_EQ(interface_name, sparse_conv_desc->dilation[i], 1);
-    }
-  }
-
-  // check zero element
-  if (mluOpGetTensorElementNum(indices_desc) == 0 ||
-      mluOpGetTensorElementNum(indice_pairs_desc) == 0 ||
-      mluOpGetTensorElementNum(out_indices_desc) == 0 ||
-      mluOpGetTensorElementNum(indice_num_desc) == 0) {
-    sparse_conv_desc->num_act_out = 0;
-    return MLUOP_STATUS_SUCCESS;
-  }
-
-  // check nullptr
-  if (!is_get_workspace) {
-    PARAM_CHECK(interface_name, indices != NULL);
-    PARAM_CHECK(interface_name, indice_pairs != NULL);
-    PARAM_CHECK(interface_name, out_indices != NULL);
-    PARAM_CHECK(interface_name, indice_num != NULL);
-    if (workspace_size != 0) {
-      PARAM_CHECK(interface_name, workspace != NULL);
-    }
-  }
-  // gencase
-  if (!is_get_workspace && MLUOP_GEN_CASE_ON_NEW) {
-    getIndicePairsGencase(handle, sparse_conv_desc, indices_desc, indices,
-                          indice_pairs_desc, indice_pairs, out_indices_desc,
-                          out_indices, indice_num_desc, indice_num);
-  }
-
-  // call normal implementation
-  mluOpStatus_t return_status;
-  return_status = normalGetIndicePairs(
-      handle, interface_name, sparse_conv_desc, indices_desc, indices,
-      workspace, workspace_size, indice_pairs_desc, indice_pairs,
-      out_indices_desc, out_indices, indice_num_desc, indice_num,
-      is_get_workspace, return_ws);
-
-  if (!is_get_workspace) {
-    GEN_CASE_END();
-  }
-  return return_status;
-}
-
-mluOpStatus_t MLUOP_WIN_API mluOpGetIndicePairs(
-    mluOpHandle_t handle, mluOpSparseConvolutionDescriptor_t sparse_conv_desc,
-    const mluOpTensorDescriptor_t indices_desc, const void *indices,
-    void *workspace, const size_t workspace_size,
-    const mluOpTensorDescriptor_t indice_pairs_desc, void *indice_pairs,
-    const mluOpTensorDescriptor_t out_indices_desc, void *out_indices,
-    const mluOpTensorDescriptor_t indice_num_desc, void *indice_num) {
-  std::string interface_name = "[mluOpGetIndicePairs]";
-  return internalGetIndicePairs(
-      handle, interface_name, sparse_conv_desc, indices_desc, indices,
-      workspace, workspace_size, indice_pairs_desc, indice_pairs,
-      out_indices_desc, out_indices, indice_num_desc, indice_num, false, NULL);
-}
-
-mluOpStatus_t MLUOP_WIN_API mluOpGetIndicePairsWorkspaceSize(
-    mluOpHandle_t handle, mluOpSparseConvolutionDescriptor_t sparse_conv_desc,
-    const mluOpTensorDescriptor_t indices_desc,
-    const mluOpTensorDescriptor_t indice_pairs_desc,
-    const mluOpTensorDescriptor_t out_indices_desc,
-    const mluOpTensorDescriptor_t indice_num_desc, size_t *workspace_size) {
-  std::string interface_name = "[mluOpGetIndicePairsWorkspaceSize]";
-  PARAM_CHECK(interface_name, handle != NULL);
-  PARAM_CHECK(interface_name, sparse_conv_desc != NULL);
-
PARAM_CHECK(interface_name, indices_desc != NULL); - PARAM_CHECK(interface_name, indice_pairs_desc != NULL); - PARAM_CHECK(interface_name, out_indices_desc != NULL); - PARAM_CHECK(interface_name, indice_num_desc != NULL); - PARAM_CHECK(interface_name, workspace_size != NULL); - if (mluOpGetTensorElementNum(indices_desc) == 0 || - mluOpGetTensorElementNum(indice_pairs_desc) == 0 || - mluOpGetTensorElementNum(out_indices_desc) == 0 || - mluOpGetTensorElementNum(indice_num_desc) == 0) { - workspace_size[0] = 0; - return MLUOP_STATUS_SUCCESS; - } - - return internalGetIndicePairs(handle, interface_name, sparse_conv_desc, - indices_desc, NULL, NULL, 0, indice_pairs_desc, - NULL, out_indices_desc, NULL, indice_num_desc, - NULL, true, workspace_size); -} diff --git a/kernels/get_indice_pairs/get_indice_pairs_block.mlu b/kernels/get_indice_pairs/get_indice_pairs_block.mlu deleted file mode 100644 index c249b6e7d..000000000 --- a/kernels/get_indice_pairs/get_indice_pairs_block.mlu +++ /dev/null @@ -1,558 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
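The shape checks in get_indice_pairs.cpp deleted above reduce to a few products over the sparse-conv descriptor fields. A small self-contained sketch of that arithmetic (all values hypothetical; note the deleted code reads output_space directly from the descriptor, while the `out` formula below is the standard dense-conv extent those fields are expected to satisfy):

#include <cstdio>

int main() {
  // Hypothetical 3D sparse conv: dimNb = 5, so dimNb - 2 = 3 spatial dims.
  const int batch = 2;
  const int filter_space[3] = {3, 3, 3};
  const int input_space[3] = {16, 32, 32};
  const int pad[3] = {1, 1, 1};
  const int stride[3] = {1, 1, 1};
  const int dilation[3] = {1, 1, 1};

  int kernel_volume = 1;      // must equal indice_pairs_desc->dims[0]
  int input_spaces = batch;   // upper bound on indices_desc->dims[0]
  int output_spaces = batch;  // upper bound on out_indices_desc->dims[0]
  for (int i = 0; i < 3; ++i) {
    kernel_volume *= filter_space[i];
    input_spaces *= input_space[i];
    const int out = (input_space[i] + 2 * pad[i] -
                     dilation[i] * (filter_space[i] - 1) - 1) / stride[i] + 1;
    output_spaces *= out;
  }
  // Prints: kernel_volume=27 input_spaces=32768 output_spaces=32768
  std::printf("kernel_volume=%d input_spaces=%d output_spaces=%d\n",
              kernel_volume, input_spaces, output_spaces);
  return 0;
}

With these numbers, indice_pairs must be shaped [27, 2, num_indices], and num_indices may not exceed the 32768 possible active input sites.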
- *************************************************************************/ -#include - -#include "core/logging.h" -#include "kernels/get_indice_pairs/get_indice_pairs_utils.h" -#include "kernels/get_indice_pairs/normal_get_indice_pairs.h" -#include "kernels/kernel.h" - -#define KERNEL_V (4096) -#define NRAM_LIMIT \ - (MAX_NRAM_SIZE + REM_FOR_STACK - 12 * 1024 - 3 * KERNEL_V * sizeof(float)) - -#define Ndim (4) -__nram__ float filter_kd_index[KERNEL_V]; -__nram__ float filter_kh_index[KERNEL_V]; -__nram__ float filter_kw_index[KERNEL_V]; - -__nram__ char nbuf_total[NRAM_LIMIT]; - -__mlu_func__ void computeIndex(int32_t *nram_output, int32_t *nram_input, - int32_t *nram_aux_a, float *nram_aux_b, - OutputSpace output_space, Stride stride, - Dilation dilation, Padding padding, - int32_t deal_num, int32_t step_index_start, - int32_t k_dhw, int32_t batch) { -#if __BANG_ARCH__ >= 370 - int32_t len_l_dim = deal_num * (Ndim + 1); - int32_t deal_num_lk = deal_num * k_dhw; - int32_t output_size = - batch * output_space.o_d * output_space.o_h * output_space.o_w + 1; - __bang_transpose((int32_t *)nram_aux_a, (int32_t *)nram_input, deal_num, - Ndim); - stepIndex((int32_t *)nram_aux_a + deal_num * Ndim, step_index_start, - deal_num); - expandInput((int32_t *)nram_aux_a, len_l_dim, k_dhw); - __bang_transpose((int32_t *)nram_aux_b, (int32_t *)nram_aux_a, k_dhw, - len_l_dim); - __bang_transpose((int32_t *)nram_output + deal_num_lk, - (int32_t *)nram_aux_b + deal_num_lk * Ndim, deal_num, k_dhw); - __bang_int322float_rn((float *)nram_aux_b, (int32_t *)nram_aux_b, - k_dhw * len_l_dim, 0); - computeOutputIndex((float *)nram_aux_b + k_dhw * len_l_dim, - (float *)nram_aux_b, (float *)nram_aux_a, filter_kd_index, - filter_kh_index, filter_kw_index, deal_num_lk, k_dhw, - stride, dilation, padding); - computeMask((float *)nram_aux_a + deal_num_lk * Ndim, - (float *)nram_aux_b + k_dhw * len_l_dim, - (float *)nram_aux_a + deal_num_lk, deal_num_lk, output_space); - __bang_float2int32_tz((int32_t *)nram_aux_a, - (float *)nram_aux_a + deal_num_lk * Ndim, deal_num_lk, - 0); - __bang_transpose((int32_t *)nram_output, (int32_t *)nram_aux_a, deal_num, - k_dhw); - genIndiceOutput((int32_t *)nram_aux_a + deal_num_lk, (float *)nram_aux_b, - (float *)nram_aux_b + k_dhw * len_l_dim, - (int32_t *)nram_aux_a + 2 * deal_num_lk, deal_num_lk, - output_space); - genIndiceOutExpand((int32_t *)nram_aux_a + 2 * deal_num_lk, - (int32_t *)nram_aux_a, (int32_t *)nram_aux_a + deal_num_lk, - (int32_t *)nram_aux_a + 3 * deal_num_lk, deal_num_lk, - output_size); - __bang_transpose((int32_t *)nram_output + 2 * deal_num_lk, - (int32_t *)nram_aux_a + 2 * deal_num_lk, deal_num, k_dhw); -#endif -} - -__mlu_global__ void MLUBlockDefaultGetIndicePairKernel1( - void *mask_all_ws, void *indice_index_in_ws, void *indice_out_expand_ws, - void *indices_in, FilterSpace host_filter_space, - InputSpace host_input_space, OutputSpace host_output_space, - Stride host_stride, Dilation host_dilation, Padding host_padding, - int32_t core_num_l, int32_t input_active_site, int32_t batch) { -#if __BANG_ARCH__ >= 370 - /* nram_space - |input| mask_all | indice_index_in | indice_out_expand | 4l + 3 k l - |input| mask_all | indice_index_in | indice_out_expand | 4l + 3 k l - | nram_aux_a 5 l k | nram_aux_b 8 l k - */ - FilterSpace filter_space = host_filter_space; - // InputSpace input_space = host_input_space; - OutputSpace output_space = host_output_space; - Stride stride = host_stride; - Dilation dilation = host_dilation; - Padding padding = host_padding; - int32_t 
k_d = filter_space.k_d, k_h = filter_space.k_h, - k_w = filter_space.k_w; - int32_t k_dhw = k_d * k_h * k_w; - genFilterIndex(filter_kd_index, filter_kh_index, filter_kw_index, k_d, k_h, - k_w); - int32_t len_l_job = 0, offset_l_job = 0; - assignTask(input_active_site, taskIdY, taskDimY, offset_l_job, len_l_job); - int32_t repeat = (len_l_job + core_num_l - 1) / core_num_l; - int32_t rem_num_l = - len_l_job % core_num_l == 0 ? core_num_l : len_l_job % core_num_l; - int32_t *nram_input = (int32_t *)nbuf_total; - int32_t load_num = core_num_l * Ndim; - float *nram_output = (float *)nbuf_total + load_num; - int32_t len_l_k = core_num_l * k_dhw; - int32_t ping_pong_num = load_num + len_l_k * 3; - float *nram_aux_a = (float *)nbuf_total + 2 * ping_pong_num; - float *nram_aux_b = (float *)nram_aux_a + len_l_k * (Ndim + 1); - int step_index_start = offset_l_job; - for (int i = 0; i < repeat + 2; ++i) { - if (i < repeat) { - int32_t *indices_in_addr = - (int32_t *)indices_in + (offset_l_job + i * core_num_l) * Ndim; - int32_t *nram_input_t = (int32_t *)nram_input + (i % 2) * ping_pong_num; - int32_t deal_num = i == repeat - 1 ? rem_num_l : core_num_l; - __memcpy_async((char *)nram_input_t, (char *)indices_in_addr, - deal_num * Ndim * sizeof(int), GDRAM2NRAM); - } - if (1 <= i && i < (repeat + 1)) { - int32_t deal_num = (i - 1) == repeat - 1 ? rem_num_l : core_num_l; - int32_t *nram_input_t = - (int32_t *)nram_input + ((i - 1) % 2) * ping_pong_num; - int32_t *nram_output_t = - (int32_t *)nram_output + ((i - 1) % 2) * ping_pong_num; - computeIndex(nram_output_t, nram_input_t, (int32_t *)nram_aux_a, - nram_aux_b, output_space, stride, dilation, padding, - deal_num, step_index_start, k_dhw, batch); - step_index_start += deal_num; - } - if (i >= 2) { - int32_t deal_num = (i - 2) == repeat - 1 ? rem_num_l : core_num_l; - uint64_t gdram_offset = - (offset_l_job + (i - 2) * core_num_l) * sizeof(int); - int32_t *nram_output_t = - (int32_t *)nram_output + ((i - 2) % 2) * ping_pong_num; - __memcpy_async((char *)mask_all_ws + gdram_offset, - (char *)(nram_output_t), deal_num * sizeof(int), - NRAM2GDRAM, input_active_site * sizeof(int), - deal_num * sizeof(int), k_dhw - 1); - __memcpy_async((char *)indice_index_in_ws + gdram_offset, - (char *)(nram_output_t + deal_num * k_dhw), - deal_num * sizeof(int), NRAM2GDRAM, - input_active_site * sizeof(int), deal_num * sizeof(int), - k_dhw - 1); - __memcpy_async((char *)indice_out_expand_ws + gdram_offset, - (char *)(nram_output_t + 2 * deal_num * k_dhw), - deal_num * sizeof(int), NRAM2GDRAM, - input_active_site * sizeof(int), deal_num * sizeof(int), - k_dhw - 1); - } - __sync(); - } -#endif -} - -__mlu_global__ void MLUBlockDefaultGetIndicePairKernel2(void *index_output_ptr, - int32_t num_act_out, - int32_t core_num_l) { -#if __BANG_ARCH__ >= 370 - int32_t len_job = 0, offset_job = 0; - assignTask(num_act_out, taskIdY, taskDimY, offset_job, len_job); - int32_t repeat = (len_job + core_num_l - 1) / core_num_l; - int32_t rem_num_l = - len_job % core_num_l == 0 ? core_num_l : len_job % core_num_l; - int32_t *nram_input = (int32_t *)nbuf_total; - for (int i = 0; i < repeat; ++i) { - int32_t start_index = offset_job + i * core_num_l; - int32_t length = i == (repeat - 1) ? 
rem_num_l : core_num_l; - stepIndex((int32_t *)nram_input, start_index, length); // sync - int32_t *output_ptr = (int32_t *)index_output_ptr + start_index; - __memcpy((char *)output_ptr, nram_input, length * sizeof(int), NRAM2GDRAM); - } -#endif -} - -__mlu_global__ void MLUBlockBalanceGetIndicePairKernel( - void *balance_input, void *balance_mask, void *balance_output, - int32_t len_l, int32_t kernel_volume, int32_t core_num_l, - int32_t output_size) { -#if __BANG_ARCH__ >= 370 - int32_t len_job, offset_job = 0; - assignTask(len_l * kernel_volume, taskIdY, taskDimY, offset_job, len_job); - int32_t repeat = (len_job + core_num_l - 1) / core_num_l; - int32_t rem_num_l = - len_job % core_num_l == 0 ? core_num_l : len_job % core_num_l; - int32_t *nram_random_num = (int32_t *)nbuf_total; - int32_t *nram_input = (int32_t *)nbuf_total + core_num_l; - int32_t *nram_mask = (int32_t *)nbuf_total + 2 * core_num_l; - int32_t *nram_output = (int32_t *)nbuf_total + 3 * core_num_l; - int32_t ping_pong_num = 3 * core_num_l; - int32_t *nram_aux = (int32_t *)nbuf_total + 7 * core_num_l; - int32_t multi_max = output_size / taskDimY; - stepIndex(nram_random_num, taskId * multi_max, core_num_l); - for (int i = 0; i < repeat + 2; ++i) { - if (i < repeat) { - int32_t deal_num = i == repeat - 1 ? rem_num_l : core_num_l; - int32_t *balance_input_ptr = - (int32_t *)balance_input + offset_job + i * core_num_l; - int32_t *balance_mask_ptr = - (int32_t *)balance_mask + offset_job + i * core_num_l; - int32_t *nram_input_t = (int32_t *)nram_input + (i % 2) * ping_pong_num; - int32_t *nram_mask_t = (int32_t *)nram_mask + (i % 2) * ping_pong_num; - __memcpy_async((char *)nram_input_t, (char *)balance_input_ptr, - deal_num * sizeof(int), GDRAM2NRAM); - __memcpy_async((char *)nram_mask_t, (char *)balance_mask_ptr, - deal_num * sizeof(int), GDRAM2NRAM); - } - if (1 <= i && i <= repeat) { - int32_t deal_num = (i - 1) == repeat - 1 ? rem_num_l : core_num_l; - int32_t *nram_input_t = - (int32_t *)nram_input + ((i - 1) % 2) * ping_pong_num; - int32_t *nram_mask_t = - (int32_t *)nram_mask + ((i - 1) % 2) * ping_pong_num; - int32_t *nram_output_t = - (int32_t *)nram_output + ((i - 1) % 2) * ping_pong_num; - __bang_mul_scalar((int32_t *)nram_aux, (int32_t *)nram_mask_t, int(-1), - deal_num); - __bang_band((char *)nram_output_t, (char *)nram_input_t, (char *)nram_aux, - deal_num * sizeof(int)); - __bang_sub_scalar((int32_t *)nram_aux, (int32_t *)nram_mask_t, int(1), - deal_num); - __bang_band((char *)nram_aux, (char *)nram_aux, (char *)nram_random_num, - deal_num * sizeof(int)); - __bang_add((int32_t *)nram_output_t, (int32_t *)nram_output_t, - (int32_t *)nram_aux, deal_num); - } - if (i >= 2) { - int32_t deal_num = (i - 2) == repeat - 1 ? 
rem_num_l : core_num_l; - uint64_t gdram_offset = (offset_job + (i - 2) * core_num_l) * sizeof(int); - int32_t *nram_output_t = - (int32_t *)nram_output + ((i - 2) % 2) * ping_pong_num; - __memcpy_async((char *)balance_output + gdram_offset, - (char *)nram_output_t, deal_num * sizeof(int), NRAM2GDRAM); - } - __sync(); - } -#endif -} - -__mlu_global__ void MLUBlockDefaultGetIndicePairKernel3( - void *indice_pair, void *indice_index_ptr, void *mask_all, int32_t len_l, - int32_t kernel_volume, int32_t core_num_l) { -#if __BANG_ARCH__ >= 370 - int32_t len_l_job = 0, offset_l_job = 0; - assignTask(2 * kernel_volume, taskIdY, taskDimY, offset_l_job, len_l_job); - float *nram_input = (float *)nbuf_total; - float *nram_mask = (float *)nram_input + core_num_l; - float *nram_output = (float *)nram_input + core_num_l * 2; - float *nram_aux = (float *)nram_input + core_num_l * 3; - // | nram_input | nram_mask | nram_output | nram_aux | - for (int j = 0; j < len_l_job; ++j) { - int32_t mask_offset = (offset_l_job + j) % kernel_volume; - int32_t indice_store = ((offset_l_job + j) % kernel_volume) * 2; - int32_t store_offset = - (offset_l_job + j) < kernel_volume ? indice_store : indice_store + 1; - int32_t *index_job_start = - (int32_t *)indice_index_ptr + (offset_l_job + j) * len_l; - int32_t *mask_job_start = (int32_t *)mask_all + mask_offset * len_l; - int32_t core_offset_l_valid = 0; - int32_t valid_l_num_now = 0; - int32_t repeat = (len_l + core_num_l - 1) / core_num_l; - int32_t rem_num_l = - len_l % core_num_l == 0 ? core_num_l : len_l % core_num_l; - for (int i = 0; i < repeat; ++i) { - int32_t load_l_num = i == (repeat - 1) ? rem_num_l : core_num_l; - int32_t *index_start = (int32_t *)index_job_start + i * core_num_l; - int32_t *mask_start = (int32_t *)mask_job_start + i * core_num_l; - __memcpy(nram_input, index_start, load_l_num * sizeof(int), GDRAM2NRAM); - __memcpy(nram_mask, mask_start, load_l_num * sizeof(int), GDRAM2NRAM); - __bang_int322float_rn((float *)nram_aux, (int32_t *)nram_mask, load_l_num, - 0); - valid_l_num_now = __bang_count((float *)nram_aux, load_l_num); - __bang_collect((float *)nram_output, (float *)nram_input, - (float *)nram_aux, load_l_num); - int32_t *store_valid_ptr = - (int32_t *)indice_pair + store_offset * len_l + core_offset_l_valid; - core_offset_l_valid += valid_l_num_now; - if (valid_l_num_now > 0) { - __memcpy((char *)store_valid_ptr, (char *)nram_output, - valid_l_num_now * sizeof(int32_t), NRAM2GDRAM); - } - } - } -#endif -} - -__mlu_global__ void MLUBlockDefaultGetIndicePairKernel4( - void *indice_out, void *input_ptr, OutputSpace host_output_space, - int32_t len_l, int32_t core_num_l) { -#if __BANG_ARCH__ >= 370 - OutputSpace output_space = host_output_space; - int32_t len_l_job = 0, offset_l_job = 0; - assignTask(len_l, taskIdY, taskDimY, offset_l_job, len_l_job); - int32_t ping_pong_num = core_num_l * 5; - int32_t *nram_input = (int32_t *)nbuf_total; - int32_t *nram_output = (int32_t *)nbuf_total + core_num_l; - int32_t *nram_aux = (int32_t *)nbuf_total + 2 * ping_pong_num; - int32_t *input_start_core = (int32_t *)input_ptr + offset_l_job; - // |nram_input | nram_output * 4 | nram_input | nram_output * 4 | nram_aux| - int32_t rem_num_l = - len_l_job % core_num_l == 0 ? core_num_l : len_l_job % core_num_l; - int32_t repeat = (len_l_job + core_num_l - 1) / core_num_l; - for (int i = 0; i < repeat + 2; ++i) { - if (i < repeat) { - int32_t load_num_l = i == (repeat - 1) ? 
rem_num_l : core_num_l; - int32_t *input_start_ptr = input_start_core + i * core_num_l; - int32_t *nram_input_load = nram_input + (i % 2) * ping_pong_num; - __memcpy_async((char *)nram_input_load, (char *)input_start_ptr, - load_num_l * sizeof(int32_t), GDRAM2NRAM); - } - if (1 <= i && i < (repeat + 1)) { - int32_t load_num_l = (i - 1) == (repeat - 1) ? rem_num_l : core_num_l; - int32_t *nram_output_t = nram_output + ((i - 1) % 2) * ping_pong_num; - int32_t *nram_input_t = nram_input + ((i - 1) % 2) * ping_pong_num; - genIndiceOutLast((int32_t *)nram_output_t, (int32_t *)nram_input_t, - (int32_t *)nram_aux, output_space, load_num_l); - } - if (i >= 2) { - int32_t load_num_l = (i - 2) == (repeat - 1) ? rem_num_l : core_num_l; - int32_t *nram_output_t = nram_output + ((i - 2) % 2) * ping_pong_num; - int32_t *indice_out_t = - (int32_t *)indice_out + (offset_l_job + (i - 2) * core_num_l) * 4; - __memcpy_async((char *)indice_out_t, (char *)nram_output_t, - load_num_l * 4 * sizeof(int32_t), NRAM2GDRAM); - } - __sync(); - } -#endif -} - -__mlu_global__ void MLUBlockSubmGetIndicePairKernel1( - void *mask_all_ptr, void *indice_index_in_ptr, void *indice_in_expand_ptr, - void *indice_out_expand_ptr, void *indices_in, - FilterSpace host_filter_space, InputSpace host_input_space, - OutputSpace host_output_space, Stride host_stride, Dilation host_dilation, - Padding host_padding, int32_t core_num_l, int32_t input_active_site, - int32_t batch) { -#if __BANG_ARCH__ >= 370 - /* nram_space - |input| mask_all | indice_index_in | indice_out_expand | indice_in_expand | - 4l + l + 3kl |input| mask_all | indice_index_in | indice_out_expand | - indice_in_expand | 4l + l + 3kl | nram_aux_a 5lk | nram_aux_b 8lk | - */ - FilterSpace filter_space = host_filter_space; - InputSpace input_space = host_input_space; - OutputSpace output_space = host_output_space; - Stride stride = host_stride; - Dilation dilation = host_dilation; - Padding padding = host_padding; - int32_t k_d = filter_space.k_d, k_h = filter_space.k_h, - k_w = filter_space.k_w; - int32_t k_dhw = k_d * k_h * k_w; - genFilterIndex((float *)filter_kd_index, (float *)filter_kh_index, - (float *)filter_kw_index, k_d, k_h, k_w); - int32_t len_l_job = 0, offset_l_job = 0; - assignTask(input_active_site, taskIdY, taskDimY, offset_l_job, len_l_job); - int32_t repeat = (len_l_job + core_num_l - 1) / core_num_l; - int32_t rem_num_l = - len_l_job % core_num_l == 0 ? core_num_l : len_l_job % core_num_l; - int32_t *nram_input = (int32_t *)nbuf_total; - int32_t load_num = core_num_l * Ndim; - float *nram_output = (float *)nbuf_total + load_num; - int32_t len_l_k = core_num_l * k_dhw; - int32_t ping_pong_num = load_num + core_num_l + len_l_k * 3; - float *nram_aux_a = (float *)nbuf_total + 2 * ping_pong_num; - float *nram_aux_b = (float *)nram_aux_a + len_l_k * (Ndim + 1); - int step_index_start = offset_l_job; - for (int i = 0; i < repeat + 2; ++i) { - if (i < repeat) { - float *indices_in_addr = - (float *)indices_in + (offset_l_job + i * core_num_l) * Ndim; - int32_t *nram_input_t = (int32_t *)nram_input + (i % 2) * ping_pong_num; - int32_t deal_num = i == repeat - 1 ? rem_num_l : core_num_l; - __memcpy_async((char *)nram_input_t, (char *)indices_in_addr, - deal_num * Ndim * sizeof(int), GDRAM2NRAM); - } - if (1 <= i && i < (repeat + 1)) { - int32_t deal_num = (i - 1) == repeat - 1 ? 
rem_num_l : core_num_l; - int32_t *nram_input_t = - (int32_t *)nram_input + ((i - 1) % 2) * ping_pong_num; - int32_t *nram_output_t = - (int32_t *)nram_output + ((i - 1) % 2) * ping_pong_num; - genIndiceInExpand(nram_output_t + 3 * deal_num * k_dhw, nram_input_t, - (int32_t *)nram_aux_a, deal_num, input_space); - computeIndex(nram_output_t, nram_input_t, (int32_t *)nram_aux_a, - nram_aux_b, output_space, stride, dilation, padding, - deal_num, step_index_start, k_dhw, batch); - step_index_start += deal_num; - } - if (i >= 2) { - int32_t deal_num = (i - 2) == repeat - 1 ? rem_num_l : core_num_l; - uint64_t gdram_offset = - (offset_l_job + (i - 2) * core_num_l) * sizeof(int32_t); - int32_t *nram_output_t = - (int32_t *)nram_output + ((i - 2) % 2) * ping_pong_num; - __memcpy_async((char *)mask_all_ptr + gdram_offset, - (char *)(nram_output_t), deal_num * sizeof(int), - NRAM2GDRAM, input_active_site * sizeof(int), - deal_num * sizeof(int32_t), k_dhw - 1); - __memcpy_async((char *)indice_index_in_ptr + gdram_offset, - (char *)(nram_output_t + deal_num * k_dhw), - deal_num * sizeof(int), NRAM2GDRAM, - input_active_site * sizeof(int), - deal_num * sizeof(int32_t), k_dhw - 1); - __memcpy_async((char *)indice_out_expand_ptr + gdram_offset, - (char *)(nram_output_t + 2 * deal_num * k_dhw), - deal_num * sizeof(int), NRAM2GDRAM, - input_active_site * sizeof(int), - deal_num * sizeof(int32_t), k_dhw - 1); - __memcpy_async((char *)indice_in_expand_ptr + gdram_offset, - (char *)(nram_output_t + 3 * deal_num * k_dhw), - deal_num * sizeof(int), NRAM2GDRAM); - } - __sync(); - } -#endif -} - -__mlu_global__ void MLUBlockSubmGetIndicePairKernel2( - void *indice_out, void *mask_all_ptr, void *indice_out_index_ptr, - void *indices_in, int32_t len_1_one, int32_t len_l_two, - int32_t core_num_1_one, int32_t core_num_l_two) { -#if __BANG_ARCH__ >= 370 - int32_t len_job = 0, offset_job = 0; - assignTask(len_1_one, taskIdY, taskDimY, offset_job, len_job); - int32_t repeat = (len_job + core_num_1_one - 1) / core_num_1_one; - int32_t rem_num_l = - len_job % core_num_1_one == 0 ? core_num_1_one : len_job % core_num_1_one; - int32_t *nram_input = (int32_t *)nbuf_total; - int32_t bit_width = sizeof(int32_t); - int32_t *indices_in_offset = (int32_t *)indices_in + offset_job; - int32_t *indice_out_offset = (int32_t *)indice_out + offset_job; - for (int i = 0; i < repeat; ++i) { - int32_t offset = i * core_num_1_one; - int32_t deal_num = i == repeat - 1 ? rem_num_l : core_num_1_one; - __memcpy_async((char *)nram_input, (char *)(indices_in_offset + offset), - deal_num * bit_width, GDRAM2NRAM); - __memcpy_async((char *)(indice_out_offset + offset), (char *)nram_input, - deal_num * bit_width, NRAM2GDRAM); - } - - assignTask(len_l_two, taskIdY, taskDimY, offset_job, len_job); - repeat = (len_job + core_num_l_two - 1) / core_num_l_two; - rem_num_l = - len_job % core_num_l_two == 0 ? core_num_l_two : len_job % core_num_l_two; - int32_t *mask_all_ptr_offset = (int32_t *)mask_all_ptr + offset_job; - int32_t *indice_out_index_ptr_offset = - (int32_t *)indice_out_index_ptr + offset_job; - int32_t *nram_output = (int32_t *)nbuf_total + core_num_l_two; - for (int i = 0; i < repeat; ++i) { - int32_t offset = i * core_num_l_two; - int32_t deal_num = i == repeat - 1 ? 
rem_num_l : core_num_l_two;
-    __memcpy((char *)nram_input, (char *)(mask_all_ptr_offset + offset),
-             deal_num * bit_width, GDRAM2NRAM);
-    __memcpy((char *)nram_output,
-             (char *)(indice_out_index_ptr_offset + offset),
-             deal_num * bit_width, GDRAM2NRAM);
-    __bang_ge_scalar((int32_t *)nram_output, (int32_t *)nram_output, (int)0,
-                     deal_num);
-    __bang_and((int32_t *)nram_output, (int32_t *)nram_output,
-               (int32_t *)nram_input, deal_num);
-    __memcpy((char *)(mask_all_ptr_offset + offset), (char *)nram_output,
-             deal_num * bit_width, NRAM2GDRAM);
-  }
-#endif
-}
-
-mluOpStatus_t MLUOP_WIN_API KernelDefaultGetIndicePairKl1(
-    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-    void *mask_all_ws, void *indice_index_in_ws, void *out_indices_expand_ws,
-    void *indices, FilterSpace filter_space, InputSpace input_space,
-    OutputSpace output_space, Stride stride, Dilation dilation, Padding padding,
-    int32_t core_num_l, int32_t input_active_site, int32_t batch) {
-  KERNEL_CHECK(MLUBlockDefaultGetIndicePairKernel1<<<k_dim, k_type, queue>>>(
-      (void *)mask_all_ws, (void *)indice_index_in_ws,
-      (void *)out_indices_expand_ws, (void *)indices, filter_space, input_space,
-      output_space, stride, dilation, padding, core_num_l, input_active_site,
-      batch));
-  return MLUOP_STATUS_SUCCESS;
-}
-
-mluOpStatus_t MLUOP_WIN_API KernelDefaultGetIndicePairKl2(
-    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-    void *step_index_ptr, int32_t num_act_out, int32_t core_num_l) {
-  KERNEL_CHECK(MLUBlockDefaultGetIndicePairKernel2<<<k_dim, k_type, queue>>>(
-      step_index_ptr, num_act_out, core_num_l));
-  return MLUOP_STATUS_SUCCESS;
-}
-
-mluOpStatus_t MLUOP_WIN_API KernelDefaultGetIndicePairKl3(
-    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-    void *indice_pairs, void *input_addr, void *mask_addr,
-    int32_t input_active_site, int32_t kernel_volume, int32_t core_num_l) {
-  KERNEL_CHECK(MLUBlockDefaultGetIndicePairKernel3<<<k_dim, k_type, queue>>>(
-      indice_pairs, input_addr, mask_addr, input_active_site, kernel_volume,
-      core_num_l));
-  return MLUOP_STATUS_SUCCESS;
-}
-
-mluOpStatus_t MLUOP_WIN_API KernelDefaultGetIndicePairKl4(
-    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-    void *out_indices, void *input_addr, OutputSpace output_space,
-    int32_t len_l, int32_t core_num_l) {
-  KERNEL_CHECK(MLUBlockDefaultGetIndicePairKernel4<<<k_dim, k_type, queue>>>(
-      out_indices, input_addr, output_space, len_l, core_num_l));
-  return MLUOP_STATUS_SUCCESS;
-}
-
-mluOpStatus_t MLUOP_WIN_API KernelBalanceGetIndicePair(
-    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-    void *balance_input, void *balance_mask, void *balance_output,
-    int32_t len_l, int32_t kernel_volume, int32_t core_num_l,
-    int32_t output_size) {
-  KERNEL_CHECK(MLUBlockBalanceGetIndicePairKernel<<<k_dim, k_type, queue>>>(
-      balance_input, balance_mask, balance_output, len_l, kernel_volume,
-      core_num_l, output_size));
-  return MLUOP_STATUS_SUCCESS;
-}
-
-mluOpStatus_t MLUOP_WIN_API KernelSubmGetIndicePairKl1(
-    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-    void *mask_all_ptr, void *indice_index_in_ptr, void *indice_in_expand_ptr,
-    void *out_indices_expand_ptr, void *indices, FilterSpace filter_space,
-    InputSpace input_space, OutputSpace output_space, Stride stride,
-    Dilation dilation, Padding padding, int32_t core_num_l,
-    int32_t input_active_site, int32_t batch) {
-  KERNEL_CHECK(MLUBlockSubmGetIndicePairKernel1<<<k_dim, k_type, queue>>>(
-      (void *)mask_all_ptr, (void *)indice_index_in_ptr,
-      (void *)indice_in_expand_ptr, (void *)out_indices_expand_ptr,
-      (void *)indices, filter_space,
input_space, output_space, stride, - dilation, padding, core_num_l, input_active_site, batch)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API KernelSubmGetIndicePairKl2( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *out_indices, void *mask_all_ptr, void *out_indices_index_ptr, - void *indices, int32_t len_1_one, int32_t len_l_two, int32_t core_num_l_one, - int32_t core_num_l_two) { - KERNEL_CHECK(MLUBlockSubmGetIndicePairKernel2<<>>( - (void *)out_indices, (void *)mask_all_ptr, (void *)out_indices_index_ptr, - (void *)indices, len_1_one, len_l_two, core_num_l_one, core_num_l_two)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/get_indice_pairs/get_indice_pairs_structs.cpp b/kernels/get_indice_pairs/get_indice_pairs_structs.cpp deleted file mode 100644 index 44b00a55b..000000000 --- a/kernels/get_indice_pairs/get_indice_pairs_structs.cpp +++ /dev/null @@ -1,133 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include -#include - -#include "core/logging.h" -#include "core/type.h" -#include "kernels/get_indice_pairs/get_indice_pairs_structs.h" -#include "mlu_op.h" - -mluOpStatus_t MLUOP_WIN_API mluOpCreateSparseConvolutionDescriptor( - mluOpSparseConvolutionDescriptor_t *desc) { - if (desc == NULL) { - LOG(ERROR) << "mluOpCreateSparseConvolutionDescriptor failed, " - << "can't create desc when desc == NULL."; - return MLUOP_STATUS_NOT_INITIALIZED; - } - mluOpSparseConvolutionStruct *ts = - new (std::nothrow) mluOpSparseConvolutionStruct(); - *desc = ts; - return MLUOP_STATUS_SUCCESS; -} - -/* set sparse convolution descriptor. - * pad_dim_num = input_dim_num - 2, and each dim need two pad value. 
- */ -mluOpStatus_t MLUOP_WIN_API mluOpSetSparseConvolutionDescriptor( - mluOpSparseConvolutionDescriptor_t sparse_conv_desc, int dimNb, int batch, - const int pad[], const int stride[], const int dilation[], - const int input_space[], const int filter_space[], const int output_space[], - const int sub_m, const int transpose, const int inverse) { - std::string interface_name = "[mluOpSetSparseConvolutionDescriptor]"; - PARAM_CHECK(interface_name, sparse_conv_desc != NULL); - PARAM_CHECK(interface_name, pad != NULL); - PARAM_CHECK(interface_name, stride != NULL); - PARAM_CHECK(interface_name, dilation != NULL); - PARAM_CHECK(interface_name, input_space != NULL); - PARAM_CHECK(interface_name, filter_space != NULL); - PARAM_CHECK(interface_name, output_space != NULL); - if (dimNb != 5) { - LOG(ERROR) << interface_name << " only " - << "supports 3D conv, so dimNb should be 5. Now dimNb is " << dimNb - << "."; - return MLUOP_STATUS_BAD_PARAM; - } - sparse_conv_desc->dimNb = dimNb; - - if (batch <= 0) { - LOG(ERROR) << interface_name << " only " - << "supports positive batch. Now batch is " << batch << "."; - return MLUOP_STATUS_BAD_PARAM; - } - sparse_conv_desc->batch = batch; - - sparse_conv_desc->sub_m = sub_m; - - if (transpose != 0) { - LOG(ERROR) << interface_name << " does not " - << "support transpose. Now transpose is " << transpose << "."; - return MLUOP_STATUS_BAD_PARAM; - } - sparse_conv_desc->transpose = transpose; - - if (inverse != 0) { - LOG(ERROR) << interface_name << " does not " - << "support inverse. Now inverse is " << inverse << "."; - return MLUOP_STATUS_BAD_PARAM; - } - sparse_conv_desc->inverse = inverse; - - int kernel_dim = dimNb - 2; - for (int idx = 0; idx < kernel_dim; idx++) { - PARAM_CHECK_GE(interface_name, pad[idx], 0); - sparse_conv_desc->pad[idx] = pad[idx]; - PARAM_CHECK_GE(interface_name, stride[idx], 1); - sparse_conv_desc->stride[idx] = stride[idx]; - PARAM_CHECK_GE(interface_name, dilation[idx], 1); - sparse_conv_desc->dilation[idx] = dilation[idx]; - PARAM_CHECK_GE(interface_name, input_space[idx], 1); - sparse_conv_desc->input_space[idx] = input_space[idx]; - PARAM_CHECK_GE(interface_name, filter_space[idx], 1); - sparse_conv_desc->filter_space[idx] = filter_space[idx]; - PARAM_CHECK_GE(interface_name, output_space[idx], 1); - sparse_conv_desc->output_space[idx] = output_space[idx]; - } - - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpGetSparseConvolutionNumActOut( - mluOpSparseConvolutionDescriptor_t desc, int *num_act_out) { - if (desc == NULL || num_act_out == NULL) { - LOG(ERROR) << "mluOpGetSparseConvolutionNumActOut failed. " - << "Passing NULL ptr to this API."; - return MLUOP_STATUS_NOT_INITIALIZED; - } - num_act_out[0] = desc->num_act_out; - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpDestroySparseConvolutionDescriptor( - mluOpSparseConvolutionDescriptor_t desc) { - if (desc == NULL) { - LOG(ERROR) << "mluOpDestroySparseConvolutionDescriptor failed. Passing NULL " - "ptr to this API."; - return MLUOP_STATUS_EXECUTION_FAILED; - } - delete desc; - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/get_indice_pairs/get_indice_pairs_structs.h b/kernels/get_indice_pairs/get_indice_pairs_structs.h deleted file mode 100644 index 083c33947..000000000 --- a/kernels/get_indice_pairs/get_indice_pairs_structs.h +++ /dev/null @@ -1,100 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc.
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ - -#ifndef KERNELS_GET_INDICE_PAIRS_GET_INDICE_PAIRS_STRUCTS_H_ -#define KERNELS_GET_INDICE_PAIRS_GET_INDICE_PAIRS_STRUCTS_H_ - -#include "mlu_op.h" - -#define MAX_PAD_DIM 6 -#define MAX_STRIDE_DIM 6 -#define MAX_DILATION_DIM 6 -#define MAX_INPUT_DIM 3 -#define MAX_FILTER_DIM 3 -#define MAX_OUTPUT_DIM 3 - -#define INDICE_IN_LARGE_TENSOR_NUM 1000000 - -struct FilterSpace { - int k_d; - int k_h; - int k_w; - FilterSpace(const int &k_d_, const int &k_h_, const int &k_w_) - : k_d(k_d_), k_h(k_h_), k_w(k_w_) {} -}; -struct InputSpace { - int i_d; - int i_h; - int i_w; - InputSpace(const int &i_d_, const int &i_h_, const int &i_w_) - : i_d(i_d_), i_h(i_h_), i_w(i_w_) {} -}; - -struct OutputSpace { - int o_d; - int o_h; - int o_w; - OutputSpace(const int &o_d_, const int &o_h_, const int &o_w_) - : o_d(o_d_), o_h(o_h_), o_w(o_w_) {} -}; - -struct Stride { - int s_d; - int s_h; - int s_w; - Stride(const int &s_d_, const int &s_h_, const int &s_w_) - : s_d(s_d_), s_h(s_h_), s_w(s_w_) {} -}; - -struct Dilation { - int d_d; - int d_h; - int d_w; - Dilation(const int &d_d_, const int &d_h_, const int &d_w_) - : d_d(d_d_), d_h(d_h_), d_w(d_w_) {} -}; - -struct Padding { - int p_d; - int p_h; - int p_w; - Padding(const int &p_d_, const int &p_h_, const int &p_w_) - : p_d(p_d_), p_h(p_h_), p_w(p_w_) {} -}; - -struct mluOpSparseConvolutionStruct { - int dimNb; - int batch; - int pad[MAX_PAD_DIM]; - int stride[MAX_STRIDE_DIM]; - int dilation[MAX_DILATION_DIM]; - int input_space[MAX_INPUT_DIM]; - int filter_space[MAX_FILTER_DIM]; - int output_space[MAX_OUTPUT_DIM]; - int sub_m = 0; - int transpose = 0; - int inverse = 0; - int num_act_out = 0; -}; - -#endif // KERNELS_GET_INDICE_PAIRS_GET_INDICE_PAIRS_STRUCTS_H_ diff --git a/kernels/get_indice_pairs/get_indice_pairs_utils.h b/kernels/get_indice_pairs/get_indice_pairs_utils.h deleted file mode 100644 index a4d0bdd45..000000000 --- a/kernels/get_indice_pairs/get_indice_pairs_utils.h +++ /dev/null @@ -1,348 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ - -#ifndef KERNELS_GET_INDICE_PAIRS_GET_INDICE_PAIRS_UTILS_H_ -#define KERNELS_GET_INDICE_PAIRS_GET_INDICE_PAIRS_UTILS_H_ - -#include - -#include "kernels/get_indice_pairs/normal_get_indice_pairs.h" -#include "kernels/kernel.h" - -#if __BANG_ARCH__ >= 370 -__mlu_func__ void assignTask(const int32_t num_total_task, - const int32_t &taskid, const int32_t &taskdim, - int32_t &task_offset, int32_t &num_cur_task) { - int32_t num_per_task = num_total_task / taskdim; - int32_t rem_idx = num_total_task % taskdim; - if (taskid < rem_idx) { - task_offset = taskid * (num_per_task + 1); - num_cur_task = num_per_task + 1; - } else { - task_offset = taskid * num_per_task + rem_idx; - num_cur_task = num_per_task; - } -} - -/* -func: init filter index kd * kh * kw -*/ -__mlu_func__ void genFilterIndex(float *filter_kd_index, float *filter_kh_index, - float *filter_kw_index, int32_t k_d, - int32_t k_h, int32_t k_w) { - // kw kh, kd loop init - int32_t kdhw = k_d * k_h * k_w, khw = k_w * k_h, index_kd_count = 0, - index_kh_count = 0; - float index_kd = 0, index_kh = 0, index_kw = 0; - for (int i = 0; i < kdhw; ++i) { - filter_kw_index[i] = index_kw; - index_kw++; - if (index_kw >= k_w) index_kw = 0.0; - } - for (int i = 0; i < kdhw; ++i) { - filter_kh_index[i] = index_kh; - index_kh_count++; - if (index_kh_count % k_w == 0) index_kh++; - if (index_kh_count % khw == 0) index_kh = 0.0; - } - for (int i = 0; i < kdhw; ++i) { - filter_kd_index[i] = index_kd; - index_kd_count++; - if (index_kd_count % khw == 0) index_kd++; - } -} - -/* -func: generate stage index from start_index -*/ -__mlu_func__ void stepIndex(int32_t *dst_nram, int32_t start_index, - int32_t length) { -#if (__BANG_ARCH__ == 372 || __BANG_ARCH__ == 322 || __BANG_ARCH__ == 592) - int32_t align_num = 128; - int32_t repeat = (int32_t)(logf(length / align_num) / logf(2)); - int32_t remain = length / align_num - powf(2, repeat); - int32_t global_remain = length % align_num; - int32_t count = 1; - for (int32_t i = 0; i < align_num; ++i) { - dst_nram[i] = i + start_index; - if (i == length - 1) { - return; - } - } - for (int i = 0; i < repeat; ++i) { - __bang_add_scalar((int32_t *)dst_nram + count * align_num, - (int32_t *)dst_nram, count * align_num, - count * align_num); - count *= 2; - } - if (remain > 0) { - __bang_add_scalar((int32_t *)dst_nram + count * align_num, - (int32_t *)dst_nram, count 
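assignTask above hands each core a near-equal contiguous slice, giving the first num_total_task % taskdim cores one extra element. A host-side check of the same arithmetic (function and names are illustrative, not from this patch):

#include <cstdio>

static void assignTaskHost(int total, int id, int dim, int *off, int *cnt) {
  int per = total / dim, rem = total % dim;
  *off = id < rem ? id * (per + 1) : id * per + rem;
  *cnt = per + (id < rem ? 1 : 0);
}

int main() {
  // 10 tasks over 4 cores -> offsets {0, 3, 6, 8}, counts {3, 3, 2, 2}
  for (int id = 0; id < 4; ++id) {
    int off, cnt;
    assignTaskHost(10, id, 4, &off, &cnt);
    std::printf("core %d: offset=%d count=%d\n", id, off, cnt);
  }
  return 0;
}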
* align_num, - remain * align_num); - } - if (global_remain > 0) { - __bang_add_scalar( - (int32_t *)dst_nram + count * align_num + remain * align_num, - (int32_t *)dst_nram, count * align_num + remain * align_num, - global_remain); - } - __sync(); -#endif -} - -/* -input: nram_input l -output: nram_output k,l -func: expand k nums input l -*/ -__mlu_func__ void expandInput(int32_t *nram_input, int32_t deal_num, - int32_t k) { - int offset = deal_num; - for (int i = 0; i < k; ++i) { - __bang_add_scalar((int32_t *)nram_input + offset, (int32_t *)nram_input, - (int32_t)0, deal_num); - offset += deal_num; - } -} - -/* -input: input_pos, fliter_pos, stride, padding, dilation -output: do ho wo -func: generate do ho wo -*/ -__mlu_func__ void computeOutputIndex(float *nram_output, float *nram_input, - float *temp, float *filter_kd_index, - float *filter_kh_index, - float *filter_kw_index, int32_t deal_num, - int32_t kdhw, Stride stride, - Dilation dilation, Padding padding) { - // formula: output_id = (input_id + padding - k_id * dilation) / stride - float stride_sd = 1.0 / (float)stride.s_d; - float stride_sh = 1.0 / (float)stride.s_h; - float stride_sw = 1.0 / (float)stride.s_w; - int32_t offset = deal_num; - for (int i = 0; i < 3; ++i) { - int32_t out_offset = offset - deal_num; - float stride_s = i == 0 ? stride_sd : i == 1 ? stride_sh : stride_sw; - int32_t padding_p = - i == 0 ? padding.p_d : i == 1 ? (padding.p_h) : (padding.p_w); - int32_t dilation_d = - i == 0 ? dilation.d_d : i == 1 ? dilation.d_h : dilation.d_w; - float *temp_filter_index = - i == 0 ? filter_kd_index : (i == 1 ? filter_kh_index : filter_kw_index); - __bang_add_scalar(nram_output + out_offset, nram_input + offset, - (float)(padding_p), deal_num); - __bang_mul_scalar(temp, temp_filter_index, (float)(dilation_d), kdhw); - __bang_cycle_sub(nram_output + out_offset, nram_output + out_offset, temp, - deal_num, kdhw); - __bang_mul_scalar(nram_output + out_offset, nram_output + out_offset, - stride_s, deal_num); - offset += deal_num; - } -} - -/* -input: nram_input float 3 k l do ho wo -output: nram_output float k l -func: generate mask represent output -*/ -__mlu_func__ void computeMask(float *nram_output, float *nram_input, - float *temp, int32_t deal_num, - OutputSpace output_space) { - int32_t o_d = output_space.o_d, o_h = output_space.o_h, - o_w = output_space.o_w; - int32_t offset = 0; - int32_t offset_temp2 = deal_num; - int32_t offset_temp3 = 2 * deal_num; - __bang_write_value((float *)nram_output, deal_num, (float)1.0); - for (int i = 0; i < 3; ++i) { - int32_t output_dim = i == 0 ? o_d : i == 1 ? 
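computeOutputIndex above evaluates, per spatial axis, output_id = (input_id + padding - k_id * dilation) / stride, implemented on device as a multiply by 1/stride. A scalar reference for one axis (a hypothetical helper, not from the patch):

// Returns the possibly fractional output coordinate; the kernel offset k_id
// maps to a valid output only if this value is a non-negative integer below
// the output extent, which is exactly what computeMask tests afterwards via
// the float->int->float round-trip and the range comparisons.
static float outputCoord(int input_id, int pad, int k_id, int dilation,
                         int stride) {
  return (input_id + pad - k_id * dilation) / static_cast<float>(stride);
}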
o_h : o_w; - __bang_float2int32_tz((int32_t *)temp, (float *)nram_input + offset, - deal_num, 0); - __bang_int322float_rn((float *)temp, (int32_t *)temp, deal_num, 0); - __bang_sub((float *)temp + offset_temp2, (float *)temp, - (float *)nram_input + offset, deal_num); - __bang_le_scalar((float *)temp + offset_temp3, (float *)temp + offset_temp2, - (float)0.000001, deal_num); // < 1e-6 - __bang_ge_scalar((float *)temp + offset_temp2, (float *)temp + offset_temp2, - (float)-0.000001, deal_num); // > -1e-6 - __bang_and((float *)temp + offset_temp2, (float *)temp + offset_temp2, - (float *)temp + offset_temp3, deal_num); - __bang_ge_scalar((float *)temp + offset_temp3, (float *)temp, (float)0.0, - deal_num); - __bang_and((float *)temp + offset_temp2, (float *)temp + offset_temp2, - (float *)temp + offset_temp3, deal_num); - __bang_le_scalar((float *)temp + offset_temp3, (float *)temp, - (float)(output_dim - 1), deal_num); - __bang_and((float *)temp, (float *)temp + offset_temp3, - (float *)temp + offset_temp2, deal_num); - __bang_and((float *)nram_output, (float *)nram_output, (float *)temp, - deal_num); - offset += deal_num; - } -} - -/* -input: nram_input int32_t l,4 n do ho wo -output: nram_output int32_t l indice_out_expand -func: generate all_index from n do ho wo index -*/ -__mlu_func__ void genIndiceOutput(int32_t *nram_output, float *batch, - float *nram_input, int32_t *temp, - int32_t deal_num, OutputSpace output_space) { - int32_t o_d = output_space.o_d, o_h = output_space.o_h, - o_w = output_space.o_w; - int32_t o_hw = o_h * o_w, o_dhw = o_d * o_h * o_w; - __bang_float2int32_tz((int32_t *)temp + deal_num, (float *)batch, deal_num, - 0); // n - __bang_mul_scalar((int32_t *)temp, (int32_t *)temp + deal_num, (int32_t)o_dhw, - deal_num); // n * odhw - __bang_float2int32_tz((int32_t *)temp + 2 * deal_num, (float *)nram_input, - deal_num, 0); // do - __bang_mul_scalar((int32_t *)temp + deal_num, (int32_t *)temp + 2 * deal_num, - (int32_t)o_hw, deal_num); // do * o_hw - __bang_add((int32_t *)temp, (int32_t *)temp, (int32_t *)temp + deal_num, - deal_num); - __bang_float2int32_tz((int32_t *)temp + 2 * deal_num, - (float *)nram_input + deal_num, deal_num, 0); // ho - __bang_mul_scalar((int32_t *)temp + deal_num, (int32_t *)temp + 2 * deal_num, - (int32_t)o_w, deal_num); - __bang_add((int32_t *)temp, (int32_t *)temp, (int32_t *)temp + deal_num, - deal_num); - __bang_float2int32_tz((int32_t *)temp + deal_num, - (float *)nram_input + 2 * deal_num, deal_num, 0); // wo - __bang_add((int32_t *)nram_output, (int32_t *)temp, - (int32_t *)temp + deal_num, deal_num); -} - -/* -input: nram_output int32_t k,l indice_outout_expand - mask_all float k,l mask_all -output nram_output int32_t k,l indice_output_expand -func: turn invalid index into int_max -*/ -__mlu_func__ void genIndiceOutExpand(int32_t *nram_output, int32_t *mask_all, - int32_t *nram_input, int32_t *temp, - int32_t deal_num, int32_t output_size) { - __bang_mul_scalar((int32_t *)temp, (int32_t *)mask_all, int(-1), deal_num); - __bang_band((char *)nram_output, (char *)nram_input, (char *)temp, - deal_num * sizeof(int32_t)); - // clost to intmax - __bang_sub_scalar((int32_t *)temp, (int32_t *)mask_all, int(1), deal_num); - __bang_mul_scalar((int32_t *)temp, (int32_t *)temp, int(-1 * output_size), - deal_num); - __bang_bor((char *)nram_output, (char *)nram_output, (char *)temp, - deal_num * sizeof(int32_t)); -} - -/* -input: nram_input int32_t L indice_out_expand -output: nram_output int32_t L,4 indice_out -func: generate n,do,ho,wo index from 
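genIndiceOutput above linearizes an (n, do, ho, wo) coordinate with vectorized multiply-adds. A scalar reference of the same flattening (illustrative helper):

// n * o_d*o_h*o_w + do * o_h*o_w + ho * o_w + wo, written in Horner form.
static int flattenOutputIndex(int n, int d, int h, int w,
                              int o_d, int o_h, int o_w) {
  return ((n * o_d + d) * o_h + h) * o_w + w;
}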
input all_index -limits: imp on 300 -*/ -__mlu_func__ void genIndiceOutLast(int32_t *nram_output, int32_t *nram_input, - int32_t *nram_aux, OutputSpace output_space, - int32_t deal_num) { -#if __BANG_ARCH__ >= 590 - int32_t o_d = output_space.o_d, o_h = output_space.o_h, - o_w = output_space.o_w; - int32_t o_hw = o_h * o_w, o_dhw = o_d * o_h * o_w; - __bang_div((int32_t *)nram_aux, (int32_t *)nram_input, (int)o_dhw, - deal_num); // n - __bang_mul_scalar((int32_t *)nram_output, (int32_t *)nram_aux, (int)o_dhw, - deal_num); - __bang_sub((int32_t *)nram_input, (int32_t *)nram_input, (int *)nram_output, - deal_num); - __bang_div((int32_t *)nram_aux + deal_num, (int32_t *)nram_input, (int)o_hw, - deal_num); // d - __bang_mul_scalar((int32_t *)nram_output, (int32_t *)nram_aux + deal_num, - (int)o_hw, deal_num); - __bang_sub((int32_t *)nram_input, (int32_t *)nram_input, - (int32_t *)nram_output, deal_num); - - __bang_div((int32_t *)nram_aux + 2 * deal_num, (int32_t *)nram_input, - (int)o_w, deal_num); // h - __bang_mul_scalar((int32_t *)nram_output, (int32_t *)nram_aux + 2 * deal_num, - (int)o_w, deal_num); - __bang_sub((int32_t *)nram_aux + 3 * deal_num, (int32_t *)nram_input, - (int32_t *)nram_output, deal_num); // w - __bang_transpose((int32_t *)nram_output, (int32_t *)nram_aux, 4, deal_num); -#else - int32_t o_d = output_space.o_d, o_h = output_space.o_h, - o_w = output_space.o_w; - int32_t o_hw = o_h * o_w, o_dhw = o_d * o_h * o_w; - __bang_write_value((int32_t *)nram_aux + 4 * deal_num, deal_num, int(o_dhw)); - __cn_vector_div_s32(deal_num, (int32_t *)nram_aux, (int32_t *)nram_input, - (int32_t *)nram_aux + 4 * deal_num); - __bang_mul_scalar((int32_t *)nram_output, (int32_t *)nram_aux, (int)o_dhw, - deal_num); - __bang_sub((int32_t *)nram_input, (int32_t *)nram_input, (int *)nram_output, - deal_num); - __bang_write_value((int32_t *)nram_aux + 4 * deal_num, deal_num, int(o_hw)); - __cn_vector_div_s32(deal_num, (int32_t *)nram_aux + deal_num, - (int32_t *)nram_input, - (int32_t *)nram_aux + 4 * deal_num); - __bang_mul_scalar((int32_t *)nram_output, (int32_t *)nram_aux + deal_num, - (int)o_hw, deal_num); - __bang_sub((int32_t *)nram_input, (int32_t *)nram_input, - (int32_t *)nram_output, deal_num); - - __bang_write_value((int32_t *)nram_aux + 4 * deal_num, deal_num, int(o_w)); - __cn_vector_div_s32(deal_num, (int32_t *)nram_aux + 2 * deal_num, - (int32_t *)nram_input, - (int32_t *)nram_aux + 4 * deal_num); - __bang_mul_scalar((int32_t *)nram_output, (int32_t *)nram_aux + 2 * deal_num, - (int)o_w, deal_num); - __bang_sub((int32_t *)nram_aux + 3 * deal_num, (int32_t *)nram_input, - (int32_t *)nram_output, deal_num); // w - __bang_transpose((int32_t *)nram_output, (int32_t *)nram_aux, 4, deal_num); -#endif -} - -/* -input: nram_input int32_t l,4 indice_in -output: nram_output int32_t l indice_in_expand -func: generate all_index from n di hi wi index -*/ -__mlu_func__ void genIndiceInExpand(int32_t *nram_output, int32_t *nram_input, - int32_t *nram_aux, int32_t deal_num, - InputSpace input_space) { - __bang_transpose((int32_t *)nram_aux, (int32_t *)nram_input, deal_num, 4); - int32_t i_d = input_space.i_d, i_h = input_space.i_h, i_w = input_space.i_w; - int32_t i_hw = i_h * i_w, i_dhw = i_d * i_h * i_w; - __bang_mul_scalar((int32_t *)nram_aux + 4 * deal_num, - (int32_t *)nram_aux + 2 * deal_num, int32_t(i_w), deal_num); - __bang_add((int32_t *)nram_output, (int32_t *)nram_aux + 4 * deal_num, - (int32_t *)nram_aux + 3 * deal_num, deal_num); - __bang_mul_scalar((int32_t *)nram_aux + 4 * deal_num, - 
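genIndiceOutLast above inverts that flattening with three divide/multiply/subtract rounds (there is no vector modulo, hence the subtract pattern). A scalar reference (illustrative helper, not from the patch):

static void unflattenOutputIndex(int idx, int o_d, int o_h, int o_w,
                                 int *n, int *d, int *h, int *w) {
  int o_hw = o_h * o_w, o_dhw = o_d * o_hw;
  *n = idx / o_dhw; idx -= *n * o_dhw;
  *d = idx / o_hw;  idx -= *d * o_hw;
  *h = idx / o_w;   *w = idx - *h * o_w;  // remainder after the three divides
}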
(int32_t *)nram_aux + deal_num, int32_t(i_hw), deal_num); - __bang_add((int32_t *)nram_output, (int32_t *)nram_output, - (int32_t *)nram_aux + 4 * deal_num, deal_num); - __bang_mul_scalar((int32_t *)nram_aux + 4 * deal_num, (int32_t *)nram_aux, - int32_t(i_dhw), deal_num); - __bang_add((int32_t *)nram_output, (int32_t *)nram_output, - (int32_t *)nram_aux + 4 * deal_num, deal_num); -} -#endif -#endif // KERNELS_GET_INDICE_PAIRS_GET_INDICE_PAIRS_UTILS_H_ diff --git a/kernels/get_indice_pairs/normal_get_indice_pairs.cpp b/kernels/get_indice_pairs/normal_get_indice_pairs.cpp deleted file mode 100644 index 5f96b1053..000000000 --- a/kernels/get_indice_pairs/normal_get_indice_pairs.cpp +++ /dev/null @@ -1,1299 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include -#include -#include - -#include "core/context.h" -#include "core/logging.h" -#include "core/mlu_env.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "kernels/get_indice_pairs/get_indice_pairs_structs.h" -#include "kernels/get_indice_pairs/normal_get_indice_pairs.h" -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" -#include "mlu_op.h" - -static mluOpStatus_t getIndiceMaskAll( - const mluOpTensorDescriptor_t indice_pairs_desc, const int kernel_volume, - const int input_active_site, size_t *size) { - size_t total_size = 0; - total_size = - kernel_volume * input_active_site * sizeof(indice_pairs_desc->dtype); - size[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t getIndiceIndexIn( - const mluOpTensorDescriptor_t indice_pairs_desc, const int kernel_volume, - const int input_active_site, size_t *size) { - size_t total_size = 0; - total_size = - kernel_volume * input_active_site * sizeof(indice_pairs_desc->dtype); - size[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t getIndiceIndexOut( - const mluOpTensorDescriptor_t indice_pairs_desc, const int kernel_volume, - const int input_active_site, size_t *size) { - size_t total_size = 0; - total_size = - kernel_volume * input_active_site * sizeof(indice_pairs_desc->dtype); - size[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t getIndiceOutExpand( - const mluOpTensorDescriptor_t indice_pairs_desc, const int kernel_volume, - const int input_active_site, size_t *size) { - size_t total_size = 0; - total_size = - kernel_volume * input_active_site * sizeof(indice_pairs_desc->dtype); - size[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t getIndiceInExpand( - const mluOpTensorDescriptor_t indice_pairs_desc, - const int input_active_site, size_t *size) { - size_t total_size = 0; - total_size = input_active_site * sizeof(indice_pairs_desc->dtype); - size[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t getIndiceUnique( - const mluOpTensorDescriptor_t indice_pairs_desc, const int kernel_volume, - const int input_active_site, size_t *size) { - size_t total_size = 0; - total_size = (kernel_volume * input_active_site + 1) * - sizeof(indice_pairs_desc->dtype); - size[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t getGridOut(const mluOpTensorDescriptor_t indice_pairs_desc, - int output_size, size_t *size) { - size_t total_size = 0; - total_size = output_size * sizeof(indice_pairs_desc->dtype); - size[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t getReduceOpWS(mluOpHandle_t handle, - const std::string interface_name, - const int kernel_volume, - const int input_active_site, size_t *size) { - size_t total_size = 0; - mluOpTensorDescriptor_t reduce_in_desc, reduce_out_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&reduce_in_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&reduce_out_desc)); - std::vector reduce_in_dims = {kernel_volume, input_active_site}; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor( - reduce_in_desc, MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_INT32, - reduce_in_dims.size(), reduce_in_dims.data())); - reduce_in_dims[1] = 1; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor( 
- reduce_out_desc, MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_INT32, - reduce_in_dims.size(), reduce_in_dims.data())); - // reduce along lowest dimension - int axis[1] = {1}; - int axis_num = 1; - cnnlReduceDescriptor_t reduce_desc; - CALL_CNNL(cnnlCreateReduceDescriptor(&reduce_desc)); - CALL_CNNL(cnnlSetReduceDescriptor( - reduce_desc, axis, axis_num, CNNL_REDUCE_ADD, - cnnlDataType_t(reduce_in_desc->dtype), CNNL_PROPAGATE_NAN, - CNNL_REDUCE_NO_INDICES, CNNL_16BIT_INDICES)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(reduce_in_desc, - cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(reduce_out_desc, - cnnl_output_desc); - CALL_CNNL(cnnlGetReduceOpWorkspaceSize(cnnl_handle, cnnl_input_desc, - cnnl_output_desc, reduce_desc, - &total_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(reduce_in_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(reduce_out_desc)); - CALL_CNNL(cnnlDestroyReduceDescriptor(reduce_desc)); - size[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t getUniqueOpWS(mluOpHandle_t handle, - const std::string interface_name, - const mluOpTensorDescriptor_t indices_desc, - const int kernel_volume, - const int input_active_site, size_t *size) { - size_t total_size = 0; - mluOpTensorDescriptor_t input_unique_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&input_unique_desc)); - std::vector unique_in_dims = {kernel_volume * input_active_site}; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(input_unique_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, unique_in_dims.size(), - unique_in_dims.data())); - - { - cnnlUniqueSort_t unique_mode = CNNL_SORT_ASCEND; - cnnlUniqueDescriptor_t unique_desc; - - CALL_CNNL(cnnlCreateUniqueDescriptor(&unique_desc)); - CALL_CNNL( - cnnlSetUniqueDescriptor(unique_desc, unique_mode, 0, false, false)); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_unique_desc, - cnnl_input_desc); - CALL_CNNL(cnnlGetUniqueWorkspaceSize(cnnl_handle, unique_desc, - cnnl_input_desc, &total_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - - CALL_CNNL(cnnlDestroyUniqueDescriptor(unique_desc)); - } - - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(input_unique_desc)); - - size[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t getNormalGetIndicePairsWorkspaceSize( - mluOpHandle_t handle, const std::string interface_name, - mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const mluOpTensorDescriptor_t indices_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, - const mluOpTensorDescriptor_t out_indices_desc, - const mluOpTensorDescriptor_t indice_num_desc, size_t *return_ws) { - // workspace for get_indice_pairs - size_t total_size = 0; - int sub_m = sparse_conv_desc->sub_m; - int batch = sparse_conv_desc->batch; - int kernel_volume = indice_pairs_desc->dims[0]; - int input_active_site = indice_pairs_desc->dims[2]; - int output_size = batch * sparse_conv_desc->output_space[0] * - sparse_conv_desc->output_space[1] * - sparse_conv_desc->output_space[2] + - 1; - size_t 
mask_all_ws = 0, indice_index_in_ws = 0, indice_index_out_ws = 0; - size_t out_indices_expand_ws = 0, grid_out_ws = 0, reduce_op_ws = 0; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceMaskAll(indice_pairs_desc, kernel_volume, - input_active_site, &mask_all_ws)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceIndexIn(indice_pairs_desc, kernel_volume, - input_active_site, &indice_index_in_ws)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceIndexOut(indice_pairs_desc, kernel_volume, input_active_site, - &indice_index_out_ws)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceOutExpand(indice_pairs_desc, kernel_volume, - input_active_site, &out_indices_expand_ws)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getGridOut(indice_pairs_desc, output_size, &grid_out_ws)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getReduceOpWS(handle, interface_name, kernel_volume, - input_active_site, &reduce_op_ws)); - if (sub_m) { - /* workspace for subm mode - | mask_all |indices_index_in | indices_index_out/ step_index | - indices_in_expand |out_indices_expand| max(grid_out_ws, reduce_op_ws)| - */ - size_t indice_in_expand_ws = 0; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceInExpand(indice_pairs_desc, input_active_site, - &indice_in_expand_ws)); - total_size = mask_all_ws + indice_index_in_ws + indice_index_out_ws + - out_indices_expand_ws + indice_in_expand_ws + - std::max(grid_out_ws, reduce_op_ws); - } else { - /* workspace for default mode - | mask_all | indices_index_in | step_index/ indices_index_out | - out_indices_expand | | out_indices_unique | max(grid_out_ws, reduce_ws, - unique_ws) | - */ - size_t indice_unique_ws = 0, unique_op_ws = 0; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getUniqueOpWS(handle, interface_name, indices_desc, kernel_volume, - input_active_site, &unique_op_ws)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceUnique(indice_pairs_desc, kernel_volume, - input_active_site, &indice_unique_ws)); - total_size = mask_all_ws + indice_index_in_ws + indice_index_out_ws + - out_indices_expand_ws + indice_unique_ws + - std::max(grid_out_ws, std::max(reduce_op_ws, unique_op_ws)); - } - return_ws[0] = total_size; - return MLUOP_STATUS_SUCCESS; -} - -/* DefaultKernel1 -intput: indices l,4 int -output: mask_all k,l int - indice_index_in k,l int - out_indices_expand k,l int -func: gen mask_all, indice_index_in, out_indices_expand for next step. -*/ -mluOpStatus_t launchDefaultKernel1( - mluOpHandle_t handle, - const mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const void *indices, void *mask_all_ws, void *indice_index_in_ws, - void *out_indices_expand_ws, int batch, int kernel_volume, - int input_active_site) { - cnrtDim3_t kDim3; - cnrtFunctionType_t func_type; - int core_dim = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - int cluster_number = mluop::runtime::getClusterLimitCapability(handle); - int core_nums = core_dim * cluster_number; - int nram_size = handle->nram_size + REM_FOR_STACK - 12 * 1024; - int nums = 19 * kernel_volume + 8; - int core_num_l = (nram_size - 4 * 4096 * 3) / nums / sizeof(int); - int jobs = (input_active_site + core_num_l - 1) / core_num_l; - int job_num = jobs > core_nums ? 
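The launcher above sizes one NRAM tile so that nums = 19 * kernel_volume + 8 int32 buffers of tile length, plus a fixed 48 KB reserve, fit in the usable NRAM, then clamps the job count to the available cores. A host-side restatement of that arithmetic (the 384 KB NRAM figure below is only an assumed example, not taken from the patch):

static int tileElems(int nram_bytes, int kernel_volume) {
  int nums = 19 * kernel_volume + 8;  // int32 arrays resident per element
  return (nram_bytes - 4 * 4096 * 3) / nums / static_cast<int>(sizeof(int));
}
// e.g. assuming 384 KB of usable NRAM and k_dhw = 27:
// tileElems(393216, 27) == 165 int32 elements per tile.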
core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; - kDim3.x = 1; - kDim3.y = job_num; - kDim3.z = 1; - /* nram_space */ - // |input| mask_all | indice_index_in | out_indices_expand | l + 3 k l - // |input| mask_all | indice_index_in | out_indices_expand | l + 3 k l - // | nram_aux_a 5 l k | nram_aux_b 8 l k - // ping + pong + aux - FilterSpace filter_space(sparse_conv_desc->filter_space[0], - sparse_conv_desc->filter_space[1], - sparse_conv_desc->filter_space[2]); - InputSpace input_space(sparse_conv_desc->input_space[0], - sparse_conv_desc->input_space[1], - sparse_conv_desc->input_space[2]); - OutputSpace output_space(sparse_conv_desc->output_space[0], - sparse_conv_desc->output_space[1], - sparse_conv_desc->output_space[2]); - Stride stride(sparse_conv_desc->stride[0], sparse_conv_desc->stride[1], - sparse_conv_desc->stride[2]); - Dilation dilation(sparse_conv_desc->dilation[0], - sparse_conv_desc->dilation[1], - sparse_conv_desc->dilation[2]); - Padding padding(sparse_conv_desc->pad[0], sparse_conv_desc->pad[1], - sparse_conv_desc->pad[2]); - VLOG(5) << "[getIndicePairsDefault] Launch kernel " - "KernelDefaultGetIndicePairKl1<<>>"; - CHECK_RETURN( - "[getIndicePairsDefault]", - KernelDefaultGetIndicePairKl1( - kDim3, func_type, handle->queue, (void *)mask_all_ws, - (void *)indice_index_in_ws, (void *)out_indices_expand_ws, - (void *)indices, filter_space, input_space, output_space, stride, - dilation, padding, core_num_l, input_active_site, batch)); - return MLUOP_STATUS_SUCCESS; -} - -/* SubmKernel1 -intput: indices l,4 int -output: mask_all k,l int - indice_index_in k,l int - indice_in_expand l, int - out_indices_expand k,l int -func: gen mask_all, indice_index_in, indice_in_expand, out_indices_expand for -next step. -*/ -mluOpStatus_t launchSubmKernel1( - mluOpHandle_t handle, - const mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const void *indices, void *mask_all_ptr, void *indice_index_in_ptr, - void *indice_in_expand_ptr, void *out_indices_expand_ptr, int batch, - int kernel_volume, int input_active_site) { - cnrtDim3_t kDim3; - cnrtFunctionType_t func_type; - int core_dim = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - int cluster_number = mluop::runtime::getClusterLimitCapability(handle); - int core_nums = core_dim * cluster_number; - int nram_size = handle->nram_size + REM_FOR_STACK - 12 * 1024; - int nums = 19 * kernel_volume + 10; - int core_num_l = (nram_size - 4 * 4096 * 3) / nums / sizeof(int); - int jobs = (input_active_site + core_num_l - 1) / core_num_l; - int least_jobs = (input_active_site * sizeof(int) + 1024 - 1) / 1024; - jobs = std::max(jobs, least_jobs); - int job_num = jobs > core_nums ? 
core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; - kDim3.x = 1; - kDim3.y = job_num; - kDim3.z = 1; - /* nram_space - |input| mask_all | indice_index_in | out_indices_expand | indice_in_expand - |4l + l + 3kl |input| mask_all | indice_index_in | out_indices_expand | - indice_in_expand |4l + l + 3kl | nram_aux_a 5lk | nram_aux_b 8lk | - */ - FilterSpace filter_space(sparse_conv_desc->filter_space[0], - sparse_conv_desc->filter_space[1], - sparse_conv_desc->filter_space[2]); - InputSpace input_space(sparse_conv_desc->input_space[0], - sparse_conv_desc->input_space[1], - sparse_conv_desc->input_space[2]); - OutputSpace output_space(sparse_conv_desc->output_space[0], - sparse_conv_desc->output_space[1], - sparse_conv_desc->output_space[2]); - Stride stride(sparse_conv_desc->stride[0], sparse_conv_desc->stride[1], - sparse_conv_desc->stride[2]); - Dilation dilation(sparse_conv_desc->dilation[0], - sparse_conv_desc->dilation[1], - sparse_conv_desc->dilation[2]); - Padding padding(sparse_conv_desc->pad[0], sparse_conv_desc->pad[1], - sparse_conv_desc->pad[2]); - VLOG(5) << "[getIndicePairsDefault] Launch kernel " - "KernelSubmGetIndicePairKl1<<<kDim3, func_type, queue>>>"; - CHECK_RETURN("[getIndicePairsDefault]", - KernelSubmGetIndicePairKl1( - kDim3, func_type, handle->queue, (void *)mask_all_ptr, - (void *)indice_index_in_ptr, (void *)indice_in_expand_ptr, - (void *)out_indices_expand_ptr, (void *)indices, - filter_space, input_space, output_space, stride, dilation, - padding, core_num_l, input_active_site, batch)); - return MLUOP_STATUS_SUCCESS; -} - -/* SubmKernel2 -input: indices l,4 int - out_indices_index_ptr k,l int - mask_all_ptr k,l int -output: mask_all k,l int - out_indices l,4 int -func: gen out_indices from indices in subm mode; - gen mask_all by AND-ing (out_indices_index_ptr >= 0) with mask_all_ptr. -*/ -mluOpStatus_t launchSubmKernel2(mluOpHandle_t handle, const void *indices, - void *out_indices_index_ptr, void *mask_all_ptr, - void *out_indices, int kernel_volume, - int input_active_site) { - cnrtDim3_t kDim3; - cnrtFunctionType_t func_type; - int core_dim = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - int cluster_number = mluop::runtime::getClusterLimitCapability(handle); - int core_nums = core_dim * cluster_number; - int nram_size = handle->nram_size + REM_FOR_STACK - 12 * 1024; - int core_num_l_two = (nram_size - 4 * 4096 * 3) / 2 / sizeof(int); - int core_num_l_one = (nram_size - 4 * 4096 * 3) / sizeof(int); - int len_1_one = input_active_site * 4; - int len_l_two = input_active_site * kernel_volume; - int jobs_one = (len_1_one + core_num_l_one - 1) / core_num_l_one; - int jobs_two = (len_l_two + core_num_l_two - 1) / core_num_l_two; - int least_job_one = (len_1_one * sizeof(int) + 1024 - 1) / 1024; - int least_job_two = (len_l_two * sizeof(int) + 1024 - 1) / 1024; - int least_jobs = std::max(least_job_one, least_job_two); - int jobs = std::max(std::max(jobs_one, jobs_two), least_jobs); - int job_num = jobs > core_nums ?
core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; - kDim3.x = 1; - kDim3.y = job_num; - kDim3.z = 1; - VLOG(5) << "[getIndicePairsDefault] Launch kernel " - "KernelSubmGetIndicePairKl2<<>>"; - CHECK_RETURN( - "[getIndicePairsDefault]", - KernelSubmGetIndicePairKl2( - kDim3, func_type, handle->queue, (void *)out_indices, - (void *)mask_all_ptr, (void *)out_indices_index_ptr, (void *)indices, - len_1_one, len_l_two, core_num_l_one, core_num_l_two)); - return MLUOP_STATUS_SUCCESS; -} - -// call reduce op -mluOpStatus_t launchReduceOp(mluOpHandle_t handle, - const std::string interface_name, - void *reduce_output_addr, void *reduce_input_addr, - void *reduce_workspace_ptr, size_t reduce_op_ws, - int kernel_volume, int input_active_site) { - mluOpTensorDescriptor_t reduce_in_desc, reduce_out_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&reduce_in_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&reduce_out_desc)); - std::vector reduce_in_dims = {kernel_volume, input_active_site}; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor( - reduce_in_desc, MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_INT32, - reduce_in_dims.size(), reduce_in_dims.data())); - reduce_in_dims[1] = 1; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor( - reduce_out_desc, MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_INT32, - reduce_in_dims.size(), reduce_in_dims.data())); - // reduce along lowest dimension - int axis[1] = {1}; - int axis_num = 1; - cnnlReduceDescriptor_t reduce_desc; - CALL_CNNL(cnnlCreateReduceDescriptor(&reduce_desc)); - CALL_CNNL(cnnlSetReduceDescriptor( - reduce_desc, axis, axis_num, CNNL_REDUCE_ADD, - cnnlDataType_t(reduce_in_desc->dtype), CNNL_PROPAGATE_NAN, - CNNL_REDUCE_NO_INDICES, CNNL_16BIT_INDICES)); - void *alpha = NULL, *beta = NULL, *indices = NULL; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(reduce_in_desc, - cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(reduce_out_desc, - cnnl_output_desc); - CALL_CNNL(cnnlReduce(cnnl_handle, reduce_desc, reduce_workspace_ptr, - reduce_op_ws, alpha, cnnl_input_desc, - reduce_input_addr, 0, indices, beta, cnnl_output_desc, - reduce_output_addr)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(reduce_in_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(reduce_out_desc)); - CALL_CNNL(cnnlDestroyReduceDescriptor(reduce_desc)); - return MLUOP_STATUS_SUCCESS; -} - -// call unqiue_v2 op -mluOpStatus_t launchUniqueOp(mluOpHandle_t handle, - const std::string interface_name, - void *unique_output_addr, void *unique_input_addr, - void *unique_output_num_addr, - void *unique_workspace_ptr, size_t unique_op_ws, - int kernel_volume, int input_active_site, - int *return_num_act) { - mluOpTensorDescriptor_t unique_input_desc, unique_output_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&unique_input_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&unique_output_desc)); - std::vector unique_in_dims = {kernel_volume * input_active_site}; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - 
mluOpSetTensorDescriptor(unique_input_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, unique_in_dims.size(), - unique_in_dims.data())); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(unique_output_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, unique_in_dims.size(), - unique_in_dims.data())); - { - cnnlUniqueSort_t unique_mode = CNNL_SORT_ASCEND; - cnnlUniqueDescriptor_t unique_desc; - - CALL_CNNL(cnnlCreateUniqueDescriptor(&unique_desc)); - CALL_CNNL( - cnnlSetUniqueDescriptor(unique_desc, unique_mode, 0, false, false)); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(unique_input_desc, - cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(unique_output_desc, - cnnl_output_desc); - CALL_CNNL(cnnlUnique_v2(cnnl_handle, unique_desc, cnnl_input_desc, - unique_input_addr, unique_workspace_ptr, - unique_op_ws, (int *)unique_output_num_addr, - cnnl_output_desc, unique_output_addr, nullptr, - nullptr, nullptr, nullptr)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - - CALL_CNNL(cnnlDestroyUniqueDescriptor(unique_desc)); - } - cnrtQueueSync(handle->queue); - cnrtMemcpy(return_num_act, unique_output_num_addr, sizeof(int), - CNRT_MEM_TRANS_DIR_DEV2HOST); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(unique_input_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(unique_output_desc)); - return MLUOP_STATUS_SUCCESS; -} - -/* -DefaultKernel2 -input: num_act_out -output: step_index -func: generate a tensor containing 0..num_act_out-1 continuously -*/ -mluOpStatus_t launchDefaultKernel2(mluOpHandle_t handle, - void *step_index_output_ptr, - int num_act_out) { - cnrtDim3_t kDim3; - cnrtFunctionType_t func_type; - int core_dim = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - int cluster_number = mluop::runtime::getClusterLimitCapability(handle); - int core_nums = core_dim * cluster_number; - int nram_size = handle->nram_size + REM_FOR_STACK - 12 * 1024; - int core_num_l = (nram_size - 4 * 4096 * 3) / sizeof(int); - int jobs = (num_act_out + core_num_l - 1) / core_num_l; - int job_num = jobs > core_nums ?
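launchUniqueOp above sorts the expanded output indices ascending and keeps each distinct value once; the element count written back is num_act_out. A host-side equivalent using the standard library (illustrative only, not the CNNL internals):

#include <algorithm>
#include <vector>

static int uniqueSorted(std::vector<int> &v) {
  std::sort(v.begin(), v.end());                      // CNNL_SORT_ASCEND
  v.erase(std::unique(v.begin(), v.end()), v.end());  // drop duplicates
  return static_cast<int>(v.size());                  // num_act_out
}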
core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; - kDim3.x = 1; - kDim3.y = job_num; - kDim3.z = 1; - VLOG(5) << "[getIndicePairsDefault] Launch kernel " - "KernelDefaultGetIndicePairKl2<<>>"; - CHECK_RETURN("[getIndicePairsDefault]", - KernelDefaultGetIndicePairKl2(kDim3, func_type, handle->queue, - step_index_output_ptr, num_act_out, - core_num_l)); - return MLUOP_STATUS_SUCCESS; -} - -/* -BalanceKernel -input: out_indices_expand_ptr -mask : mask_all_ptr -output: out_indices_expand_ptr -func: balance index distribution -*/ -mluOpStatus_t launchBalanceKernel(mluOpHandle_t handle, - const std::string interface_name, - void *balance_input_addr, - void *balance_output_addr, - void *balance_mask_addr, - int input_active_site, int kernel_volume, - int output_size) { - cnrtDim3_t kDim3; - cnrtFunctionType_t func_type; - int core_dim = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - int cluster_number = mluop::runtime::getClusterLimitCapability(handle); - int core_nums = core_dim * cluster_number; - int nram_size = handle->nram_size + REM_FOR_STACK - 12 * 1024; - int core_num_l = (nram_size - 4 * 4096 * 3) / 8 / sizeof(int); - int jobs = (input_active_site * kernel_volume + core_num_l - 1) / core_num_l; - int job_num = jobs > core_nums ? core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; - kDim3.x = 1; - kDim3.y = job_num; - kDim3.z = 1; - VLOG(5) << "[getIndicePairsDefault] Launch kernel " - "KernelBalanceGetIndicePair<<>>"; - CHECK_RETURN("[getIndicePairsDefault]", - KernelBalanceGetIndicePair( - kDim3, func_type, handle->queue, balance_input_addr, - balance_mask_addr, balance_output_addr, input_active_site, - kernel_volume, core_num_l, output_size)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t launchFillOp(mluOpHandle_t handle, - const std::string interface_name, - void *mluOp_fill_addr, int output_size, - int fill_value) { - mluOpTensorDescriptor_t fill_tensor_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&fill_tensor_desc)); - std::vector fill_in_dims = {output_size}; - INTERNAL_CHECK(interface_name, MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor( - fill_tensor_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, fill_in_dims.size(), - fill_in_dims.data())); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(fill_tensor_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, mluOp_fill_addr)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(fill_tensor_desc)); - return MLUOP_STATUS_SUCCESS; -} - -// call scatter_nd op -mluOpStatus_t launchScatterNdOp(mluOpHandle_t handle, - const std::string interface_name, - void *scatter_output_addr, - void *scatter_input_addr, - void *scatter_indice_addr, int output_size, - int num_act_out) { - VLOG(5) << interface_name << " call scatterNd"; - cnnlScatterNdMode_t scatter_mode = CNNL_SCATTERND_UPDATE; - mluOpTensorDescriptor_t scatter_input_desc, scatter_output_desc, - scatter_indice_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&scatter_input_desc)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - mluOpCreateTensorDescriptor(&scatter_output_desc)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - mluOpCreateTensorDescriptor(&scatter_indice_desc)); - 
std::vector scatter_in_dims = {num_act_out}; - std::vector scatter_out_dims = {output_size}; - std::vector scatter_indice_dims = {num_act_out, 1}; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpSetTensorDescriptor( - scatter_indice_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, scatter_indice_dims.size(), - scatter_indice_dims.data())); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(scatter_input_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, scatter_in_dims.size(), - scatter_in_dims.data())); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(scatter_output_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, scatter_out_dims.size(), - scatter_out_dims.data())); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(scatter_indice_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(scatter_input_desc, - cnnl_updates_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(scatter_output_desc, - cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(scatter_output_desc, - cnnl_output_desc); - - CALL_CNNL(cnnlScatterNd_v2( - cnnl_handle, scatter_mode, cnnl_indices_desc, scatter_indice_addr, - cnnl_updates_desc, scatter_input_addr, cnnl_input_desc, - scatter_output_addr, cnnl_output_desc, scatter_output_addr)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_updates_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(scatter_input_desc)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - mluOpDestroyTensorDescriptor(scatter_output_desc)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - mluOpDestroyTensorDescriptor(scatter_indice_desc)); - return MLUOP_STATUS_SUCCESS; -} - -// call gather_nd op -mluOpStatus_t launchGatherNdOp(mluOpHandle_t handle, - const std::string interface_name, - void *gather_input_addr, - void *gather_output_addr, - void *gather_indice_addr, int input_active_site, - int kernel_volume, int output_size) { - VLOG(5) << interface_name << " call gatherNd"; - mluOpTensorDescriptor_t gather_input_desc, gather_output_desc, - gather_indice_desc; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&gather_input_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&gather_output_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpCreateTensorDescriptor(&gather_indice_desc)); - std::vector gather_in_dims = {output_size}; - std::vector gather_indices_dims = {input_active_site * kernel_volume, 1}; - std::vector gather_out_dims = {input_active_site * kernel_volume}; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpSetTensorDescriptor( - gather_indice_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, gather_indices_dims.size(), - gather_indices_dims.data())); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(gather_input_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, gather_in_dims.size(), - gather_in_dims.data())); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(gather_output_desc, MLUOP_LAYOUT_ARRAY, - MLUOP_DTYPE_INT32, gather_out_dims.size(), - 
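The fill/scatter/gather trio wired together by the callers of these wrappers builds a dense lookup table: after a fill of -1, grid[unique_out[i]] = i (scatter), then each expanded index reads its compact output id back via gather. A host-side sketch of the lookup construction (names illustrative, not from the patch):

#include <vector>

static std::vector<int> buildGridLookup(const std::vector<int> &unique_out,
                                        int output_size) {
  std::vector<int> grid(output_size, -1);  // launchFillOp(-1) analogue
  for (int i = 0; i < (int)unique_out.size(); ++i)
    grid[unique_out[i]] = i;               // scatter: position -> compact id
  return grid;                             // gather step: id = grid[expanded]
}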
gather_out_dims.data())); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(gather_input_desc, - cnnl_params_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(gather_indice_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(gather_output_desc, - cnnl_output_desc); - CALL_CNNL(cnnlGatherNd(cnnl_handle, cnnl_params_desc, gather_input_addr, - cnnl_indices_desc, gather_indice_addr, - cnnl_output_desc, gather_output_addr)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_params_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(gather_input_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(gather_output_desc)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(gather_indice_desc)); - return MLUOP_STATUS_SUCCESS; -} - -/* DefaultKernel3 -input: tensor1: kl int32 indice_index_in - tensor2: kl int32 indice_index_out - tensor3: kl int32 mask -output: tensor: k2l -func: maskmove efficient data continuously by collect insts -*/ -mluOpStatus_t launchDefaultKernel3(mluOpHandle_t handle, void *output_addr, - void *input_addr, void *mask_addr, - int input_active_site, int kernel_volume) { - cnrtDim3_t kDim3; - cnrtFunctionType_t func_type; - int core_dim = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - int cluster_number = mluop::runtime::getClusterLimitCapability(handle); - int core_nums = core_dim * cluster_number; - int nram_size = handle->nram_size + REM_FOR_STACK - 12 * 1024; - int core_num_l = (nram_size - 4 * 4096 * 3) / 4 / sizeof(int); - int jobs = 2 * kernel_volume; - int job_num = jobs > core_nums ? core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; - kDim3.x = 1; - kDim3.y = job_num; - kDim3.z = 1; - VLOG(5) << "[getIndicePairsDefault] Launch kernel " - "KernelDefaultGetIndicePairKl3<<>>"; - CHECK_RETURN("[getIndicePairsDefault]", - KernelDefaultGetIndicePairKl3( - kDim3, func_type, handle->queue, output_addr, input_addr, - mask_addr, input_active_site, kernel_volume, core_num_l)); - return MLUOP_STATUS_SUCCESS; -} - -/* -DefaultKernel4 -input: tensor num_act_out int -output: tensor num_act_out,4 int -func: generate tensor incluing 0-num_act_out continuously -*/ -mluOpStatus_t launchDefaultKernel4( - mluOpHandle_t handle, - const mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - void *output_addr, void *input_addr, int num_act_out) { - cnrtDim3_t kDim3; - cnrtFunctionType_t func_type; - int core_dim = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - int cluster_number = mluop::runtime::getClusterLimitCapability(handle); - int core_nums = core_dim * cluster_number; - int nram_size = handle->nram_size + REM_FOR_STACK - 12 * 1024; - int core_num_split = 0; - if (handle->arch >= MLUOP_MLU590) { - core_num_split = 14; - } else { - core_num_split = 15; - } - int core_num_l = (nram_size - 4 * 4096 * 3) / core_num_split / sizeof(int); - int jobs = (num_act_out + core_num_l - 1) / core_num_l; - int job_num = jobs > core_nums ? 
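DefaultKernel3 above compacts each kernel-offset row of the index tensors so the active pairs sit contiguously at the front (the device version uses collect instructions for this). A scalar reference for one row (illustrative helper, not from the patch):

static int compactByMask(const int *in, const int *mask, int n, int *out) {
  int m = 0;
  for (int i = 0; i < n; ++i)
    if (mask[i]) out[m++] = in[i];  // keep masked-in entries, stable order
  return m;                         // per-row active count, cf. indice_num
}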
core_nums : jobs; - func_type = CNRT_FUNC_TYPE_BLOCK; - kDim3.x = 1; - kDim3.y = job_num; - kDim3.z = 1; - OutputSpace output_space(sparse_conv_desc->output_space[0], - sparse_conv_desc->output_space[1], - sparse_conv_desc->output_space[2]); - - VLOG(5) << "[getIndicePairsDefault] Launch kernel " - "KernelDefaultGetIndicePairKl4<<<kDim3, func_type, queue>>>"; - CHECK_RETURN("[getIndicePairsDefault]", - KernelDefaultGetIndicePairKl4( - kDim3, func_type, handle->queue, output_addr, input_addr, - output_space, num_act_out, core_num_l)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t NormalGetIndicePairsKernel( - mluOpHandle_t handle, const std::string interface_name, - mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const mluOpTensorDescriptor_t indices_desc, const void *indices, - void *workspace, const mluOpTensorDescriptor_t indice_pairs_desc, - void *indice_pairs, const mluOpTensorDescriptor_t out_indices_desc, - void *out_indices, const mluOpTensorDescriptor_t indice_num_desc, - void *indice_num) { - int sub_m = sparse_conv_desc->sub_m; - int batch = sparse_conv_desc->batch; - int kernel_volume = indice_pairs_desc->dims[0]; - int input_active_site = indice_pairs_desc->dims[2]; - int output_size = batch * sparse_conv_desc->output_space[0] * - sparse_conv_desc->output_space[1] * - sparse_conv_desc->output_space[2] + - 1; - - if (sub_m) { - /* workspace for subm mode - | mask_all | indice_index_in | indice_index_out / step_index | - indice_in_expand | out_indices_expand | max(grid_out, reduce_op_ws) | - */ - size_t mask_all_ws = 0, indice_index_in_ws = 0, indice_index_out_ws = 0; - size_t indice_in_expand_ws = 0, out_indices_expand_ws = 0, grid_out_ws = 0; - size_t reduce_op_ws = 0; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceMaskAll(indice_pairs_desc, kernel_volume, - input_active_site, &mask_all_ws)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceIndexIn(indice_pairs_desc, kernel_volume, - input_active_site, &indice_index_in_ws)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceIndexOut(indice_pairs_desc, kernel_volume, - input_active_site, &indice_index_out_ws)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceOutExpand(indice_pairs_desc, kernel_volume, - input_active_site, &out_indices_expand_ws)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceInExpand(indice_pairs_desc, input_active_site, - &indice_in_expand_ws)); - INTERNAL_CHECK(interface_name, MLUOP_STATUS_SUCCESS == - getGridOut(indice_pairs_desc, - output_size, &grid_out_ws)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getReduceOpWS(handle, interface_name, kernel_volume, - input_active_site, &reduce_op_ws)); - const void *compute_indices_ptr = indices; - void *mask_all_ptr = (void *)((char *)workspace); - void *indice_index_in_ptr = (void *)((char *)workspace + mask_all_ws); - void *indice_in_expand_ptr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws + - indice_index_out_ws); - void *out_indices_expand_ptr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws + - indice_index_out_ws + indice_in_expand_ws); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchSubmKernel1(handle, sparse_conv_desc, compute_indices_ptr, - mask_all_ptr, indice_index_in_ptr, - indice_in_expand_ptr, out_indices_expand_ptr, - batch, kernel_volume, input_active_site)); - - // call launchDefaultKernel2 gen step_index - void *step_index_addr = NULL; - step_index_addr = - (void 
*)((char *)(char *)workspace + mask_all_ws + indice_index_in_ws); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchDefaultKernel2(handle, step_index_addr, input_active_site)); - - // call scatter_nd unique_output_addr + step_index_addr = grid_out_addr - void *scatter_input_addr = NULL, *scatter_output_addr = NULL, - *scatter_indice_addr = NULL; - scatter_input_addr = step_index_addr; - scatter_indice_addr = indice_in_expand_ptr; - scatter_output_addr = (void *)((char *)workspace + mask_all_ws + - indice_index_in_ws + indice_index_out_ws + - indice_in_expand_ws + out_indices_expand_ws); - int fill_value = -1; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - launchFillOp(handle, interface_name, scatter_output_addr, - output_size, fill_value)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchScatterNdOp(handle, interface_name, scatter_output_addr, - scatter_input_addr, scatter_indice_addr, - output_size, input_active_site)); - - // call gather_nd out_indices_expand + grid_out_addr = indice_index_out - void *gather_input_addr = NULL, *gather_output_addr = NULL, - *gather_indice_addr = NULL; - gather_output_addr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws); - gather_input_addr = scatter_output_addr; - gather_indice_addr = out_indices_expand_ptr; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchGatherNdOp(handle, interface_name, gather_input_addr, - gather_output_addr, gather_indice_addr, - input_active_site, kernel_volume, output_size)); - - // call sumb_kernel2 indice_index_out and mask_all = mask_all - // get out_indices from indices - const void *kernel2_input1_addr = NULL; - void *kernel2_input2_addr = NULL, *kernel2_output1_addr = NULL, - *kernel2_output2_addr = NULL; - kernel2_input1_addr = indices; - kernel2_input2_addr = gather_output_addr; - kernel2_output1_addr = mask_all_ptr; - kernel2_output2_addr = out_indices; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchSubmKernel2(handle, kernel2_input1_addr, kernel2_input2_addr, - kernel2_output1_addr, kernel2_output2_addr, - kernel_volume, input_active_site)); - - // call reduceOp - void *reduce_input_addr = NULL, *reduce_output_addr = NULL; - reduce_input_addr = mask_all_ptr; - reduce_output_addr = indice_num; - void *reduce_workspace_ptr = NULL; - if (reduce_op_ws > 0) { - reduce_workspace_ptr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws + - indice_index_out_ws + indice_in_expand_ws + - out_indices_expand_ws); - } - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchReduceOp(handle, interface_name, reduce_output_addr, - reduce_input_addr, reduce_workspace_ptr, - reduce_op_ws, kernel_volume, input_active_site)); - - // call launchDefaultKernel3 l k partition and sort - void *kernel3_input_addr = NULL, *kernel3_output_addr = NULL, - *kernel3_mask_addr = NULL; - kernel3_input_addr = indice_index_in_ptr; - kernel3_output_addr = indice_pairs; - kernel3_mask_addr = mask_all_ptr; - fill_value = -1; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchFillOp(handle, interface_name, indice_pairs, - kernel_volume * 2 * input_active_site, fill_value)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchDefaultKernel3(handle, kernel3_output_addr, - kernel3_input_addr, kernel3_mask_addr, - input_active_site, kernel_volume)); - } else { - /* workspace for default mode - | mask_all | indices_index_in | step_index/ indices_index_out | - out_indices_expand | | 
out_indices_unique | max(grid_out_ws, reduce_ws, - unique_ws) | - */ - size_t mask_all_ws = 0, indice_index_in_ws = 0, indice_index_out_ws = 0; - size_t out_indices_expand_ws = 0, indice_unique_ws = 0, grid_out_ws = 0; - size_t reduce_op_ws = 0, unique_op_ws = 0; - - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceMaskAll(indice_pairs_desc, kernel_volume, - input_active_site, &mask_all_ws)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceIndexIn(indice_pairs_desc, kernel_volume, - input_active_site, &indice_index_in_ws)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceIndexOut(indice_pairs_desc, kernel_volume, - input_active_site, &indice_index_out_ws)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceOutExpand(indice_pairs_desc, kernel_volume, - input_active_site, &out_indices_expand_ws)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getIndiceUnique(indice_pairs_desc, kernel_volume, - input_active_site, &indice_unique_ws)); - INTERNAL_CHECK(interface_name, MLUOP_STATUS_SUCCESS == - getGridOut(indice_pairs_desc, - output_size, &grid_out_ws)); - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - getReduceOpWS(handle, interface_name, kernel_volume, - input_active_site, &reduce_op_ws)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - getUniqueOpWS(handle, interface_name, indices_desc, kernel_volume, - input_active_site, &unique_op_ws)); - const void *compute_indices_ptr = indices; - void *mask_all_ptr = (void *)((char *)workspace); - void *indice_index_in_ptr = (void *)((char *)workspace + mask_all_ws); - void *out_indices_expand_ptr = - (void *)((char *)workspace + mask_all_ws + indice_index_out_ws + - indice_index_in_ws); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchDefaultKernel1(handle, sparse_conv_desc, compute_indices_ptr, - mask_all_ptr, indice_index_in_ptr, - out_indices_expand_ptr, batch, kernel_volume, - input_active_site)); - - // call reduce_sum mask_all to indice_num - void *reduce_input_addr = NULL, *reduce_output_addr = NULL; - reduce_input_addr = mask_all_ptr; - reduce_output_addr = indice_num; - void *reduce_workspace_ptr = NULL; - if (reduce_op_ws > 0) { - reduce_workspace_ptr = (void *)((char *)workspace + mask_all_ws + - indice_index_in_ws + indice_index_out_ws + - out_indices_expand_ws + indice_unique_ws); - } - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchReduceOp(handle, interface_name, reduce_output_addr, - reduce_input_addr, reduce_workspace_ptr, - reduce_op_ws, kernel_volume, input_active_site)); - - // call unique_v2 out_indices_expand_ptr indice_unique_ws_ptr - int num_act_out = 0; - void *unique_input_addr = NULL, *unique_output_addr = NULL, - *unique_output_num_addr = NULL; - unique_input_addr = out_indices_expand_ptr; - unique_output_addr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws + - indice_index_out_ws + out_indices_expand_ws); - unique_output_num_addr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws); - void *unique_workspace_ptr = NULL; - if (unique_op_ws > 0) { - unique_workspace_ptr = (void *)((char *)workspace + mask_all_ws + - indice_index_in_ws + indice_index_out_ws + - out_indices_expand_ws + indice_unique_ws); - } - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchUniqueOp(handle, interface_name, unique_output_addr, - unique_input_addr, unique_output_num_addr, - unique_workspace_ptr, unique_op_ws, kernel_volume, 
- input_active_site, &num_act_out)); - - if (num_act_out != kernel_volume * input_active_site) { - num_act_out = num_act_out - 1; - } - if (num_act_out <= 0) { - // fill indice_pairs -1 indice_num 0 - int fill_value = -1; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchFillOp(handle, interface_name, indice_pairs, - kernel_volume * 2 * input_active_site, fill_value)); - fill_value = 0; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - launchFillOp(handle, interface_name, indice_num, - kernel_volume, fill_value)); - return MLUOP_STATUS_SUCCESS; - } - sparse_conv_desc->num_act_out = num_act_out; - // call launchDefaultKernel2 gen step_index - void *step_index_addr = NULL; - step_index_addr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchDefaultKernel2(handle, step_index_addr, num_act_out)); - - // call balance out_indices_expand_ptr distr - void *balance_input_addr = NULL, *balance_output_addr = NULL, - *balance_mask_addr = NULL; - balance_input_addr = out_indices_expand_ptr; - balance_output_addr = out_indices_expand_ptr; - balance_mask_addr = mask_all_ptr; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchBalanceKernel(handle, interface_name, balance_input_addr, - balance_output_addr, balance_mask_addr, - input_active_site, kernel_volume, output_size)); - - // call scatter_nd unique_output_addr + step_index_addr = grid_out_addr - void *scatter_input_addr = NULL, *scatter_output_addr = NULL, - *scatter_indice_addr = NULL; - scatter_input_addr = step_index_addr; - scatter_indice_addr = unique_output_addr; - scatter_output_addr = (void *)((char *)workspace + mask_all_ws + - indice_index_in_ws + indice_index_out_ws + - out_indices_expand_ws + indice_unique_ws); - int fill_value = -1; - INTERNAL_CHECK(interface_name, - MLUOP_STATUS_SUCCESS == - launchFillOp(handle, interface_name, scatter_output_addr, - output_size, fill_value)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchScatterNdOp(handle, interface_name, scatter_output_addr, - scatter_input_addr, scatter_indice_addr, - output_size, num_act_out)); - - // call gather_nd out_indices_expand + grid_out_addr = indice_index_out - void *gather_input_addr = NULL, *gather_output_addr = NULL, - *gather_indice_addr = NULL; - gather_output_addr = - (void *)((char *)workspace + mask_all_ws + indice_index_in_ws); - gather_input_addr = scatter_output_addr; - gather_indice_addr = out_indices_expand_ptr; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchGatherNdOp(handle, interface_name, gather_input_addr, - gather_output_addr, gather_indice_addr, - input_active_site, kernel_volume, output_size)); - - // call launchDefaultKernel3 l k partition and sort - void *kernel3_input_addr = NULL, *kernel3_output_addr = NULL, - *kernel3_mask_addr = NULL; - kernel3_input_addr = indice_index_in_ptr; - kernel3_output_addr = indice_pairs; - kernel3_mask_addr = mask_all_ptr; - fill_value = -1; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchFillOp(handle, interface_name, indice_pairs, - kernel_volume * 2 * input_active_site, fill_value)); - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchDefaultKernel3(handle, kernel3_output_addr, - kernel3_input_addr, kernel3_mask_addr, - input_active_site, kernel_volume)); - - // get out_indices from indice unique - void *kernel4_output_addr = NULL, *kernel4_input_addr = NULL; - kernel4_input_addr = 
unique_output_addr; - kernel4_output_addr = out_indices; - INTERNAL_CHECK( - interface_name, - MLUOP_STATUS_SUCCESS == - launchDefaultKernel4(handle, sparse_conv_desc, kernel4_output_addr, - kernel4_input_addr, num_act_out)); - } - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t normalGetIndicePairs( - mluOpHandle_t handle, const std::string interface_name, - mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const mluOpTensorDescriptor_t indices_desc, const void *indices, - void *workspace, size_t workspace_size, - const mluOpTensorDescriptor_t indice_pairs_desc, void *indice_pairs, - const mluOpTensorDescriptor_t out_indices_desc, void *out_indices, - const mluOpTensorDescriptor_t indice_num_desc, void *indice_num, - const bool is_get_workspace, size_t *return_ws) { - if (is_get_workspace) { - return getNormalGetIndicePairsWorkspaceSize( - handle, interface_name, sparse_conv_desc, indices_desc, - indice_pairs_desc, out_indices_desc, indice_num_desc, return_ws); - } else { - return NormalGetIndicePairsKernel( - handle, interface_name, sparse_conv_desc, indices_desc, indices, - workspace, indice_pairs_desc, indice_pairs, out_indices_desc, - out_indices, indice_num_desc, indice_num); - } -} diff --git a/kernels/get_indice_pairs/normal_get_indice_pairs.h b/kernels/get_indice_pairs/normal_get_indice_pairs.h deleted file mode 100644 index a48a57b11..000000000 --- a/kernels/get_indice_pairs/normal_get_indice_pairs.h +++ /dev/null @@ -1,89 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#ifndef KERNELS_GET_INDICE_PAIRS_NORMAL_GET_INDICE_PAIRS_H_ -#define KERNELS_GET_INDICE_PAIRS_NORMAL_GET_INDICE_PAIRS_H_ - -#include - -#include "kernels/get_indice_pairs/get_indice_pairs_structs.h" -#include "mlu_op.h" - -mluOpStatus_t MLUOP_WIN_API KernelDefaultGetIndicePairKl1( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *mask_all_ws, void *indice_index_in_ws, void *indice_out_expand_ws, - void *indices, FilterSpace filter_space, InputSpace input_space, - OutputSpace output_space, Stride stride, Dilation dilation, Padding padding, - int32_t core_num_l, int32_t input_active_site, int32_t batch_size); - -mluOpStatus_t MLUOP_WIN_API KernelDefaultGetIndicePairKl2( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *step_index_ptr, int32_t num_act_out, int32_t core_num_l); - -mluOpStatus_t MLUOP_WIN_API KernelDefaultGetIndicePairKl3( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *indice_pairs, void *input_addr, void *mask_addr, - int32_t input_active_site, int32_t kernel_volume, int32_t core_num_l); - -mluOpStatus_t MLUOP_WIN_API KernelDefaultGetIndicePairKl4( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *out_indices, void *input_addr, OutputSpace host_output_space, - int32_t len_l, int32_t core_num_l); - -mluOpStatus_t MLUOP_WIN_API KernelBalanceGetIndicePair( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *balance_input, void *balance_mask, void *balance_output, - int32_t len_l, int32_t kernel_volume, int32_t core_num_l, - int32_t output_size); - -mluOpStatus_t MLUOP_WIN_API KernelSubmGetIndicePairKl1( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *mask_all_ptr, void *indice_index_in_ptr, void *indice_in_expand_ptr, - void *out_indices_expand_ptr, void *indices, FilterSpace host_filter_space, - InputSpace host_input_space, OutputSpace host_output_space, - Stride host_stride, Dilation host_dilation, Padding host_padding, - int32_t core_num_l, int32_t input_active_site, int32_t batch_size); - -mluOpStatus_t MLUOP_WIN_API KernelSubmGetIndicePairKl2( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - void *out_indices, void *mask_all_ptr, void *out_indices_expand_ptr, - void *indices, int32_t len_1_one, int32_t len_l_two, int32_t core_num_l_one, - int32_t core_num_l_two); - -mluOpStatus_t getNormalGetIndicePairsWorkspaceSize( - mluOpHandle_t handle, const std::string interface_name, - const mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const mluOpTensorDescriptor_t indices_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, - const mluOpTensorDescriptor_t out_indices_desc, - const mluOpTensorDescriptor_t indice_num_desc, size_t *return_ws); - -mluOpStatus_t normalGetIndicePairs( - mluOpHandle_t handle, const std::string interface_name, - const mluOpSparseConvolutionDescriptor_t sparse_conv_desc, - const mluOpTensorDescriptor_t indices_desc, const void *indices, - void *workspace, size_t workspace_size, - const mluOpTensorDescriptor_t indice_pairs_desc, void *indice_pairs, - const mluOpTensorDescriptor_t out_indices_desc, void *out_indices, - const mluOpTensorDescriptor_t indice_num_desc, void *indice_num, - const bool is_get_workspace, size_t *return_ws); -#endif // KERNELS_GET_INDICE_PAIRS_NORMAL_GET_INDICE_PAIRS_H_ diff --git a/kernels/indice_convolution_backward_data/indice_convolution_backward_data.cpp 
b/kernels/indice_convolution_backward_data/indice_convolution_backward_data.cpp deleted file mode 100644 index ea0860f90..000000000 --- a/kernels/indice_convolution_backward_data/indice_convolution_backward_data.cpp +++ /dev/null @@ -1,904 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "kernels/indice_convolution_backward_data/indice_convolution_backward_data.h" - -#include -#include - -#include "core/context.h" -#include "core/gen_case.h" -#include "kernels/get_indice_pairs/get_indice_pairs_structs.h" -#include "kernels/utils/cnnl_helper.h" -#include "mlu_op.h" - -static mluOpStatus_t foolCheckNoPtr( - mluOpHandle_t handle, const mluOpTensorDescriptor_t output_grad_desc, - const mluOpTensorDescriptor_t filters_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, const int64_t indice_num[], - const int64_t inverse, const int64_t sub_m, - const mluOpTensorDescriptor_t input_grad_desc, bool *is_zero) { - std::string api = "[mluOpIndiceConvolutionBackwardData]"; - // check nullptr - PARAM_CHECK(api, handle != NULL); - PARAM_CHECK(api, output_grad_desc != NULL); - PARAM_CHECK(api, filters_desc != NULL); - PARAM_CHECK(api, indice_pairs_desc != NULL); - PARAM_CHECK(api, input_grad_desc != NULL); - - // check platform - if (handle->arch < 372) { - LOG(ERROR) << api << " Only support hardware over MLU300 ."; - return MLUOP_STATUS_ARCH_MISMATCH; - } - - // check dim - PARAM_CHECK_EQ(api, output_grad_desc->dim, 2); - PARAM_CHECK(api, filters_desc->dim == 4 || filters_desc->dim == 5); - PARAM_CHECK_EQ(api, indice_pairs_desc->dim, 3); - PARAM_CHECK_EQ(api, input_grad_desc->dim, 2); - - // check shape - PARAM_CHECK(api, indice_pairs_desc->dims[1] == 2); - if (indice_pairs_desc->dims[2] > INDICE_IN_LARGE_TENSOR_NUM) { - LOG(ERROR) << api << " Check failed: " - << "indice_pairs_desc->dims[2] cannot be greater than " - << INDICE_IN_LARGE_TENSOR_NUM << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - // check dtype - PARAM_CHECK(api, output_grad_desc->dtype == MLUOP_DTYPE_FLOAT || - output_grad_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK(api, filters_desc->dtype == MLUOP_DTYPE_FLOAT || - filters_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK(api, input_grad_desc->dtype == MLUOP_DTYPE_FLOAT || - input_grad_desc->dtype == MLUOP_DTYPE_HALF); - 
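[Editor's note] The guard block above is a long run of PARAM_CHECK/LOG calls. As a rough, hypothetical sketch of the pattern (illustrative stand-ins, not the real mlu-ops macro definitions), each check reduces to "log the failed predicate and return a bad-param status":

```cpp
// Hypothetical stand-in for the PARAM_CHECK pattern used above. The real
// mlu-ops macros differ in detail (logging backend, status enum); this only
// shows the control flow: bail out on the first failed predicate.
#include <iostream>
#include <string>

enum class Status { kSuccess, kBadParam };

#define CHECK_PARAM(api, cond)                                    \
  do {                                                            \
    if (!(cond)) {                                                \
      std::cerr << (api) << " Check failed: " #cond << std::endl; \
      return Status::kBadParam;                                   \
    }                                                             \
  } while (0)

// Example: the dtype agreement checks above, reduced to plain ints.
Status checkGradDtypes(int output_grad_dtype, int filters_dtype,
                       int input_grad_dtype) {
  const std::string api = "[mluOpIndiceConvolutionBackwardData]";
  CHECK_PARAM(api, output_grad_dtype == filters_dtype);
  CHECK_PARAM(api, output_grad_dtype == input_grad_dtype);
  return Status::kSuccess;
}
```

The do/while(0) wrapper is what lets a multi-statement macro sit safely inside an unbraced if/else, which is why the pattern survives hundreds of call sites in files like this one.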
PARAM_CHECK(api, indice_pairs_desc->dtype == MLUOP_DTYPE_INT32); - - // check layout - bool layout_check = filters_desc->layout == MLUOP_LAYOUT_NHWC || - filters_desc->layout == MLUOP_LAYOUT_NCHW || - filters_desc->layout == MLUOP_LAYOUT_HWCN || - filters_desc->layout == MLUOP_LAYOUT_NCDHW || - filters_desc->layout == MLUOP_LAYOUT_NDHWC || - filters_desc->layout == MLUOP_LAYOUT_ARRAY; - if (!layout_check) { - LOG(ERROR) << api - << " The filters tensor only supports " - "NHWC/NCHW/HWCN/NCDHW/NDHWC/ARRAY layout."; - return MLUOP_STATUS_BAD_PARAM; - } - - // get filters params - int kd = 1, kh = 1, kw = 1, dyc = 1, dxc = 1; - if (filters_desc->layout != MLUOP_LAYOUT_ARRAY) { - kh = mluOpGetTensordimH(filters_desc); - kw = mluOpGetTensordimW(filters_desc); - dyc = mluOpGetTensordimN(filters_desc); - dxc = mluOpGetTensordimC(filters_desc); - if (filters_desc->dim == 5) { - kd = mluOpGetTensordimD(filters_desc); - } - } else { - if (filters_desc->dim == 5) { - kd = filters_desc->dims[0]; - } - int _dim = filters_desc->dim; - kh = filters_desc->dims[_dim - 4]; - kw = filters_desc->dims[_dim - 3]; - dxc = filters_desc->dims[_dim - 2]; - dyc = filters_desc->dims[_dim - 1]; - } - int K = kd * kh * kw; - - // check param - PARAM_CHECK(api, inverse == 0 || inverse == 1); - PARAM_CHECK(api, sub_m == 0 || sub_m == 1); - for (int kk = 0; kk < K; ++kk) { - PARAM_CHECK(api, indice_num[kk] >= 0); - } - if (inverse == 1) { - LOG(ERROR) << api << " Not support inverse == 1 yet."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - // check algorithm, relationship between params - if (K != indice_pairs_desc->dims[0]) { - LOG(ERROR) << api - << " The dims[0] of indice_pairs should be equal to the " - "multiple of kd, kh and kw."; - return MLUOP_STATUS_BAD_PARAM; - } - if (output_grad_desc->dims[1] != dyc) { - LOG(ERROR) << api - << " The dims[1] of output_grad should be equal to dyc of " - "filters tensor."; - return MLUOP_STATUS_BAD_PARAM; - } - if (input_grad_desc->dims[1] != dxc) { - LOG(ERROR) << api - << " The dims[1] of input_grad should be equal to dxc of " - "filters tensor."; - return MLUOP_STATUS_BAD_PARAM; - } - if (input_grad_desc->dims[0] != indice_pairs_desc->dims[2]) { - LOG(ERROR) << api - << " The dims[0] of input_grad should be equal to the dims[2] " - "of indice_pairs."; - return MLUOP_STATUS_BAD_PARAM; - } - int max_indice_num = getMaxNumInArray(indice_num, K); - - if (indice_pairs_desc->dims[2] < max_indice_num) { - VLOG(5) << "indice_pairs_desc->dims[2] " << indice_pairs_desc->dims[2] - << " max_indice_num " << max_indice_num; - LOG(ERROR) << api - << " The data in indice_num array should be smaller or equal to" - << " the dims[2] of indice_pairs."; - return MLUOP_STATUS_BAD_PARAM; - } - if (sub_m == 1) { - if (input_grad_desc->dims[0] != output_grad_desc->dims[0]) { - LOG(ERROR) << api - << " The dims[0] of input_grad should be equal to the dims[0]" - << " of output_grad when sub_m is 1."; - return MLUOP_STATUS_BAD_PARAM; - } - - if (indice_num[K / 2] < max_indice_num) { - LOG(ERROR) << api - << " The middle number of the indice_num array should be the " - << "maximum of the array when sub_m is 1. 
Now the maximum is " - << max_indice_num << " while the middle number of the array " - << "is " << indice_num[K / 2] << "."; - return MLUOP_STATUS_BAD_PARAM; - } - } - - if (output_grad_desc->dims[0] < max_indice_num) { - LOG(ERROR) - << api - << " The dims[0] of output_grad should be larger than or equal to the" - << " maximum number of indice_num."; - return MLUOP_STATUS_BAD_PARAM; - } - - if (sub_m == 1 && K % 2 == 0) { - LOG(ERROR) << api << " When sub_m value is 1, the filters dims (Kd, Kh & " - << "Kw) should be odd numbers."; - return MLUOP_STATUS_BAD_PARAM; - } - - PARAM_CHECK(api, output_grad_desc->dtype == input_grad_desc->dtype); - PARAM_CHECK(api, output_grad_desc->dtype == filters_desc->dtype); - - // check constraints: not support large tensor - uint64_t input_grad_count = mluOpGetTensorElementNum(input_grad_desc); - TENSOR_NUM_CHECK(api, input_grad_count, LARGE_TENSOR_NUM, - "input_grad tensor num is too large. "); - uint64_t output_grad_count = mluOpGetTensorElementNum(output_grad_desc); - TENSOR_NUM_CHECK(api, output_grad_count, LARGE_TENSOR_NUM, - "output_grad tensor num is too large. "); - uint64_t filter_count = mluOpGetTensorElementNum(filters_desc); - TENSOR_NUM_CHECK(api, filter_count, LARGE_TENSOR_NUM, - "filters tensor num is too large. "); - uint64_t indice_pairs_count = mluOpGetTensorElementNum(indice_pairs_desc); - TENSOR_NUM_CHECK(api, indice_pairs_count, LARGE_TENSOR_NUM, - "indice_pairs tensor num is too large. "); - - // check zero element - if (input_grad_count == 0) { - LOG(INFO) << "input_grad is a zero-element tensor."; - *is_zero = true; - return MLUOP_STATUS_SUCCESS; - } - if (output_grad_count == 0) { - LOG(INFO) << "output_grad is a zero-element tensor."; - *is_zero = true; - return MLUOP_STATUS_SUCCESS; - } - if (filter_count == 0) { - LOG(INFO) << "filters is a zero-element tensor."; - *is_zero = true; - return MLUOP_STATUS_SUCCESS; - } - if (indice_pairs_count == 0) { - LOG(INFO) << "indice_pairs is a zero-element tensor."; - *is_zero = true; - return MLUOP_STATUS_SUCCESS; - } - return MLUOP_STATUS_SUCCESS; -} - -static void getPermuteArray(const mluOpTensorLayout_t filter_layout, - int *permute) { - // transpose to (D)HWCN, (kd-)kh-kw-dxc-dyc - switch (filter_layout) { - case MLUOP_LAYOUT_NHWC: { - permute[0] = 1; - permute[1] = 2; - permute[2] = 3; - permute[3] = 0; - }; break; - case MLUOP_LAYOUT_NCHW: { - permute[0] = 2; - permute[1] = 3; - permute[2] = 1; - permute[3] = 0; - }; break; - case MLUOP_LAYOUT_NDHWC: { - permute[0] = 1; - permute[1] = 2; - permute[2] = 3; - permute[3] = 4; - permute[4] = 0; - }; break; - case MLUOP_LAYOUT_NCDHW: { - permute[0] = 2; - permute[1] = 3; - permute[2] = 4; - permute[3] = 1; - permute[4] = 0; - }; break; - case MLUOP_LAYOUT_HWCN: - default: - break; - } -} - -static mluOpStatus_t foolCheck( - mluOpHandle_t handle, const mluOpTensorDescriptor_t output_grad_desc, - const void *output_grad, const mluOpTensorDescriptor_t filters_desc, - const void *filters, const mluOpTensorDescriptor_t indice_pairs_desc, - const void *indice_pairs, const int64_t indice_num[], const int64_t inverse, - const int64_t sub_m, void *workspace, size_t workspace_size, - const mluOpTensorDescriptor_t input_grad_desc, void *input_grad, - bool *is_zero) { - std::string api = "[mluOpIndiceConvolutionBackwardData]"; - mluOpStatus_t ret = - foolCheckNoPtr(handle, output_grad_desc, filters_desc, indice_pairs_desc, - indice_num, inverse, sub_m, input_grad_desc, is_zero); - if (ret != MLUOP_STATUS_SUCCESS) { - return ret; - } - if (*is_zero) 
{ - return MLUOP_STATUS_SUCCESS; - } - - // check workspace & other space - PARAM_CHECK(api, output_grad != NULL); - PARAM_CHECK(api, filters != NULL); - PARAM_CHECK(api, indice_pairs != NULL); - PARAM_CHECK(api, input_grad != NULL); - if (workspace_size > 0) { - PARAM_CHECK(api, workspace != NULL); - } - return MLUOP_STATUS_SUCCESS; -} - -static void spconvbpdataGencase( - mluOpHandle_t handle, const mluOpTensorDescriptor_t output_grad_desc, - const void *output_grad, const mluOpTensorDescriptor_t filters_desc, - const void *filters, const mluOpTensorDescriptor_t indice_pairs_desc, - const void *indice_pairs, const int64_t indice_num[], const int64_t inverse, - const int64_t sub_m, void *workspace, size_t workspace_size, - const mluOpTensorDescriptor_t input_grad_desc, void *input_grad) { - GEN_CASE_START("indice_convolution_backward_data"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA_REAL(true, "output_grad", output_grad, output_grad_desc); - GEN_CASE_DATA_REAL(true, "filters", filters, filters_desc); - GEN_CASE_DATA_REAL(true, "indice_pairs_desc", indice_pairs, - indice_pairs_desc); - GEN_CASE_DATA_REAL(false, "input_grad", input_grad, input_grad_desc); - GEN_CASE_OP_PARAM_SINGLE(0, "indice_convolution_backward_data", "inverse", - inverse); - GEN_CASE_OP_PARAM_SINGLE(1, "indice_convolution_backward_data", "sub_m", - sub_m); - GEN_CASE_OP_PARAM_ARRAY(1, "indice_convolution_backward_data", "indice_num", - indice_num, indice_pairs_desc->dims[0]); - GEN_CASE_HANDLE_PARAM(); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); -} - -/* - * [output_grad] [filters] - * | | - * | cnnlGatherNd() | cnnlTranspose_v2() - * | | - * V V - * [output_grad_condence] [filter_transpose] - * |_______________________| - * | - * | cnnlMatMul_v2() - * | - * V - * [input_grad_condence] - * | - * | cnnlScatterNd_v2(CNNL_SCATTERND_UPDATE) - * | - * V - * [workspace_input_grad_tmp] - * | - * | cnnlAddN_v2() - * | - * V - * [input_grad] - */ -mluOpStatus_t MLUOP_WIN_API mluOpGetIndiceConvolutionBackwardDataWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t output_grad_desc, - const mluOpTensorDescriptor_t filters_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, - const mluOpTensorDescriptor_t input_grad_desc, const int64_t indice_num[], - const int64_t inverse, size_t *workspace_size) { - bool is_zero_element = false; - if (workspace_size == NULL) { - LOG(ERROR) << "[mluOpGetIndiceConvolutionBackwardDataWorkspaceSize] " - << "The pointer workspace_size should not be nullptr."; - return MLUOP_STATUS_BAD_PARAM; - } - mluOpStatus_t ret = - foolCheckNoPtr(handle, output_grad_desc, filters_desc, indice_pairs_desc, - indice_num, inverse, 0, input_grad_desc, &is_zero_element); - if (ret != MLUOP_STATUS_SUCCESS) { - return ret; - } - if (is_zero_element) { - return MLUOP_STATUS_SUCCESS; - } - - int kd = 1, kh = 1, kw = 1, dyc = 1, dxc = 1; - if (filters_desc->layout != MLUOP_LAYOUT_ARRAY) { - kh = mluOpGetTensordimH(filters_desc); - kw = mluOpGetTensordimW(filters_desc); - dyc = mluOpGetTensordimN(filters_desc); - dxc = mluOpGetTensordimC(filters_desc); - if (filters_desc->dim == 5) { - kd = mluOpGetTensordimD(filters_desc); - } - } else { - if (filters_desc->dim == 5) { - kd = filters_desc->dims[0]; - } - int _dim = filters_desc->dim; - kh = filters_desc->dims[_dim - 4]; - kw = filters_desc->dims[_dim - 3]; - dxc = filters_desc->dims[_dim - 2]; - dyc = filters_desc->dims[_dim - 1]; - } - int K = kd * kh * kw; - int max_indice_num = getMaxNumInArray(indice_num, K); - uint64_t 
filter_transpose_size = 0; - uint64_t transpose_workspace_size = 0; - uint64_t output_grad_condence_size = 0; - uint64_t input_grad_condence_size = 0; - uint64_t matmul_workspace_size = 0; - if (!(filters_desc->layout == MLUOP_LAYOUT_HWCN || - filters_desc->layout == MLUOP_LAYOUT_ARRAY)) { - filter_transpose_size = mluOpGetTensorElementNum(filters_desc) * - mluOpDataTypeBytes(filters_desc->dtype); - // get cnnlTranspose_v2 workspace workspace_size - size_t transpose_workspace_size_ = 0; - cnnlTransposeDescriptor_t trans_desc; - CALL_CNNL(cnnlCreateTransposeDescriptor(&trans_desc)); - int permute[5] = {0, 1, 2, 3, 4}; - getPermuteArray(filters_desc->layout, permute); - CALL_CNNL( - cnnlSetTransposeDescriptor(trans_desc, filters_desc->dim, permute)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filters_desc, cnnl_x_desc); - CALL_CNNL(cnnlGetTransposeWorkspaceSize( - cnnl_handle, cnnl_x_desc, trans_desc, &transpose_workspace_size_)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - CALL_CNNL(cnnlDestroyTransposeDescriptor(trans_desc)); - transpose_workspace_size = (uint64_t)transpose_workspace_size_; - } - output_grad_condence_size = max_indice_num * output_grad_desc->dims[1] * - mluOpDataTypeBytes(filters_desc->dtype); - input_grad_condence_size = max_indice_num * input_grad_desc->dims[1] * - mluOpDataTypeBytes(filters_desc->dtype); - - // matmul workspace - { - mluOpTensorDescriptor_t sub_filters_desc; - mluOpTensorDescriptor_t output_grad_condence_desc; - mluOpTensorDescriptor_t input_grad_condence_desc; - - cnnlMatMulDescriptor_t cnnl_matmul_desc; - cnnlMatMulHeuristicResult_t cnnl_heuristic_result; - cnnlMatMulAlgo_t cnnl_matmul_algo; - - MLUOP_CHECK(mluOpCreateTensorDescriptor(&sub_filters_desc)); - // MLUOP_CHECK(mluOpCreateTensorDescriptor(&sub_filters_desc)); - int sub_filter_dims[2] = {(int)(dxc), (int)(dyc)}; - MLUOP_CHECK(mluOpSetTensorDescriptor(sub_filters_desc, MLUOP_LAYOUT_ARRAY, - filters_desc->dtype, 2, - sub_filter_dims)); - int is_trans_a = 0, is_trans_b = 1; - int tf32_flag_int = 0; - CALL_CNNL(cnnlMatMulDescCreate(&cnnl_matmul_desc)); - CALL_CNNL(cnnlSetMatMulDescAttr(cnnl_matmul_desc, CNNL_MATMUL_DESC_TRANSA, - &(is_trans_a), sizeof(is_trans_a))); - CALL_CNNL(cnnlSetMatMulDescAttr(cnnl_matmul_desc, CNNL_MATMUL_DESC_TRANSB, - &(is_trans_b), sizeof(is_trans_b))); - CALL_CNNL(cnnlSetMatMulDescAttr(cnnl_matmul_desc, CNNL_MATMUL_ALLOW_TF32, - &(tf32_flag_int), sizeof(tf32_flag_int))); - MLUOP_CHECK(mluOpCreateTensorDescriptor(&output_grad_condence_desc)); - int output_grad_condence_dims[2] = {(int)(max_indice_num), (int)(dyc)}; - MLUOP_CHECK(mluOpSetTensorDescriptor( - output_grad_condence_desc, MLUOP_LAYOUT_ARRAY, output_grad_desc->dtype, - 2, output_grad_condence_dims)); - MLUOP_CHECK(mluOpCreateTensorDescriptor(&input_grad_condence_desc)); - int input_grad_condence_dims[2] = {(int)(max_indice_num), (int)(dxc)}; - MLUOP_CHECK(mluOpSetTensorDescriptor( - input_grad_condence_desc, MLUOP_LAYOUT_ARRAY, input_grad_desc->dtype, 2, - input_grad_condence_dims)); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(sub_filters_desc, - cnnl_sub_filters_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR( - output_grad_condence_desc, cnnl_output_grad_condence_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_condence_desc, - cnnl_input_grad_condence_desc); - - 
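[Editor's note] Before the heuristic query below, it helps to see the workspace accounting this function is building: the size it returns is a plain sum of sub-buffers that are later carved out of one allocation. A hedged sketch follows; the struct and helper names are invented for illustration, but each field corresponds to a quantity computed in this function:

```cpp
// Sketch of the workspace accounting in
// mluOpGetIndiceConvolutionBackwardDataWorkspaceSize (names illustrative).
#include <cstddef>
#include <cstdint>

struct BackwardDataWorkspace {
  size_t filter_transpose;      // transposed copy of filters (layout != HWCN)
  size_t transpose_scratch;     // cnnlTranspose_v2 scratch
  size_t output_grad_condence;  // max(indice_num[]) rows of dyc channels
  size_t input_grad_condence;   // max(indice_num[]) rows of dxc channels
  size_t matmul_scratch;        // from cnnlGetMatMulHeuristicResult
  size_t input_grad_tmp;        // full-size temporary for scatter + AddN
  size_t addn_scratch;          // from cnnlGetAddNWorkspaceSize
};

// The total mirrors the sum assigned to *workspace_size further below.
inline size_t totalWorkspace(const BackwardDataWorkspace &ws) {
  return ws.filter_transpose + ws.transpose_scratch +
         ws.output_grad_condence + ws.input_grad_condence +
         ws.matmul_scratch + ws.input_grad_tmp + ws.addn_scratch;
}

// e.g. output_grad_condence = max_indice_num * dyc * dtype_bytes
inline size_t condenceBytes(int64_t max_indice_num, int64_t channels,
                            size_t dtype_bytes) {
  return static_cast<size_t>(max_indice_num * channels) * dtype_bytes;
}
```

Carving sub-buffers out of one user-provided allocation (rather than allocating per step) is what lets the compute entry point stay allocation-free.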
CALL_CNNL(cnnlCreateMatMulHeuristicResult(&cnnl_heuristic_result)); - CALL_CNNL(cnnlMatMulAlgoCreate(&cnnl_matmul_algo)); - - // set matmul heuristic_result & algorithm - int requested_algo_count = 1, return_algo_count = 0; - CALL_CNNL(cnnlGetMatMulAlgoHeuristic( - cnnl_handle, cnnl_matmul_desc, cnnl_output_grad_condence_desc, - cnnl_sub_filters_desc, cnnl_input_grad_condence_desc, - cnnl_input_grad_condence_desc, NULL, requested_algo_count, - &cnnl_heuristic_result, &return_algo_count)); - - // query matmul workspace size - size_t workspace_size_matmul = 0; - float alpha_gemm = 1.0f, beta_gemm = 0.0f; - CALL_CNNL(cnnlGetMatMulHeuristicResult( - cnnl_heuristic_result, cnnl_matmul_algo, &workspace_size_matmul)); - - // destroy descriptors - CALL_CNNL(cnnlDestroyMatMulHeuristicResult(cnnl_heuristic_result)); - CALL_CNNL(cnnlMatMulDescDestroy(cnnl_matmul_desc)); - CALL_CNNL(cnnlMatMulAlgoDestroy(cnnl_matmul_algo)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_grad_condence_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_sub_filters_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_grad_condence_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - - MLUOP_CHECK(mluOpDestroyTensorDescriptor(output_grad_condence_desc)); - MLUOP_CHECK(mluOpDestroyTensorDescriptor(sub_filters_desc)); - MLUOP_CHECK(mluOpDestroyTensorDescriptor(input_grad_condence_desc)); - matmul_workspace_size = (uint64_t)workspace_size_matmul; - } - // workspace for the scatter destination (input_grad_tmp) - uint64_t input_grad_tmp_workspace_size = - mluOpGetTensorElementNum(input_grad_desc) * - mluOpDataTypeBytes(input_grad_desc->dtype); - - // addn workspace - uint32_t addn_num = 2; - size_t addn_workspace_size = 0; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - cnnlTensorDescriptor_t *cnnl_input_descs = (cnnlTensorDescriptor_t *)malloc( - sizeof(cnnlTensorDescriptor_t) * addn_num); - for (int i = 0; i < addn_num; i++) { - CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_desc, - cnnl_input_descs[i]); - } - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_desc, - cnnl_output_desc); - CHECK_FUNC_RETURN( - cnnlGetAddNWorkspaceSize(cnnl_handle, cnnl_input_descs, addn_num, - cnnl_output_desc, &addn_workspace_size), - CNNL_STATUS_SUCCESS, - "[cnnlAddN_v2] Internal error occurred in cnnlGetAddNWorkspaceSize.", - MLUOP_STATUS_INTERNAL_ERROR); - for (int i = 0; i < addn_num; i++) { - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_descs[i]); - } - free(cnnl_input_descs); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - *workspace_size = - (size_t)(filter_transpose_size + transpose_workspace_size + - output_grad_condence_size + input_grad_condence_size + - matmul_workspace_size + input_grad_tmp_workspace_size + - addn_workspace_size); - VLOG(5) << "[mluOpIndiceConvolutionBackwardData] filter_transpose_size=" - << filter_transpose_size - << ", transpose_workspace_size=" << transpose_workspace_size - << ", output_grad_condence_size=" << output_grad_condence_size - << ", input_grad_condence_size=" << input_grad_condence_size - << ", matmul_workspace_size=" << matmul_workspace_size - << ", input_grad_tmp_workspace_size=" << input_grad_tmp_workspace_size - << ", addn_workspace_size=" << addn_workspace_size; - VLOG(5) << "[mluOpIndiceConvolutionBackwardData] total workspace size: " - << *workspace_size; - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpIndiceConvolutionBackwardData( - mluOpHandle_t handle, const mluOpTensorDescriptor_t output_grad_desc, - const void *output_grad, 
const mluOpTensorDescriptor_t filters_desc, - const void *filters, const mluOpTensorDescriptor_t indice_pairs_desc, - const void *indice_pairs, const int64_t indice_num[], const int64_t inverse, - const int64_t sub_m, void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t input_grad_desc, void *input_grad) { - // fool check - { - bool is_zero_element = false; - mluOpStatus_t ret = foolCheck( - handle, output_grad_desc, output_grad, filters_desc, filters, - indice_pairs_desc, indice_pairs, indice_num, inverse, sub_m, workspace, - workspace_size, input_grad_desc, input_grad, &is_zero_element); - if (ret != MLUOP_STATUS_SUCCESS) { - return ret; - } - if (is_zero_element) { - return MLUOP_STATUS_SUCCESS; - } - } - - // gen_case - if (MLUOP_GEN_CASE_ON_NEW) { - spconvbpdataGencase(handle, output_grad_desc, output_grad, filters_desc, - filters, indice_pairs_desc, indice_pairs, indice_num, - inverse, sub_m, workspace, workspace_size, - input_grad_desc, input_grad); - } - - // get filters params - int kd = 1, kh = 1, kw = 1, dyc = 1, dxc = 1; - if (filters_desc->layout != MLUOP_LAYOUT_ARRAY) { - kh = mluOpGetTensordimH(filters_desc); - kw = mluOpGetTensordimW(filters_desc); - dyc = mluOpGetTensordimN(filters_desc); - dxc = mluOpGetTensordimC(filters_desc); - if (filters_desc->dim == 5) { - kd = mluOpGetTensordimD(filters_desc); - } - } else { - if (filters_desc->dim == 5) { - kd = filters_desc->dims[0]; - } - int _dim = filters_desc->dim; - kh = filters_desc->dims[_dim - 4]; - kw = filters_desc->dims[_dim - 3]; - dxc = filters_desc->dims[_dim - 2]; - dyc = filters_desc->dims[_dim - 1]; - } - int K = kd * kh * kw; - int cal_dwidth = mluOpDataTypeBytes(filters_desc->dtype); - uint64_t filter_transpose_size = 0, output_grad_condence_size = 0, - input_grad_condence_size = 0; - if (!(filters_desc->layout == MLUOP_LAYOUT_HWCN)) { - filter_transpose_size = mluOpGetTensorElementNum(filters_desc) * cal_dwidth; - VLOG(5) << "host invoke: filter_transpose_size " << filter_transpose_size; - } - output_grad_condence_size = - getMaxNumInArray(indice_num, K) * output_grad_desc->dims[1] * cal_dwidth; - input_grad_condence_size = - getMaxNumInArray(indice_num, K) * input_grad_desc->dims[1] * cal_dwidth; - char *filter_transpose = (char *)filters; - char *workspace_base = (char *)workspace; - - // transpose filters to layout XHWCN - mluOpTensorDescriptor_t filter_transpose_desc; - if (filters_desc->layout != MLUOP_LAYOUT_HWCN && - filters_desc->layout != MLUOP_LAYOUT_ARRAY) { - filter_transpose = (char *)workspace; - workspace_base += filter_transpose_size; - cnnlTransposeDescriptor_t trans_desc; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&filter_transpose_desc)); - CALL_CNNL(cnnlCreateTransposeDescriptor(&trans_desc)); - int permute[5] = {0, 1, 2, 3, 4}; - int filter_transpose_dims[5]; - getPermuteArray(filters_desc->layout, permute); - for (int i = 0; i < filters_desc->dim; ++i) { - filter_transpose_dims[i] = filters_desc->dims[permute[i]]; - VLOG(5) << "permute " << permute[i]; - } - MLUOP_CHECK(mluOpSetTensorDescriptor( - filter_transpose_desc, MLUOP_LAYOUT_ARRAY, filters_desc->dtype, - filters_desc->dim, filter_transpose_dims)); - CALL_CNNL( - cnnlSetTransposeDescriptor(trans_desc, filters_desc->dim, permute)); - size_t transpose_workspace_size = 0; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filters_desc, cnnl_x_desc); - CALL_CNNL(cnnlGetTransposeWorkspaceSize( - cnnl_handle, cnnl_x_desc, trans_desc, 
&transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - char *transpose_workspace = workspace_base; - workspace_base += transpose_workspace_size; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filters_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_transpose_desc, - cnnl_y_desc); - CALL_CNNL(cnnlTranspose_v2( - cnnl_handle, trans_desc, cnnl_x_desc, filters, cnnl_y_desc, - filter_transpose, transpose_workspace, transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_y_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - CALL_CNNL(cnnlDestroyTransposeDescriptor(trans_desc)); - MLUOP_CHECK(mluOpDestroyTensorDescriptor(filter_transpose_desc)); - } else { - filter_transpose_desc = filters_desc; - } - char *output_grad_condence = workspace_base; - workspace_base += output_grad_condence_size; - char *input_grad_condence = workspace_base; - workspace_base += input_grad_condence_size; - - // filters calculate desc - mluOpTensorDescriptor_t sub_filters_desc; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&sub_filters_desc)); - int sub_filter_dims[2] = {(int)(dxc), (int)(dyc)}; - MLUOP_CHECK(mluOpSetTensorDescriptor(sub_filters_desc, MLUOP_LAYOUT_ARRAY, - filters_desc->dtype, 2, - sub_filter_dims)); - float fill_value = 0; - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, input_grad)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - - void *workspace_matmul = NULL; - char *workspace_input_grad_tmp = NULL; - char *workspace_addn = NULL; - - // filters DHW dim loop - int kk_count = 0; - for (size_t kk = 0; kk < K; ++kk) { - VLOG(5) << "indice_num " << indice_num[kk]; - if (indice_num[kk] == 0) { - continue; - } - const int int_dwidth = 4; - char *sub_filter = filter_transpose + kk * dyc * dxc * cal_dwidth; - - // gather output_grad - mluOpTensorDescriptor_t gather_indices_desc; - mluOpTensorDescriptor_t output_grad_condence_desc; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&gather_indices_desc)); - int gather_indices_dims[2] = {(int)(indice_num[kk]), (int)(1)}; - MLUOP_CHECK(mluOpSetTensorDescriptor(gather_indices_desc, - MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_INT32, - 2, gather_indices_dims)); - MLUOP_CHECK(mluOpCreateTensorDescriptor(&output_grad_condence_desc)); - int output_grad_condence_dims[2] = {(int)(indice_num[kk]), (int)(dyc)}; - MLUOP_CHECK(mluOpSetTensorDescriptor( - output_grad_condence_desc, MLUOP_LAYOUT_ARRAY, output_grad_desc->dtype, - 2, output_grad_condence_dims)); - uint64_t gather_indices_offset = - (kk * 2 + 1) * int(indice_pairs_desc->dims[2]) * int_dwidth; - char *gather_indices = - (char *)(const_cast(indice_pairs)) + gather_indices_offset; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_grad_desc, - cnnl_params_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(gather_indices_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_grad_condence_desc, - cnnl_output_desc); - CALL_CNNL(cnnlGatherNd(cnnl_handle, cnnl_params_desc, output_grad, - cnnl_indices_desc, gather_indices, - cnnl_output_desc, output_grad_condence)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_params_desc); - 
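[Editor's note] The gather above is the first arrow of the flow chart earlier in this file: gather the active output_grad rows for filter offset kk, multiply by the kk-th sub-filter (transposed), and scatter-accumulate into input_grad. A plain-C++ reference of the net effect of one kk iteration, assuming row-major std::vector buffers — a readability aid only, not the MLU code path:

```cpp
// CPU reference for one filter offset kk of indice conv backward-data.
// in_idx/out_idx stand for indice_pairs[kk][0][:] and indice_pairs[kk][1][:].
#include <vector>

void backwardDataOneOffset(
    const std::vector<std::vector<float>> &output_grad,  // [N_out][dyc]
    const std::vector<std::vector<float>> &sub_filter,   // [dxc][dyc]
    const std::vector<int> &in_idx,                      // n valid pairs
    const std::vector<int> &out_idx,
    std::vector<std::vector<float>> &input_grad) {       // [N_in][dxc]
  const size_t n = in_idx.size();
  const size_t dxc = sub_filter.size();
  const size_t dyc = sub_filter.empty() ? 0 : sub_filter[0].size();
  for (size_t i = 0; i < n; ++i) {
    const std::vector<float> &gy = output_grad[out_idx[i]];  // gather
    for (size_t c = 0; c < dxc; ++c) {
      float acc = 0.f;  // matmul with trans_b = 1: [n,dyc] x [dxc,dyc]^T
      for (size_t j = 0; j < dyc; ++j) acc += gy[j] * sub_filter[c][j];
      // Net effect of scatter-into-temp + AddN: accumulate across offsets.
      input_grad[in_idx[i]][c] += acc;
    }
  }
}
```

Within one offset each input row appears at most once, so plain `+=` here reproduces the scatter-then-add sequence the real code performs.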
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - // matmul - cnnlMatMulDescriptor_t matmul_desc; - int is_trans_a = 0, is_trans_b = 1; - int tf32_flag_int = 0; - CALL_CNNL(cnnlMatMulDescCreate(&matmul_desc)); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_TRANSA, - &(is_trans_a), sizeof(is_trans_a))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_TRANSB, - &(is_trans_b), sizeof(is_trans_b))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_ALLOW_TF32, - &(tf32_flag_int), sizeof(tf32_flag_int))); - mluOpTensorDescriptor_t input_grad_condence_desc; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&input_grad_condence_desc)); - int input_grad_condence_dims[2] = {(int)(indice_num[kk]), (int)(dxc)}; - MLUOP_CHECK(mluOpSetTensorDescriptor( - input_grad_condence_desc, MLUOP_LAYOUT_ARRAY, input_grad_desc->dtype, 2, - input_grad_condence_dims)); - cnnlMatMulHeuristicResult_t heuristic_result; - CALL_CNNL(cnnlCreateMatMulHeuristicResult(&heuristic_result)); - cnnlMatMulAlgo_t matmul_algo; - CALL_CNNL(cnnlMatMulAlgoCreate(&matmul_algo)); - - // set matmul heuristic_result & algorithm - int requested_algo_count = 1, return_algo_count = 0; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_grad_condence_desc, - cnnl_a_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(sub_filters_desc, - cnnl_b_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_condence_desc, - cnnl_c_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_condence_desc, - cnnl_d_desc); - CALL_CNNL(cnnlGetMatMulAlgoHeuristic( - cnnl_handle, matmul_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, - cnnl_d_desc, NULL, requested_algo_count, &heuristic_result, - &return_algo_count)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_c_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_d_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - // launch matmul - size_t workspace_size_matmul = 0; - float alpha_gemm = 1.0f, beta_gemm = 0.0f; - CALL_CNNL(cnnlGetMatMulHeuristicResult(heuristic_result, matmul_algo, - &workspace_size_matmul)); - if (kk_count == 0) { - workspace_matmul = workspace_size_matmul == 0 - ? 
NULL - : reinterpret_cast(workspace_base); - workspace_base += workspace_size_matmul; - } - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_grad_condence_desc, - cnnl_a_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(sub_filters_desc, - cnnl_b_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_condence_desc, - cnnl_c_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_condence_desc, - cnnl_d_desc); - CALL_CNNL(cnnlMatMul_v2( - cnnl_handle, matmul_desc, matmul_algo, &alpha_gemm, cnnl_a_desc, - output_grad_condence, cnnl_b_desc, sub_filter, &beta_gemm, - cnnl_c_desc, input_grad_condence, workspace_matmul, - workspace_size_matmul, cnnl_d_desc, input_grad_condence)); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_c_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_d_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - // destroy descriptors - CALL_CNNL(cnnlDestroyMatMulHeuristicResult(heuristic_result)); - CALL_CNNL(cnnlMatMulDescDestroy(matmul_desc)); - CALL_CNNL(cnnlMatMulAlgoDestroy(matmul_algo)); - - // fill workspace_input_grad_tmp - uint64_t input_grad_tmp_workspace_size = - mluOpGetTensorElementNum(input_grad_desc) * - mluOpDataTypeBytes(input_grad_desc->dtype); - if (kk_count == 0) { - workspace_input_grad_tmp = workspace_base; - workspace_base += input_grad_tmp_workspace_size; - } - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, workspace_input_grad_tmp)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - - // scatter input_grad - uint64_t scatter_indices_offset = - (kk * 2) * int(indice_pairs_desc->dims[2]) * int_dwidth; - char *scatter_indices = - (char *)(const_cast(indice_pairs)) + scatter_indices_offset; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(gather_indices_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_condence_desc, - cnnl_updates_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_desc, - cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_desc, - cnnl_output_desc); - - CALL_CNNL(cnnlScatterNd_v2(cnnl_handle, CNNL_SCATTERND_UPDATE, - cnnl_indices_desc, scatter_indices, - cnnl_updates_desc, input_grad_condence, - cnnl_input_desc, workspace_input_grad_tmp, - cnnl_output_desc, workspace_input_grad_tmp)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_updates_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - // add workspace_input_grad_tmp tensor back to input_grad - if (kk_count == 0) { - workspace_addn = workspace_base; - } - void *addn_array[2] = {reinterpret_cast(workspace_input_grad_tmp), - input_grad}; - size_t addn_workspace_size = 0; - uint32_t addn_num = 2; - - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - cnnlTensorDescriptor_t *cnnl_input_descs = - (cnnlTensorDescriptor_t *)malloc(sizeof(cnnlTensorDescriptor_t) * - addn_num); - for (int i = 0; i < addn_num; i++) { - CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_desc, - cnnl_input_descs[i]); - } - 
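[Editor's note] The AddN call below is what makes the per-offset results accumulate: CNNL_SCATTERND_UPDATE overwrites the indexed rows rather than adding to them, so each iteration scatters into a zero-filled temporary and then adds that temporary onto input_grad. A 1-D scalar sketch of the equivalence being relied on (hypothetical helper, for illustration only):

```cpp
// Why scatter into a zeroed temp and then add, instead of scattering
// straight into input_grad: UPDATE semantics would overwrite any row a
// previous filter offset already wrote.
#include <vector>

void scatterThenAdd(const std::vector<int> &idx,
                    const std::vector<float> &updates,
                    std::vector<float> &input_grad) {
  std::vector<float> tmp(input_grad.size(), 0.f);  // cnnlFill_v3 with 0
  for (size_t i = 0; i < idx.size(); ++i)
    tmp[idx[i]] = updates[i];                      // SCATTERND_UPDATE
  for (size_t i = 0; i < input_grad.size(); ++i)
    input_grad[i] += tmp[i];                       // cnnlAddN_v2 step
}
```

Zero-filling the temporary matters: rows untouched by this offset must contribute nothing to the AddN.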
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_grad_desc, - cnnl_output_desc); - CHECK_FUNC_RETURN( - cnnlGetAddNWorkspaceSize(cnnl_handle, cnnl_input_descs, addn_num, - cnnl_output_desc, &addn_workspace_size), - CNNL_STATUS_SUCCESS, - "[cnnlAddN_v2] Internal error occurred in cnnlGetAddNWorkspaceSize.", - MLUOP_STATUS_INTERNAL_ERROR); - - CALL_CNNL(cnnlAddN_v2(cnnl_handle, cnnl_input_descs, addn_array, addn_num, - cnnl_output_desc, input_grad, workspace_addn, - addn_workspace_size)); - for (int i = 0; i < addn_num; i++) { - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_descs[i]); - } - free(cnnl_input_descs); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - MLUOP_CHECK(mluOpDestroyTensorDescriptor(input_grad_condence_desc)); - MLUOP_CHECK(mluOpDestroyTensorDescriptor(gather_indices_desc)); - MLUOP_CHECK(mluOpDestroyTensorDescriptor(output_grad_condence_desc)); - kk_count++; - } - MLUOP_CHECK(mluOpDestroyTensorDescriptor(sub_filters_desc)); - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/indice_convolution_backward_data/indice_convolution_backward_data.h b/kernels/indice_convolution_backward_data/indice_convolution_backward_data.h deleted file mode 100644 index e8be23270..000000000 --- a/kernels/indice_convolution_backward_data/indice_convolution_backward_data.h +++ /dev/null @@ -1,36 +0,0 @@ -/******************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *******************************************************************************/ -#ifndef KERNELS_INDICE_CONVOLUTION_BACKWARD_DATA_INDICE_CONVOLUTION_BACKWARD_DATA_H_ // NOLINT -#define KERNELS_INDICE_CONVOLUTION_BACKWARD_DATA_INDICE_CONVOLUTION_BACKWARD_DATA_H_ // NOLINT - -#include "core/tensor.h" - -inline int getMaxNumInArray(const int64_t arr[], const int num) { - int max_num = (int)(arr[0]); - for (int i = 1; i < num; ++i) { - max_num = max_num > (int)(arr[i]) ? 
max_num : (int)(arr[i]); - } - return max_num; -} - -#endif // KERNELS_INDICE_CONVOLUTION_BACKWARD_DATA_INDICE_CONVOLUTION_BACKWARD_DATA_H_ // NOLINT diff --git a/kernels/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp b/kernels/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp deleted file mode 100644 index ee33eb38d..000000000 --- a/kernels/indice_convolution_backward_filter/indice_convolution_backward_filter.cpp +++ /dev/null @@ -1,605 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include -#include - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/mlu_env.h" -#include "core/tensor.h" -#include "kernels/get_indice_pairs/get_indice_pairs_structs.h" -#include "kernels/utils/cnnl_helper.h" -#include "mlu_op.h" - -inline bool isFloatDtype(const mluOpDataType_t &dtype) { - return (dtype == MLUOP_DTYPE_HALF || dtype == MLUOP_DTYPE_FLOAT); -} - -inline mluOpDataType_t getOnchipDataType( - const mluOpTensorDescriptor_t tensor_desc) { - if (tensor_desc->onchip_dtype != MLUOP_DTYPE_INVALID) { - return tensor_desc->onchip_dtype; - } else { - return tensor_desc->dtype; - } -} - -inline mluOpStatus_t setMatmulDescInfo(const std::string api_name, - cnnlMatMulDescriptor_t matmul_desc, - const uint32_t is_trans_a_value, - const uint32_t is_trans_b_value, - const uint32_t compute_dtype, - const uint32_t allow_tf32) { - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_TRANSA, - &is_trans_a_value, sizeof(int32_t))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_TRANSB, - &is_trans_b_value, sizeof(int32_t))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_COMPUTE_TYPE, - &compute_dtype, sizeof(int32_t))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_ALLOW_TF32, - &(allow_tf32), sizeof(int32_t))); - return MLUOP_STATUS_SUCCESS; -} - -inline std::string getTensorShapeString(const mluOpTensorDescriptor_t desc) { - std::string res; - res.push_back('['); - for (int32_t i = 0; i < desc->dim - 1; i++) { - res.append(std::to_string(desc->dims[i]) + ','); - } - res.append(std::to_string(desc->dims[desc->dim - 1]) + ']'); - return res; -} - -static void indiceConvFilterGencase( - mluOpHandle_t handle, const 
mluOpTensorDescriptor_t features_desc, - const void *features, const mluOpTensorDescriptor_t output_grad_desc, - const void *output_grad, const mluOpTensorDescriptor_t indice_pairs_desc, - const void *indice_pairs, const int64_t indice_num[], const int64_t inverse, - const int64_t subm, void *workspace, size_t workspace_size, - const mluOpTensorDescriptor_t filters_grad_desc, void *filters_grad) { - GEN_CASE_START("indice_convolution_backward_filter"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA_REAL(true, "features", features, features_desc); - GEN_CASE_DATA_REAL(true, "output_grad", output_grad, output_grad_desc); - GEN_CASE_DATA_REAL(true, "indice_pairs_desc", indice_pairs, - indice_pairs_desc); - GEN_CASE_DATA_REAL(false, "diff_w", filters_grad, filters_grad_desc); - GEN_CASE_OP_PARAM_SINGLE(0, "indice_convolution_backward", "inverse", - inverse); - GEN_CASE_OP_PARAM_SINGLE(1, "indice_convolution_backward", "subm", subm); - GEN_CASE_OP_PARAM_ARRAY(1, "indice_convolution_backward", "indice_num", - indice_num, indice_pairs_desc->dims[0]); - GEN_CASE_HANDLE_PARAM(); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); -} - -// check input and diffy -static mluOpStatus_t indiceConvDtypeVaild( - const std::string api_name, const mluOpTensorDescriptor_t features_desc, - const mluOpTensorDescriptor_t output_grad_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, - const mluOpTensorDescriptor_t filters_grad_desc) { - auto input_dtype = features_desc->dtype; - auto diffy_dtype = output_grad_desc->dtype; - auto filters_grad_dtype = filters_grad_desc->dtype; - auto pairs_dtype = indice_pairs_desc->dtype; - if (pairs_dtype != MLUOP_DTYPE_INT32) { - LOG(ERROR) << api_name - << " indice_pairs_desc only supports data type int32. " - << "But now the data type is " - << mluOpGetNameOfDataType(pairs_dtype) << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - if (input_dtype != diffy_dtype || input_dtype != filters_grad_dtype || - !isFloatDtype(input_dtype) || !isFloatDtype(diffy_dtype) || - !isFloatDtype(filters_grad_dtype)) { - LOG(ERROR) - << api_name << " The data type of features_desc, output_grad_desc " - << "and filters_grad_desc should be the same and the three should " - << "be either half or float. But now the data types are " - << mluOpGetNameOfDataType(input_dtype) << "-" - << mluOpGetNameOfDataType(diffy_dtype) << "-" - << mluOpGetNameOfDataType(filters_grad_dtype) << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - auto input_on_dtype = features_desc->onchip_dtype; - auto diffy_on_dtype = output_grad_desc->onchip_dtype; - auto filters_grad_on_dtype = filters_grad_desc->onchip_dtype; - auto pairs_on_dtype = indice_pairs_desc->onchip_dtype; - if ((MLUOP_DTYPE_INVALID != input_on_dtype && - input_on_dtype != input_dtype) || - (MLUOP_DTYPE_INVALID != diffy_on_dtype && - diffy_on_dtype != diffy_dtype) || - (MLUOP_DTYPE_INVALID != pairs_on_dtype && - pairs_on_dtype != pairs_dtype)) { - LOG(ERROR) << api_name - << " For features_desc, output_grad_desc and indice_pairs_desc, " - << "there is no need to set the on-chip data type, and if so, " - << "it needs to be the same as their off-chip data type. 
" - << "But now two data types of features_desc are " - << mluOpGetNameOfDataType(input_dtype) << "-" - << mluOpGetNameOfDataType(input_on_dtype) - << ", output_grad_desc are " - << mluOpGetNameOfDataType(diffy_dtype) << "-" - << mluOpGetNameOfDataType(diffy_on_dtype) - << ", and indice_pairs_desc are " - << mluOpGetNameOfDataType(pairs_dtype) << "-" - << mluOpGetNameOfDataType(pairs_on_dtype) << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - if ((filters_grad_on_dtype != MLUOP_DTYPE_INVALID && - !isFloatDtype(filters_grad_on_dtype)) || - (filters_grad_dtype == MLUOP_DTYPE_FLOAT && - filters_grad_on_dtype == MLUOP_DTYPE_HALF)) { - LOG(ERROR) << api_name << " The on-chip data type of filters_grad_desc " - << "may not be set, if it is set, only half or float types are " - << "supported, and the bit width of on-chip data type can not " - << "be smaller than that of off-chip data type. But now two " - << "data types of filters_grad_desc are " - << mluOpGetNameOfDataType(filters_grad_dtype) << "-" - << mluOpGetNameOfDataType(filters_grad_on_dtype) << "."; - return MLUOP_STATUS_BAD_PARAM; - } - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t baseParamCheck( - const std::string api_name, mluOpHandle_t handle, - const mluOpTensorDescriptor_t features_desc, - const mluOpTensorDescriptor_t output_grad_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, - const mluOpTensorDescriptor_t filters_grad_desc, const int64_t indice_num[], - const int64_t inverse) { - PARAM_CHECK(api_name, handle != nullptr); - PARAM_CHECK(api_name, features_desc != nullptr); - PARAM_CHECK(api_name, output_grad_desc != nullptr); - PARAM_CHECK(api_name, indice_pairs_desc != nullptr); - PARAM_CHECK(api_name, filters_grad_desc != nullptr); - PARAM_CHECK(api_name, indice_num != nullptr); - PARAM_CHECK(api_name, inverse == 0); - - // check mlu platform - if (handle->arch < 372) { - LOG(ERROR) << api_name << " Only mlu300 and above devices are supported." - << " Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - - // check data type - auto dtype_check = - indiceConvDtypeVaild(api_name, features_desc, output_grad_desc, - indice_pairs_desc, filters_grad_desc); - if (MLUOP_STATUS_SUCCESS != dtype_check) { - return dtype_check; - } - - if (mluOpGetTensorElementNum(features_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(output_grad_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(indice_pairs_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(filters_grad_desc) >= LARGE_TENSOR_NUM) { - LOG(ERROR) << api_name << " Overflow max tensor num." - << " Currently, MLU-OPS supports tensor num smaller than 2^31."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - bool shape_check = true; - if (2 != features_desc->dim || 2 != output_grad_desc->dim || - 3 != indice_pairs_desc->dim || - (4 != filters_grad_desc->dim && 5 != filters_grad_desc->dim)) { - shape_check = false; // dimension check failed! - } - - // only DHWCN/HWCN layout of filter_grad is supported, currently - int32_t filter_dim_len = filters_grad_desc->dim; - auto ci = filters_grad_desc->dims[filter_dim_len - 2]; - auto co = filters_grad_desc->dims[filter_dim_len - 1]; - auto kd = filter_dim_len == 4 ? 1 : filters_grad_desc->dims[0]; - auto kh = filter_dim_len == 4 ? filters_grad_desc->dims[0] - : filters_grad_desc->dims[1]; - auto kw = filter_dim_len == 4 ? 
filters_grad_desc->dims[1] - : filters_grad_desc->dims[2]; - if (ci != features_desc->dims[1] || co != output_grad_desc->dims[1] || - features_desc->dims[0] != indice_pairs_desc->dims[2] || - 2 != indice_pairs_desc->dims[1] || - kd * kh * kw != indice_pairs_desc->dims[0]) { - shape_check = false; // interdependent dimension check failed! - } - PARAM_CHECK_LE(api_name, indice_pairs_desc->dims[2], - INDICE_IN_LARGE_TENSOR_NUM); - - if (!shape_check) { - LOG(ERROR) << api_name << " Shape check failed! " - << "Now the shapes are features_desc" - << getTensorShapeString(features_desc) << ", output_grad_desc" - << getTensorShapeString(output_grad_desc) - << ", indice_pairs_desc" - << getTensorShapeString(indice_pairs_desc) - << ", and filters_grad_desc" - << getTensorShapeString(filters_grad_desc) << "."; - return MLUOP_STATUS_BAD_PARAM; - } - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t insertTranspose( - const std::string api_name, mluOpHandle_t handle, - const mluOpTensorDescriptor_t filters_grad_desc, - const void *filters_grad_temp, void *filters_grad_buffer, void *workspace, - size_t *size, const bool is_get_workspace, const int32_t kernel_volume, - const int32_t ci, const int32_t co) { - int32_t trans_in_shape[3] = {kernel_volume, ci, co}; - int32_t trans_out_shape[3] = {co, kernel_volume, ci}; // NHWC or NDHWC - int32_t permute[3] = {2, 0, 1}; - if (MLUOP_LAYOUT_NCHW == filters_grad_desc->layout || - MLUOP_LAYOUT_NCDHW == filters_grad_desc->layout) { - trans_out_shape[0] = co; - trans_out_shape[1] = ci; - trans_out_shape[2] = kernel_volume; - permute[0] = 2; - permute[1] = 1; - permute[2] = 0; - } - - size_t transpose_workspace = 0; - mluOpTensorDescriptor_t trans_in_desc, trans_out_desc; - cnnlTransposeDescriptor_t trans_desc; - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&trans_in_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&trans_out_desc)); - CALL_CNNL(cnnlCreateTransposeDescriptor(&trans_desc)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - trans_in_desc, MLUOP_LAYOUT_ARRAY, - filters_grad_desc->dtype, 3, trans_in_shape)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - trans_out_desc, MLUOP_LAYOUT_ARRAY, - filters_grad_desc->dtype, 3, trans_out_shape)); - CALL_CNNL(cnnlSetTransposeDescriptor(trans_desc, 3, permute)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_in_desc, cnnl_x_desc); - CALL_CNNL(cnnlGetTransposeWorkspaceSize(cnnl_handle, cnnl_x_desc, - trans_desc, &transpose_workspace)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - if (is_get_workspace) { // is get workspace - *size = transpose_workspace; - } else { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_in_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_out_desc, cnnl_y_desc); - CALL_CNNL(cnnlTranspose_v2( - cnnl_handle, trans_desc, cnnl_x_desc, filters_grad_temp, cnnl_y_desc, - filters_grad_buffer, workspace, transpose_workspace)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_y_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(trans_in_desc)); - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(trans_out_desc)); - CALL_CNNL(cnnlDestroyTransposeDescriptor(trans_desc)); - return MLUOP_STATUS_SUCCESS; -} - -// called by getWorkspace and compute api -// workspace_size is not nullptr when 
it's from getWorkspace api. -static mluOpStatus_t internalIndiceConvBackwardFilter( - const std::string api_name, mluOpHandle_t handle, - const mluOpTensorDescriptor_t features_desc, const void *features, - const mluOpTensorDescriptor_t output_grad_desc, const void *output_grad, - const mluOpTensorDescriptor_t indice_pairs_desc, const void *indice_pairs, - const int64_t indice_num[], void *workspace, size_t *workspace_size, - const mluOpTensorDescriptor_t filters_grad_desc, void *filters_grad) { - bool is_get_workspace = workspace_size != nullptr ? true : false; - bool filters_grad_need_trans = false; - - // call gather_nd and matmul to finish indice conv. - int32_t kernel_volume = indice_pairs_desc->dims[0]; - int32_t ci = features_desc->dims[1]; - int32_t co = output_grad_desc->dims[1]; - int32_t max_active_num = 0; - for (int32_t i = 0; i < kernel_volume; ++i) { - max_active_num = - indice_num[i] > max_active_num ? indice_num[i] : max_active_num; - } - - int64_t max_input_size = - max_active_num * ci * mluop::getSizeOfDataType(features_desc->dtype); - int64_t max_diffy_size = - max_active_num * co * mluop::getSizeOfDataType(features_desc->dtype); - int64_t filters_grad_trans_size = - filters_grad_need_trans ? filters_grad_desc->total_tensor_size : 0; - - void *filters_grad_temp = filters_grad_need_trans ? workspace : filters_grad; - void *input_temp = (char *)workspace + filters_grad_trans_size; - void *diffy_temp = (char *)input_temp + max_input_size; - void *matmul_ws = (char *)diffy_temp + max_diffy_size; - - // create temp tensor for gather and matmul - mluOpTensorDescriptor_t active_indice_desc; - mluOpTensorDescriptor_t matmul_a_desc, matmul_b_desc, matmul_c_desc; - cnnlMatMulDescriptor_t matmul_desc; - cnnlMatMulAlgo_t matmul_algo; - cnnlMatMulHeuristicResult_t heuristic_result; - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&active_indice_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_a_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_b_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_c_desc)); - CALL_CNNL(cnnlMatMulDescCreate(&matmul_desc)); - CALL_CNNL(cnnlMatMulAlgoCreate(&matmul_algo)); - CALL_CNNL(cnnlCreateMatMulHeuristicResult(&heuristic_result)); - CHECK_RETURN( - api_name, - setMatmulDescInfo(api_name, matmul_desc, 1, 0, - (uint32_t)getOnchipDataType(filters_grad_desc), 0)); - int32_t requested_algo_count = 1, return_algo_count = 0; - float alpha = 1.0, beta = 0.0, fill_value = 0; - size_t matmul_ws_size = 0, temp_matmul_size = 0; - - // filters_grad fill for unused kernel - if (!is_get_workspace) { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filters_grad_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, filters_grad_temp)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - int64_t in_active_num = indice_pairs_desc->dims[2]; - int64_t cico_size = - ci * co * mluop::getSizeOfDataType(filters_grad_desc->dtype); - int64_t pair_low_size = - in_active_num * mluop::getSizeOfDataType(indice_pairs_desc->dtype); - - for (int32_t i = 0; i < kernel_volume; ++i) { - int32_t active_point_num = indice_num[i]; - if (active_point_num <= 0) { - continue; - } - - int32_t active_indices[2] = {active_point_num, 1}; - int32_t a_desc_dims[2] = {active_point_num, ci}; - int32_t b_desc_dims[2] = {active_point_num, co}; - int32_t c_desc_dims[2] = {ci, 
co}; - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - active_indice_desc, MLUOP_LAYOUT_ARRAY, - indice_pairs_desc->dtype, 2, active_indices)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - matmul_a_desc, MLUOP_LAYOUT_ARRAY, - features_desc->dtype, 2, a_desc_dims)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - matmul_b_desc, MLUOP_LAYOUT_ARRAY, - output_grad_desc->dtype, 2, b_desc_dims)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - matmul_c_desc, MLUOP_LAYOUT_ARRAY, - filters_grad_desc->dtype, 2, c_desc_dims)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, cnnl_a_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, cnnl_b_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, cnnl_c_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, cnnl_d_desc); - CALL_CNNL(cnnlGetMatMulAlgoHeuristic( - cnnl_handle, matmul_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, - cnnl_d_desc, nullptr, requested_algo_count, &heuristic_result, - &return_algo_count)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_c_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_d_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - CALL_CNNL(cnnlGetMatMulHeuristicResult(heuristic_result, matmul_algo, - &temp_matmul_size)); - - if (is_get_workspace) { - matmul_ws_size = - temp_matmul_size > matmul_ws_size ? temp_matmul_size : matmul_ws_size; - } else { - void *filters_grad_buffer = (char *)filters_grad_temp + i * cico_size; - void *gather_input_indice = (char *)indice_pairs + i * 2 * pair_low_size; - void *gather_output_grad = - (char *)indice_pairs + i * 2 * pair_low_size + pair_low_size; - // gather active input data [n, ci] - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_desc, - cnnl_params_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(active_indice_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, - cnnl_output_desc); - CALL_CNNL(cnnlGatherNd(cnnl_handle, cnnl_params_desc, features, - cnnl_indices_desc, gather_input_indice, - cnnl_output_desc, input_temp)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_params_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - // gather active diffy data [n, co] - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_grad_desc, - cnnl_params_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(active_indice_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, - cnnl_output_desc); - CALL_CNNL(cnnlGatherNd(cnnl_handle, cnnl_params_desc, output_grad, - cnnl_indices_desc, gather_output_grad, - cnnl_output_desc, diffy_temp)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_params_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - // get part filters_grad [ci, co] - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, - cnnl_a_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, - cnnl_b_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, - cnnl_c_desc); -
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, - cnnl_d_desc); - CALL_CNNL( - cnnlMatMul_v2(cnnl_handle, matmul_desc, matmul_algo, &alpha, - cnnl_a_desc, input_temp, cnnl_b_desc, diffy_temp, - &beta, cnnl_c_desc, filters_grad_buffer, matmul_ws, - temp_matmul_size, cnnl_d_desc, filters_grad_buffer)); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_c_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_d_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - } - } - - // trans temp filters_grad if needed - uint64_t trans_ws_size = 0; - if (filters_grad_need_trans) { - void *trans_ws = input_temp; // multiplexing of space - CHECK_RETURN( - api_name, - insertTranspose(api_name, handle, filters_grad_desc, filters_grad_temp, - filters_grad, trans_ws, &trans_ws_size, - is_get_workspace, kernel_volume, ci, co)); - } - - if (is_get_workspace) { - *workspace_size = filters_grad_trans_size + - std::max(trans_ws_size, max_input_size + max_diffy_size + - matmul_ws_size); - } - - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(active_indice_desc)); - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(matmul_a_desc)); - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(matmul_b_desc)); - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(matmul_c_desc)); - CALL_CNNL(cnnlMatMulDescDestroy(matmul_desc)); - CALL_CNNL(cnnlMatMulAlgoDestroy(matmul_algo)); - CALL_CNNL(cnnlDestroyMatMulHeuristicResult(heuristic_result)); - return MLUOP_STATUS_SUCCESS; -} - -/***************** workspace **************************/ -/*| temp filters_grad | temp features | temp output_grad| matmul_ws | */ -/*| temp filters_grad | transpose_ws | */ -/* multiplexing of space:(transpose_ws, temp_input + temp_diffy + - * matmul_ws) */ -mluOpStatus_t MLUOP_WIN_API -mluOpGetIndiceConvolutionBackwardFilterWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t features_desc, - const mluOpTensorDescriptor_t output_grad_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, - const mluOpTensorDescriptor_t filters_grad_desc, const int64_t indice_num[], - const int64_t inverse, const int64_t subm, size_t *size) { - const std::string api_name = - "[mluOpGetIndiceConvolutionBackwardFilterWorkspaceSize]"; - PARAM_CHECK(api_name, size != nullptr); - auto basic_check = - baseParamCheck(api_name, handle, features_desc, output_grad_desc, - indice_pairs_desc, filters_grad_desc, indice_num, inverse); - if (MLUOP_STATUS_SUCCESS != basic_check) { - return basic_check; - } - - // zero element check - if (0 == features_desc->total_element_num || - 0 == output_grad_desc->total_element_num || - 0 == indice_pairs_desc->total_element_num || - 0 == filters_grad_desc->total_element_num) { - VLOG(5) << api_name << " Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - - CHECK_RETURN(api_name, - internalIndiceConvBackwardFilter( - api_name, handle, features_desc, nullptr, output_grad_desc, - nullptr, indice_pairs_desc, nullptr, indice_num, nullptr, - size, filters_grad_desc, nullptr)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpIndiceConvolutionBackwardFilter( - mluOpHandle_t handle, const mluOpTensorDescriptor_t features_desc, - const void *features, const mluOpTensorDescriptor_t output_grad_desc, - const void *output_grad, const mluOpTensorDescriptor_t indice_pairs_desc, - const void *indice_pairs, const int64_t indice_num[], const int64_t inverse, - const int64_t subm, void *workspace, size_t workspace_size, - 
const mluOpTensorDescriptor_t filters_grad_desc, void *filters_grad) { - const std::string api_name = "[mluOpIndiceConvolutionBackwardFilter]"; - - auto basic_check = - baseParamCheck(api_name, handle, features_desc, output_grad_desc, - indice_pairs_desc, filters_grad_desc, indice_num, inverse); - if (MLUOP_STATUS_SUCCESS != basic_check) { - return basic_check; - } - - // zero element check - if (0 == features_desc->total_element_num || - 0 == output_grad_desc->total_element_num || - 0 == indice_pairs_desc->total_element_num || - 0 == filters_grad_desc->total_element_num) { - VLOG(5) << api_name << " Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - - // check data ptr - PARAM_CHECK(api_name, features != nullptr); - PARAM_CHECK(api_name, output_grad != nullptr); - PARAM_CHECK(api_name, indice_pairs != nullptr); - PARAM_CHECK(api_name, filters_grad != nullptr); - if (workspace_size > 0) { - PARAM_CHECK(api_name, workspace != nullptr); - } - - // gen_case - if (MLUOP_GEN_CASE_ON_NEW) { - indiceConvFilterGencase(handle, features_desc, features, output_grad_desc, - output_grad, indice_pairs_desc, indice_pairs, - indice_num, inverse, subm, workspace, - workspace_size, filters_grad_desc, filters_grad); - } - - CHECK_RETURN(api_name, - internalIndiceConvBackwardFilter( - api_name, handle, features_desc, features, output_grad_desc, - output_grad, indice_pairs_desc, indice_pairs, indice_num, - workspace, nullptr, filters_grad_desc, filters_grad)); - - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/indice_convolution_forward/indice_convolution_forward.cpp b/kernels/indice_convolution_forward/indice_convolution_forward.cpp deleted file mode 100644 index 3c7f024f8..000000000 --- a/kernels/indice_convolution_forward/indice_convolution_forward.cpp +++ /dev/null @@ -1,636 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include -#include - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/mlu_env.h" -#include "core/tensor.h" -#include "core/type.h" -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" -#include "mlu_op.h" - -static mluOpStatus_t foolProof( - const std::string api_name, mluOpHandle_t handle, - const mluOpTensorDescriptor_t features_desc, - const mluOpTensorDescriptor_t filters_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, const int64_t indice_num[], - const int64_t num_act_out, const int64_t inverse, const int64_t sub_m, - const mluOpTensorDescriptor_t features_out_desc) { - // nullptr check - PARAM_CHECK(api_name, handle != nullptr); - PARAM_CHECK(api_name, features_desc != nullptr); - PARAM_CHECK(api_name, filters_desc != nullptr); - PARAM_CHECK(api_name, indice_pairs_desc != nullptr); - PARAM_CHECK(api_name, indice_num != nullptr); - PARAM_CHECK(api_name, features_out_desc != nullptr); - - // platform check - if (handle->arch < 372) { - LOG(ERROR) << api_name << "Only mlu300 and above devices are supported." - << "Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - - // data type check - PARAM_CHECK(api_name, features_desc->dtype == MLUOP_DTYPE_FLOAT || - features_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK(api_name, filters_desc->dtype == MLUOP_DTYPE_FLOAT || - filters_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK(api_name, indice_pairs_desc->dtype == MLUOP_DTYPE_INT32); - PARAM_CHECK(api_name, features_out_desc->dtype == MLUOP_DTYPE_FLOAT || - features_out_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK(api_name, features_desc->dtype == features_out_desc->dtype && - features_desc->dtype == filters_desc->dtype); - - // inverse not supported now - PARAM_CHECK(api_name, sub_m == 0 || sub_m == 1); - PARAM_CHECK(api_name, inverse == 0 || inverse == 1); - if (inverse != 0) { - LOG(ERROR) << api_name << "inverse is: " << inverse - << ", which is not supported now."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - // layout check - // DHWCN layout not supported yet, use ARRAY temporarily - // PARAM_CHECK(api_name, filters_desc->layout == MLUOP_LAYOUT_DHWCN); - if (filters_desc->layout != MLUOP_LAYOUT_NDHWC && - filters_desc->layout != MLUOP_LAYOUT_NCDHW && - filters_desc->layout != MLUOP_LAYOUT_ARRAY) { - LOG(ERROR) << api_name << "The layout of filters is: " - << mluOpGetNameOfTensorLayout(filters_desc->layout) - << ", which is not supported now."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - // shape check - PARAM_CHECK(api_name, features_desc->dim == 2); - PARAM_CHECK(api_name, indice_pairs_desc->dim == 3); - PARAM_CHECK(api_name, features_out_desc->dim == 2); - if (filters_desc->dim != 5) { - LOG(ERROR) << api_name - << "The filters dimension number only supports 5 currently," - << " but filters dimension number is: " << filters_desc->dim - << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - // large tensor - if (mluOpGetTensorElementNum(features_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(filters_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(indice_pairs_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(features_out_desc) >= LARGE_TENSOR_NUM) { - LOG(ERROR) << api_name << "Max tensor number overflow.
Currently, " - << "MLU-OPS supports tensor elemenets number smaller than 2^31."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - auto ci = 0; - auto num_filter = 0; - auto co = 0; - if (filters_desc->layout == MLUOP_LAYOUT_ARRAY) { - ci = filters_desc->dims[3]; - num_filter = - filters_desc->dims[0] * filters_desc->dims[1] * filters_desc->dims[2]; - co = filters_desc->dims[4]; - } else { - ci = mluOpGetTensordimC(filters_desc); - num_filter = mluOpGetTensordimD(filters_desc) * - mluOpGetTensordimH(filters_desc) * - mluOpGetTensordimW(filters_desc); - co = mluOpGetTensordimN(filters_desc); - } - - // features shape check - PARAM_CHECK(api_name, features_desc->dims[0] == indice_pairs_desc->dims[2]); - PARAM_CHECK(api_name, features_desc->dims[1] == ci); - - // indice_pairs shape check - PARAM_CHECK(api_name, indice_pairs_desc->dims[0] == num_filter); - PARAM_CHECK(api_name, indice_pairs_desc->dims[1] == 2); - - // features_out shape check - PARAM_CHECK(api_name, features_out_desc->dims[0] == num_act_out); - PARAM_CHECK(api_name, features_out_desc->dims[1] == co); - - // indice_num[] check - for (int i = 0; i < num_filter; ++i) { - PARAM_CHECK(api_name, - indice_num[i] >= 0 && indice_num[i] <= features_desc->dims[0]); - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t mainIndiceConvolutionForward( - const std::string api_name, mluOpHandle_t handle, - const mluOpTensorDescriptor_t features_desc, const void *features, - const mluOpTensorDescriptor_t filters_desc, const void *filters, - const mluOpTensorDescriptor_t indice_pairs_desc, const void *indice_pairs, - const int64_t indice_num[], const int64_t num_act_out, void *workspace, - size_t *workspace_size, const mluOpTensorDescriptor_t features_out_desc, - void *features_out) { - // param init - bool is_workspace_compute = workspace_size != nullptr ? true : false; - bool filters_need_trans = true; - int32_t ci = 0; - int32_t co = 0; - // MLUOP_LAYOUT_DHWCN not supported yet. - if (filters_desc->layout == MLUOP_LAYOUT_ARRAY) { - filters_need_trans = false; - ci = filters_desc->dims[3]; - co = filters_desc->dims[4]; - } else { - ci = mluOpGetTensordimC(filters_desc); - co = mluOpGetTensordimN(filters_desc); - } - int32_t num_filter = indice_pairs_desc->dims[0]; - - int64_t num_act_in = indice_pairs_desc->dims[2]; - int64_t elementSize_filters = - ci * co * mluop::getSizeOfDataType(filters_desc->dtype); - int64_t elementSize_indice_pairs = - num_act_in * mluop::getSizeOfDataType(indice_pairs_desc->dtype); - - int32_t max_indice_num = 0; - for (int i = 0; i < num_filter; ++i) { - max_indice_num = - indice_num[i] > max_indice_num ? 
indice_num[i] : max_indice_num; - } - size_t workspaceSize_gather = - max_indice_num * ci * mluop::getSizeOfDataType(features_desc->dtype); - size_t workspaceSize_matmul = - max_indice_num * co * mluop::getSizeOfDataType(features_out_desc->dtype); - size_t workspaceSize_transpose = 0; - size_t workspaceSize_transposeExtra = 0; - if (filters_need_trans) { - workspaceSize_transpose = - num_filter * ci * co * mluop::getSizeOfDataType(filters_desc->dtype); - } - size_t workspaceSize_scatter = - num_act_out * co * mluop::getSizeOfDataType(features_out_desc->dtype); - size_t workspaceSize_matmulExtra = 0; - size_t tempSize_matmulExtra = 0; - size_t workspaceSize_addNExtra = 0; - size_t tempSize_addNExtra = 0; - size_t workspaceSize_maximum = 0; - - float matmul_alpha = 1.0; - float matmul_beta = 0.0; - int matmul_requested_algo = 1; - int matmul_received_algo = 0; - int matmul_is_transA = 0; - int matmul_is_transB = 0; - uint32_t matmul_allow_TF32 = 0; - uint32_t matmul_computetype = (uint32_t)filters_desc->dtype; - - // allocate workspace segment for intermediate data - void *validFilters_ptr = filters_need_trans ? workspace : (void *)filters; - void *transposeExtra_ptr = (char *)workspace + workspaceSize_transpose; - void *matmulResult_ptr = (char *)workspace + workspaceSize_transpose; - void *gatherResult_ptr = (char *)matmulResult_ptr + workspaceSize_matmul; - void *matmulExtra_ptr = (char *)gatherResult_ptr + workspaceSize_gather; - void *scatterResult_ptr = (char *)matmulResult_ptr + workspaceSize_matmul; - void *addNExtra_ptr = (char *)scatterResult_ptr + workspaceSize_scatter; - void *addN_ptrs[2] = {scatterResult_ptr, features_out}; - - // create intermediate tensor - mluOpTensorDescriptor_t active_indice_desc; - mluOpTensorDescriptor_t matmul_a_desc, matmul_b_desc, matmul_c_desc; - cnnlMatMulDescriptor_t matmul_desc; - // mluOpTensorDescriptor_t addN_descriptors[2] = {features_out_desc, - // features_out_desc}; - cnnlMatMulAlgo_t matmul_algo; - cnnlMatMulHeuristicResult_t heuristic_result; - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&active_indice_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_a_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_b_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&matmul_c_desc)); - CALL_CNNL(cnnlMatMulDescCreate(&matmul_desc)); - CALL_CNNL(cnnlMatMulAlgoCreate(&matmul_algo)); - CALL_CNNL(cnnlCreateMatMulHeuristicResult(&heuristic_result)); - - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_TRANSA, - &matmul_is_transA, sizeof(int32_t))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_TRANSB, - &matmul_is_transB, sizeof(int32_t))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_DESC_COMPUTE_TYPE, - &matmul_computetype, sizeof(int32_t))); - CALL_CNNL(cnnlSetMatMulDescAttr(matmul_desc, CNNL_MATMUL_ALLOW_TF32, - &matmul_allow_TF32, sizeof(int32_t))); - - // transpose filters to DHWCN layout - if (filters_need_trans) { - int trans_in_shape[3] = {0, 0, 0}; - int trans_out_shape[3] = {num_filter, ci, co}; - int permute[3] = {0, 0, 0}; - if (MLUOP_LAYOUT_NDHWC == filters_desc->layout) { - trans_in_shape[0] = co; - trans_in_shape[1] = num_filter; - trans_in_shape[2] = ci; - permute[0] = 1; - permute[1] = 2; - permute[2] = 0; - } else { - // MLUOP_LAYOUT_NCDHW == filters_desc->layout - trans_in_shape[0] = co; - trans_in_shape[1] = ci; - trans_in_shape[2] = num_filter; - permute[0] = 2; - permute[1] = 1; - permute[2] = 0; - } - mluOpTensorDescriptor_t
trans_in_desc, trans_out_desc; - cnnlTransposeDescriptor_t trans_desc; - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&trans_in_desc)); - CHECK_RETURN(api_name, mluOpCreateTensorDescriptor(&trans_out_desc)); - CALL_CNNL(cnnlCreateTransposeDescriptor(&trans_desc)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - trans_in_desc, MLUOP_LAYOUT_ARRAY, - filters_desc->dtype, 3, trans_in_shape)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - trans_out_desc, MLUOP_LAYOUT_ARRAY, - filters_desc->dtype, 3, trans_out_shape)); - CALL_CNNL(cnnlSetTransposeDescriptor(trans_desc, 3, permute)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_in_desc, cnnl_x_desc); - CALL_CNNL(cnnlGetTransposeWorkspaceSize( - cnnl_handle, cnnl_x_desc, trans_desc, &workspaceSize_transposeExtra)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - if (!is_workspace_compute) { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_in_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(trans_out_desc, cnnl_y_desc); - CALL_CNNL(cnnlTranspose_v2( - cnnl_handle, trans_desc, cnnl_x_desc, filters, cnnl_y_desc, - validFilters_ptr, transposeExtra_ptr, workspaceSize_transposeExtra)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_y_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(trans_in_desc)); - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(trans_out_desc)); - CALL_CNNL(cnnlDestroyTransposeDescriptor(trans_desc)); - } - - // invoke gather_nd and matmul to finish indice conv - int32_t active_point_num = 0; - int32_t active_indice[2] = {0, 1}; - int32_t matmul_a_shape[2] = {0, ci}; - int32_t matmul_b_shape[2] = {ci, co}; - int32_t matmul_c_shape[2] = {0, co}; - float init_val = 0; - - if (!is_workspace_compute) { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_out_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &init_val, - cnnl_output_desc, features_out)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - for (int i = 0; i < num_filter; ++i) { - active_point_num = indice_num[i]; - if (active_point_num <= 0) { - continue; - } - active_indice[0] = active_point_num; - matmul_a_shape[0] = active_point_num; - matmul_c_shape[0] = active_point_num; - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - active_indice_desc, MLUOP_LAYOUT_ARRAY, - indice_pairs_desc->dtype, 2, active_indice)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - matmul_a_desc, MLUOP_LAYOUT_ARRAY, - features_desc->dtype, 2, matmul_a_shape)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - matmul_b_desc, MLUOP_LAYOUT_ARRAY, - features_out_desc->dtype, 2, matmul_b_shape)); - CHECK_RETURN(api_name, mluOpSetTensorDescriptor( - matmul_c_desc, MLUOP_LAYOUT_ARRAY, - features_desc->dtype, 2, matmul_c_shape)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, cnnl_a_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, cnnl_b_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, cnnl_c_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, cnnl_d_desc); - CALL_CNNL(cnnlGetMatMulAlgoHeuristic( - cnnl_handle, matmul_desc, 
cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, - cnnl_d_desc, nullptr, matmul_requested_algo, &heuristic_result, - &matmul_received_algo)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_c_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_d_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - CALL_CNNL(cnnlGetMatMulHeuristicResult(heuristic_result, matmul_algo, - &tempSize_matmulExtra)); - uint32_t addn_num = 2; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - cnnlTensorDescriptor_t *cnnl_input_descs = - (cnnlTensorDescriptor_t *)malloc(sizeof(cnnlTensorDescriptor_t) * - addn_num); - for (int i = 0; i < addn_num; i++) { - CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_out_desc, - cnnl_input_descs[i]); - } - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_out_desc, - cnnl_output_desc); - CHECK_FUNC_RETURN( - cnnlGetAddNWorkspaceSize(cnnl_handle, cnnl_input_descs, addn_num, - cnnl_output_desc, &tempSize_addNExtra), - CNNL_STATUS_SUCCESS, - "[cnnlAddN_v2] Internal error occurred in cnnlGetAddNWorkspaceSize.", - MLUOP_STATUS_INTERNAL_ERROR); - for (int i = 0; i < addn_num; i++) { - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_descs[i]); - } - free(cnnl_input_descs); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - if (is_workspace_compute) { - workspaceSize_matmulExtra = - tempSize_matmulExtra > workspaceSize_matmulExtra - ? tempSize_matmulExtra - : workspaceSize_matmulExtra; - workspaceSize_addNExtra = tempSize_addNExtra > workspaceSize_addNExtra - ? tempSize_addNExtra - : workspaceSize_addNExtra; - } else { - void *filters_buffer = (char *)validFilters_ptr + i * elementSize_filters; - void *gatherIndice_buffer = - (char *)indice_pairs + i * 2 * elementSize_indice_pairs; - void *scatterAddIndice_buffer = - (char *)indice_pairs + (i * 2 + 1) * elementSize_indice_pairs; - // invoke gather to get input data: - // [num_act_in, ci] -> [indice_pairs_num[i], ci] - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_desc, - cnnl_params_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(active_indice_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, - cnnl_output_desc); - CALL_CNNL(cnnlGatherNd(cnnl_handle, cnnl_params_desc, features, - cnnl_indices_desc, gatherIndice_buffer, - cnnl_output_desc, gatherResult_ptr)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_params_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - // invoke matmul to get intermediate result: - // [indice_pairs_num[i], ci] * [ci, co] = [indice_pairs_num[i], co] - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_a_desc, - cnnl_a_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_b_desc, - cnnl_b_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, - cnnl_c_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, - cnnl_d_desc); - CALL_CNNL(cnnlMatMul_v2( - cnnl_handle, matmul_desc, matmul_algo, &matmul_alpha, cnnl_a_desc, - gatherResult_ptr, cnnl_b_desc, filters_buffer, &matmul_beta, - cnnl_c_desc, matmulResult_ptr, matmulExtra_ptr, - tempSize_matmulExtra, cnnl_d_desc, matmulResult_ptr)); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc); -
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_c_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_d_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_out_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &init_val, - cnnl_output_desc, scatterResult_ptr)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - // invoke scatter_add to add intermediate result to final result: - // [indice_num[i], co] -> [num_act_out, co] - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(active_indice_desc, - cnnl_indices_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(matmul_c_desc, - cnnl_updates_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_out_desc, - cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_out_desc, - cnnl_output_desc); - - CALL_CNNL(cnnlScatterNd_v2(cnnl_handle, CNNL_SCATTERND_UPDATE, - cnnl_indices_desc, scatterAddIndice_buffer, - cnnl_updates_desc, matmulResult_ptr, - cnnl_input_desc, scatterResult_ptr, - cnnl_output_desc, scatterResult_ptr)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_indices_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_updates_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - { - int addn_num = 2; - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - cnnlTensorDescriptor_t *cnnl_input_descs = - (cnnlTensorDescriptor_t *)malloc(sizeof(cnnlTensorDescriptor_t) * - addn_num); - for (int i = 0; i < addn_num; i++) { - CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_out_desc, - cnnl_input_descs[i]); - } - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(features_out_desc, - cnnl_output_desc); - - CALL_CNNL(cnnlAddN_v2(cnnl_handle, cnnl_input_descs, addN_ptrs, - addn_num, cnnl_output_desc, features_out, - addNExtra_ptr, tempSize_addNExtra)); - for (int i = 0; i < addn_num; i++) { - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_descs[i]); - } - free(cnnl_input_descs); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - } - } - if (is_workspace_compute) { - workspaceSize_maximum = std::max( - workspaceSize_matmul + workspaceSize_gather + workspaceSize_matmulExtra, - workspaceSize_transposeExtra); - workspaceSize_maximum = std::max( - workspaceSize_matmul + workspaceSize_scatter + workspaceSize_addNExtra, - workspaceSize_maximum); - *workspace_size = workspaceSize_transpose + workspaceSize_maximum; - } - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(active_indice_desc)); - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(matmul_a_desc)); - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(matmul_b_desc)); - CHECK_RETURN(api_name, mluOpDestroyTensorDescriptor(matmul_c_desc)); - CALL_CNNL(cnnlMatMulDescDestroy(matmul_desc)); - CALL_CNNL(cnnlMatMulAlgoDestroy(matmul_algo)); - CALL_CNNL(cnnlDestroyMatMulHeuristicResult(heuristic_result)); - return MLUOP_STATUS_SUCCESS; -} - -// workspace composition: -// | transposed filters | transpose_extra | -// || -// \/ -// | transposed filters | matmul_result | gather_result | matmul_extra | -// || -// \/ -// | transposed filters | matmul_result | scatter_result | addN_extra | -mluOpStatus_t MLUOP_WIN_API mluOpGetIndiceConvolutionForwardWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t features_desc, - 
const mluOpTensorDescriptor_t filters_desc, - const mluOpTensorDescriptor_t indice_pairs_desc, - const mluOpTensorDescriptor_t features_out_desc, const int64_t indice_num[], - const int64_t num_act_out, const int64_t inverse, const int64_t sub_m, - size_t *size) { - const std::string api_name = - "[mluOpGetIndiceConvolutionForwardWorkspaceSize]"; - - // foolproof check - auto fool_proof = foolProof(api_name, handle, features_desc, filters_desc, - indice_pairs_desc, indice_num, num_act_out, - inverse, sub_m, features_out_desc); - if (fool_proof != MLUOP_STATUS_SUCCESS) { - return fool_proof; - } - - // zero element - if (mluOpGetTensorElementNum(features_desc) == 0 || - mluOpGetTensorElementNum(indice_pairs_desc) == 0 || - mluOpGetTensorElementNum(filters_desc) == 0 || - mluOpGetTensorElementNum(features_out_desc) == 0) { - VLOG(5) << api_name << "Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - - // nullptr check - PARAM_CHECK(api_name, size != nullptr); - - // main process - CHECK_RETURN(api_name, - mainIndiceConvolutionForward( - api_name, handle, features_desc, nullptr, filters_desc, - nullptr, indice_pairs_desc, nullptr, indice_num, num_act_out, - nullptr, size, features_out_desc, nullptr)); - VLOG(5) << api_name << "workspace size: " << *size << "."; - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpIndiceConvolutionForward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t features_desc, - const void *features, const mluOpTensorDescriptor_t filters_desc, - const void *filters, const mluOpTensorDescriptor_t indice_pairs_desc, - const void *indice_pairs, const int64_t indice_num[], - const int64_t num_act_out, const int64_t inverse, const int64_t sub_m, - void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t features_out_desc, void *features_out) { - const std::string api_name = "[mluOpIndiceConvolutionForward]"; - - // foolproof check - auto fool_proof = foolProof(api_name, handle, features_desc, filters_desc, - indice_pairs_desc, indice_num, num_act_out, - inverse, sub_m, features_out_desc); - if (fool_proof != MLUOP_STATUS_SUCCESS) { - return fool_proof; - } - - // zero element - if (mluOpGetTensorElementNum(filters_desc) == 0 || - mluOpGetTensorElementNum(features_desc) == 0 || - mluOpGetTensorElementNum(indice_pairs_desc) == 0 || - mluOpGetTensorElementNum(features_out_desc) == 0) { - VLOG(5) << api_name << "Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - - // data pointer nullptr check - PARAM_CHECK(api_name, features != nullptr); - PARAM_CHECK(api_name, filters != nullptr); - PARAM_CHECK(api_name, indice_pairs != nullptr); - PARAM_CHECK(api_name, features_out != nullptr); - if (workspace_size > 0) { - PARAM_CHECK(api_name, workspace != nullptr); - } - - // gen_case - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("indice_convolution_forward"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA_REAL(true, "features", features, features_desc); - GEN_CASE_DATA_REAL(true, "filters", filters, filters_desc); - GEN_CASE_DATA_REAL(true, "indice_pairs_desc", indice_pairs, - indice_pairs_desc); - GEN_CASE_DATA_REAL(false, "features_out", features_out, features_out_desc); - GEN_CASE_OP_PARAM_SINGLE(0, "indice_convolution_forward", "inverse", - inverse); - GEN_CASE_OP_PARAM_SINGLE(1, "indice_convolution_forward", "sub_m", sub_m); - GEN_CASE_OP_PARAM_ARRAY(1, "indice_convolution_forward", "indice_num", - indice_num, indice_pairs_desc->dims[0]); - GEN_CASE_OP_PARAM_SINGLE(1, "indice_convolution_forward", "num_active_out", 
- num_act_out); - GEN_CASE_HANDLE_PARAM(); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); - } - - // main process - CHECK_RETURN(api_name, mainIndiceConvolutionForward( - api_name, handle, features_desc, features, - filters_desc, filters, indice_pairs_desc, - indice_pairs, indice_num, num_act_out, workspace, - nullptr, features_out_desc, features_out)); - - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_END(); - } - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/masked_col2im_forward/masked_col2im_forward.cpp b/kernels/masked_col2im_forward/masked_col2im_forward.cpp deleted file mode 100644 index 97d30717c..000000000 --- a/kernels/masked_col2im_forward/masked_col2im_forward.cpp +++ /dev/null @@ -1,346 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "masked_col2im_forward.h" - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" - -static void policyFunc(const mluOpHandle_t handle, const int mask_cnt, - cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { - const size_t cluster_limit = - mluop::runtime::getClusterLimitCapability(handle); - const size_t core_limit = - mluop::runtime::getCoreNumOfEachUnionCapability(handle); - const size_t task_dim = CEIL_ALIGN(mask_cnt, core_limit); - k_dim->x = core_limit; - k_dim->y = (task_dim / core_limit) > cluster_limit ? 
cluster_limit - : (task_dim / core_limit); - k_dim->z = 1; - *k_type = CNRT_FUNC_TYPE_UNION1; -} - -static mluOpStatus_t maskedCol2imForwardPreCheck( - const mluOpTensorDescriptor_t col_desc, - const mluOpTensorDescriptor_t mask_h_idx_desc, - const mluOpTensorDescriptor_t mask_w_idx_desc, - const mluOpTensorDescriptor_t im_desc) { - PARAM_CHECK("[mluOpMaskedCol2imForward]", col_desc != NULL); - PARAM_CHECK("[mluOpMaskedCol2imForward]", mask_h_idx_desc != NULL); - PARAM_CHECK("[mluOpMaskedCol2imForward]", mask_w_idx_desc != NULL); - PARAM_CHECK("[mluOpMaskedCol2imForward]", im_desc != NULL); - PARAM_CHECK("[mluOpMaskedCol2imForward]", col_desc->dim == 2); - PARAM_CHECK("[mluOpMaskedCol2imForward]", im_desc->dim == 4); - PARAM_CHECK("[mluOpMaskedCol2imForward]", mask_h_idx_desc->dim == 1); - PARAM_CHECK("[mluOpMaskedCol2imForward]", mask_w_idx_desc->dim == 1); - PARAM_CHECK("[mluOpMaskedCol2imForward]", - im_desc->layout == MLUOP_LAYOUT_NCHW); - PARAM_CHECK("[mluOpMaskedCol2imForward]", - col_desc->dtype == MLUOP_DTYPE_FLOAT || - col_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK("[mluOpMaskedCol2imForward]", col_desc->dtype == im_desc->dtype); - PARAM_CHECK("[mluOpMaskedCol2imForward]", - mask_h_idx_desc->dtype == MLUOP_DTYPE_INT32); - PARAM_CHECK("[mluOpMaskedCol2imForward]", - mask_w_idx_desc->dtype == MLUOP_DTYPE_INT32); - PARAM_CHECK("[mluOpMaskedCol2imForward]", im_desc->dims[0] == 1); - - PARAM_CHECK("[mluOpMaskedCol2imForward]", - mask_h_idx_desc->dims[0] == mask_w_idx_desc->dims[0]); - PARAM_CHECK("[mluOpMaskedCol2imForward]", - col_desc->dims[1] == mask_h_idx_desc->dims[0]); - PARAM_CHECK("[mluOpMaskedCol2imForward]", - col_desc->dims[0] == im_desc->dims[1]); - - const size_t col_element_num = mluOpGetTensorElementNum(col_desc); - const size_t mask_h_idx_element_num = - mluOpGetTensorElementNum(mask_h_idx_desc); - const size_t im_element_num = mluOpGetTensorElementNum(im_desc); - TENSOR_NUM_CHECK("[mluOpMaskedCol2imForward]", col_element_num, - LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK("[mluOpMaskedCol2imForward]", mask_h_idx_element_num, - LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK("[mluOpMaskedCol2imForward]", im_element_num, - LARGE_TENSOR_NUM, ""); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpGetMaskedCol2imForwardWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t col_desc, - const mluOpTensorDescriptor_t mask_h_idx_desc, - const mluOpTensorDescriptor_t mask_w_idx_desc, - const mluOpTensorDescriptor_t im_desc, size_t *workspace_size) { - mluOpStatus_t status = MLUOP_STATUS_BAD_PARAM; - PARAM_CHECK("[mluOpMaskedCol2imForward]", handle != NULL); - PARAM_CHECK("[mluOpMaskedCol2imForward]", workspace_size != NULL); - status = maskedCol2imForwardPreCheck(col_desc, mask_h_idx_desc, - mask_w_idx_desc, im_desc); - if (MLUOP_STATUS_SUCCESS != status) { - return status; - } - if (mluOpGetTensorElementNum(im_desc) == 0 || col_desc->dims[0] == 0) { - LOG(ERROR) << "[mluOpMaskedCol2imForward] Zero element tensor failure."; - return MLUOP_STATUS_BAD_PARAM; - } - if (mluOpGetTensorElementNum(mask_h_idx_desc) == 0) { - VLOG(5) << "[mluOpMaskedCol2imForward] Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - *workspace_size = col_desc->total_tensor_size; - *workspace_size += im_desc->total_tensor_size; - - cnnlTransposeDescriptor_t trans_desc; - size_t col_transpose_workspace_size = 0; - int col_dim = col_desc->dim; - int col_permute[2] = {1, 0}; - int col_MC_dims[2] = {0, 0}; - col_MC_dims[0] = col_desc->dims[1]; - col_MC_dims[1] = 
col_desc->dims[0]; - mluOpTensorDescriptor_t col_MC_desc_tmp; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&col_MC_desc_tmp)); - - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(col_MC_desc_tmp, MLUOP_LAYOUT_ARRAY, - col_desc->dtype, col_dim, col_MC_dims)); - CALL_CNNL(cnnlCreateTransposeDescriptor(&trans_desc)); - CALL_CNNL(cnnlSetTransposeDescriptor(trans_desc, col_dim, col_permute)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(col_MC_desc_tmp, cnnl_x_desc); - CALL_CNNL(cnnlGetTransposeWorkspaceSize( - cnnl_handle, cnnl_x_desc, trans_desc, &col_transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - int im_dim = im_desc->dim; - int im_permute[4] = {0, 3, 1, 2}; - int NCHW2NHWC_permute[4] = {0, 2, 3, 1}; - int im_NHWC_dims[4] = {0, 0, 0, 0}; - for (int i = 0; i < im_dim; ++i) { - im_NHWC_dims[i] = im_desc->dims[NCHW2NHWC_permute[i]]; - } - size_t im_transpose_workspace_size = 0; - mluOpTensorDescriptor_t im_NHWC_desc_tmp; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&im_NHWC_desc_tmp)); - - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(im_NHWC_desc_tmp, MLUOP_LAYOUT_ARRAY, - im_desc->dtype, im_dim, im_NHWC_dims)); - CALL_CNNL(cnnlSetTransposeDescriptor(trans_desc, im_dim, im_permute)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(im_NHWC_desc_tmp, cnnl_x_desc); - CALL_CNNL(cnnlGetTransposeWorkspaceSize( - cnnl_handle, cnnl_x_desc, trans_desc, &im_transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - *workspace_size += im_transpose_workspace_size > col_transpose_workspace_size - ? 
im_transpose_workspace_size - : col_transpose_workspace_size; - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(im_NHWC_desc_tmp)); - CALL_CNNL(cnnlDestroyTransposeDescriptor(trans_desc)); - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(col_MC_desc_tmp)); - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t transposeTensor( - mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, - const void *input, const int *permute, - const mluOpTensorDescriptor_t workspace_dst_desc, void *workspace_dst, - void *transpose_workspace) { - const int input_dim = input_desc->dim; - cnnlTransposeDescriptor_t trans_desc; - size_t transpose_workspace_size = 0; - CALL_CNNL(cnnlCreateTransposeDescriptor(&trans_desc)); - CALL_CNNL(cnnlSetTransposeDescriptor(trans_desc, input_dim, permute)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_x_desc); - CALL_CNNL(cnnlGetTransposeWorkspaceSize( - cnnl_handle, cnnl_x_desc, trans_desc, &transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(workspace_dst_desc, - cnnl_y_desc); - CALL_CNNL(cnnlTranspose_v2(cnnl_handle, trans_desc, cnnl_x_desc, input, - cnnl_y_desc, workspace_dst, transpose_workspace, - transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_y_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - CALL_CNNL(cnnlDestroyTransposeDescriptor(trans_desc)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpMaskedCol2imForward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t col_desc, - const void *col, const mluOpTensorDescriptor_t mask_h_idx_desc, - const void *mask_h_idx, const mluOpTensorDescriptor_t mask_w_idx_desc, - const void *mask_w_idx, const size_t workspace_size, void *workspace, - const mluOpTensorDescriptor_t im_desc, void *im) { - mluOpStatus_t status = MLUOP_STATUS_BAD_PARAM; - PARAM_CHECK("[mluOpMaskedCol2imForward]", handle != NULL); - status = maskedCol2imForwardPreCheck(col_desc, mask_h_idx_desc, - mask_w_idx_desc, im_desc); - if (MLUOP_STATUS_SUCCESS != status) { - return status; - } - if (mluOpGetTensorElementNum(im_desc) == 0 || col_desc->dims[0] == 0) { - LOG(ERROR) << "[mluOpMaskedCol2imForward] Zero element tensor failure."; - return MLUOP_STATUS_BAD_PARAM; - } - if (mluOpGetTensorElementNum(mask_h_idx_desc) == 0) { - VLOG(5) << "[mluOpMaskedCol2imForward] Skip zero element tensor."; - uint64_t fill_value = 0x0; - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(im_desc, cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, im)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; - } - if (workspace_size > 0) { - PARAM_CHECK("[mluOpMaskedCol2imForward]", workspace != NULL); - } - PARAM_CHECK("[mluOpMaskedCol2imForward]", col != NULL); - PARAM_CHECK("[mluOpMaskedCol2imForward]", mask_h_idx != NULL); - PARAM_CHECK("[mluOpMaskedCol2imForward]", mask_w_idx != NULL); - PARAM_CHECK("[mluOpMaskedCol2imForward]", im != NULL); - - // generate 
mluOpMaskedCol2imForward prototxt start! - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("masked_col2im_forward"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA(true, "col", col, col_desc, -10, 10); - GEN_CASE_DATA_REAL(true, "mask_h_idx", mask_h_idx, mask_h_idx_desc); - GEN_CASE_DATA_REAL(true, "mask_w_idx", mask_w_idx, mask_w_idx_desc); - GEN_CASE_DATA(false, "im", im, im_desc, 0, 0); - GEN_CASE_TEST_PARAM_NEW(false, false, true, 0, 0, 0); - } - // generate mluOpMaskedCol2imForward prototxt end! - mluOpDataType_t input_dtype = col_desc->dtype; - void *col_workspace = workspace; - void *im_workspace = (char *)workspace + col_desc->total_tensor_size; - void *transpose_workspace = (char *)im_workspace + im_desc->total_tensor_size; - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - const int mask_cnt = mask_h_idx_desc->dims[0]; - policyFunc(handle, mask_cnt, &k_dim, &k_type); - - VLOG(5) << "[mluOpMaskedCol2imForward] cnnlFill_v3 start."; - const int im_dim = im_desc->dim; - int NCHW2NHWC_permute[4] = {0, 2, 3, 1}; - int im_NHWC_dims[4] = {0, 0, 0, 0}; - for (int i = 0; i < im_dim; ++i) { - im_NHWC_dims[i] = im_desc->dims[NCHW2NHWC_permute[i]]; - } - mluOpTensorDescriptor_t im_NHWC_desc_tmp; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&im_NHWC_desc_tmp)); - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(im_NHWC_desc_tmp, MLUOP_LAYOUT_ARRAY, - im_desc->dtype, im_dim, im_NHWC_dims)); - uint64_t fill_value = 0x0; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(im_NHWC_desc_tmp, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, im_workspace)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - VLOG(5) << "[mluOpMaskedCol2imForward] cnnlFill_v3 end."; - - VLOG(5) << "[mluOpMaskedCol2imForward] cnnlTranspose_v2 col start."; - - int col_dim = col_desc->dim; - int col_permute[2] = {1, 0}; - int col_MC_dims[2] = {0, 0}; - col_MC_dims[0] = col_desc->dims[1]; - col_MC_dims[1] = col_desc->dims[0]; - mluOpTensorDescriptor_t col_MC_desc_tmp; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&col_MC_desc_tmp)); - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(col_MC_desc_tmp, MLUOP_LAYOUT_ARRAY, - col_desc->dtype, col_dim, col_MC_dims)); - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == - transposeTensor(handle, col_desc, col, col_permute, col_MC_desc_tmp, - col_workspace, transpose_workspace)); - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(col_MC_desc_tmp)); - VLOG(5) << "[mluOpMaskedCol2imForward] cnnlTranspose_v2 col end."; - - const int channels = im_desc->dims[1]; - const int height = im_desc->dims[2]; - const int width = im_desc->dims[3]; - VLOG(5) << "Launch kernel MLUUnion1MaskedCol2imForward<<<" << k_dim.x << ", " - << k_dim.y << ", " << k_dim.z << ">>>."; - CHECK_RETURN("[mluOpMaskedCol2imForward]", - KernelMaskedCol2imForward(k_dim, k_type, handle->queue, - input_dtype, col_workspace, height, - width, channels, mask_h_idx, - mask_w_idx, mask_cnt, im_workspace)); - VLOG(5) << "Finish launch MLUUnion1MaskedCol2imForward."; - - VLOG(5) << "[mluOpMaskedCol2imForward] cnnlTranspose_v2 im start."; - int im_permute[4] = {0, 3, 1, 2}; - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == transposeTensor(handle, im_NHWC_desc_tmp, - im_workspace, 
im_permute, im_desc, - im, transpose_workspace)); - PARAM_CHECK( - "[mluOpMaskedCol2imForward]", - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(im_NHWC_desc_tmp)); - VLOG(5) << "[mluOpMaskedCol2imForward] cnnlTranspose_v2 im end."; - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/masked_col2im_forward/masked_col2im_forward.h b/kernels/masked_col2im_forward/masked_col2im_forward.h deleted file mode 100644 index e6a7645f7..000000000 --- a/kernels/masked_col2im_forward/masked_col2im_forward.h +++ /dev/null @@ -1,35 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#ifndef KERNELS_MASKED_COL2IM_FORWARD_MASKED_COL2IM_FORWARD_H_ -#define KERNELS_MASKED_COL2IM_FORWARD_MASKED_COL2IM_FORWARD_H_ - -#include "mlu_op.h" - -// declare func -mluOpStatus_t MLUOP_WIN_API KernelMaskedCol2imForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const mluOpDataType_t data_dtype, const void *col, const int height, - const int width, const int channels, const void *mask_h_idx, - const void *mask_w_idx, const int mask_cnt, void *im); - -#endif // KERNELS_MASKED_COL2IM_FORWARD_MASKED_COL2IM_FORWARD_H_ diff --git a/kernels/masked_col2im_forward/masked_col2im_forward_union1.mlu b/kernels/masked_col2im_forward/masked_col2im_forward_union1.mlu deleted file mode 100644 index cf0ce7b07..000000000 --- a/kernels/masked_col2im_forward/masked_col2im_forward_union1.mlu +++ /dev/null @@ -1,121 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software.
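Aside on the host logic deleted above: `mluOpMaskedCol2imForward` consumes the workspace reported by `mluOpGetMaskedCol2imForwardWorkspaceSize` as three back-to-back regions, the transposed `col` tensor, the NHWC `im` scratch tensor, and one shared scratch area for both `cnnlTranspose_v2` calls. A minimal sketch of that size computation (the helper name is illustrative; `std::max` stands in for the ternary used in the deleted code):

```cpp
#include <algorithm>
#include <cstddef>

// Workspace layout assumed by the deleted host code:
//   [ col_workspace | im_workspace | transpose_workspace ]
// The two transposes run one after the other, so a single scratch region
// sized for the larger of the two requests is sufficient.
size_t MaskedCol2imWorkspaceBytes(size_t col_bytes, size_t im_bytes,
                                  size_t col_transpose_bytes,
                                  size_t im_transpose_bytes) {
  return col_bytes + im_bytes +
         std::max(col_transpose_bytes, im_transpose_bytes);
}
```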
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "masked_col2im_forward.h" - -#include - -#include "core/logging.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -__nram__ char data_nram[MAX_NRAM_SIZE]; - -template -__mlu_func__ void MLUMultiKernelMaskedCol2imForward( - const T *col, const int height, const int width, const int channels, - const int32_t *mask_h_idx, const int32_t *mask_w_idx, const int mask_cnt, - T *im) { - const int channels_max_num_nram = MAX_NRAM_SIZE / sizeof(T); - if (channels <= channels_max_num_nram) { - const int deal_num = channels_max_num_nram / channels; - int mask_per_core = mask_cnt / taskDim; - const int mask_remain = mask_cnt % taskDim; - mask_per_core += taskId < mask_remain ? 1 : 0; - int index_start = taskId < mask_remain - ? taskId * mask_per_core - : taskId * mask_per_core + mask_remain; - int loop = mask_per_core / deal_num; - int remain_num = mask_per_core % deal_num; - T *nram_col = (T *)data_nram; - for (int index = 0; index < loop; ++index) { - int cur_index = index_start + index * deal_num; - __memcpy(nram_col, col + cur_index * channels, - deal_num * channels * sizeof(T), GDRAM2NRAM); - for (int i = 0; i < deal_num; ++i) { - int mask_index = cur_index + i; - const int h_im = mask_h_idx[mask_index]; - const int w_im = mask_w_idx[mask_index]; - __memcpy(im + (h_im * width + w_im) * channels, nram_col + i * channels, - channels * sizeof(T), NRAM2GDRAM); - } - } - if (remain_num > 0) { - int cur_index = index_start + loop * deal_num; - __memcpy(nram_col, col + cur_index * channels, - remain_num * channels * sizeof(T), GDRAM2NRAM); - for (int i = 0; i < remain_num; ++i) { - int mask_index = cur_index + i; - const int h_im = mask_h_idx[mask_index]; - const int w_im = mask_w_idx[mask_index]; - __memcpy(im + (h_im * width + w_im) * channels, nram_col + i * channels, - channels * sizeof(T), NRAM2GDRAM); - } - } - } else { - for (int index = taskId; index < mask_cnt; index += taskDim) { - const int m_index = index % mask_cnt; - const int h_im = mask_h_idx[m_index]; - const int w_im = mask_w_idx[m_index]; - __memcpy(im + (h_im * width + w_im) * channels, col + index * channels, - channels * sizeof(T), GDRAM2GDRAM); - } - } -} - -template -__mlu_entry__ void MLUUnion1MaskedCol2imForward( - const void *col, const int height, const int width, const int channels, - const void *mask_h_idx, const void *mask_w_idx, const int mask_cnt, - void *im) { - if (__is_mpu()) { - return; - } - MLUMultiKernelMaskedCol2imForward((T *)col, height, width, channels, - (int32_t *)mask_h_idx, - (int32_t *)mask_w_idx, mask_cnt, (T *)im); -} - -mluOpStatus_t MLUOP_WIN_API KernelMaskedCol2imForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const mluOpDataType_t data_dtype, const void *col, const int height, - const int width, const int channels, const void *mask_h_idx, - const void *mask_w_idx, const int mask_cnt, void *im) { - switch (data_dtype) { - /* Only float and half data types are 
supported - in host-side CPP file fool-proof processing. */ - case MLUOP_DTYPE_FLOAT: { - KERNEL_CHECK( - MLUUnion1MaskedCol2imForward - <<>>(col, height, width, channels, mask_h_idx, - mask_w_idx, mask_cnt, im)); - }; break; - case MLUOP_DTYPE_HALF: { - KERNEL_CHECK(MLUUnion1MaskedCol2imForward<<>>( - col, height, width, channels, mask_h_idx, mask_w_idx, mask_cnt, im)); - }; break; - default: - break; - } - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/masked_im2col_forward/masked_im2col_forward.cpp b/kernels/masked_im2col_forward/masked_im2col_forward.cpp deleted file mode 100644 index 04373a125..000000000 --- a/kernels/masked_im2col_forward/masked_im2col_forward.cpp +++ /dev/null @@ -1,373 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "kernels/masked_im2col_forward/masked_im2col_forward.h" - -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" - -// policy function -static void policyFunc(const mluOpHandle_t handle, const int mask_cnt, - cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) { - const size_t cluster_limit = - mluop::runtime::getClusterLimitCapability(handle); - const size_t core_limit = - mluop::runtime::getCoreNumOfEachUnionCapability(handle); - const size_t task_dim = CEIL_ALIGN(mask_cnt, core_limit); - k_dim->x = core_limit; - k_dim->y = (task_dim / core_limit) > cluster_limit ? 
cluster_limit - : (task_dim / core_limit); - k_dim->z = 1; - *k_type = CNRT_FUNC_TYPE_UNION1; -} - -static mluOpStatus_t maskedIm2colForwardPreCheck( - const mluOpHandle_t handle, const mluOpTensorDescriptor_t feature_desc, - const mluOpTensorDescriptor_t mask_h_idx_desc, - const mluOpTensorDescriptor_t mask_w_idx_desc, - const mluOpTensorDescriptor_t data_col_desc, const int kernel_h, - const int kernel_w) { - PARAM_CHECK("[mluOpMaskedIm2colForward]", handle != NULL); - PARAM_CHECK("[mluOpMaskedIm2colForward]", feature_desc != NULL); - PARAM_CHECK("[mluOpMaskedIm2colForward]", mask_h_idx_desc != NULL); - PARAM_CHECK("[mluOpMaskedIm2colForward]", mask_w_idx_desc != NULL); - PARAM_CHECK("[mluOpMaskedIm2colForward]", data_col_desc != NULL); - - PARAM_CHECK("[mluOpMaskedIm2colForward]", - feature_desc->layout == MLUOP_LAYOUT_NCHW); - PARAM_CHECK("[mluOpMaskedIm2colForward]", feature_desc->dim == 4); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - feature_desc->dtype == MLUOP_DTYPE_FLOAT || - feature_desc->dtype == MLUOP_DTYPE_HALF); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - feature_desc->dtype == data_col_desc->dtype); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - mask_h_idx_desc->dtype == MLUOP_DTYPE_INT32); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - mask_w_idx_desc->dtype == MLUOP_DTYPE_INT32); - PARAM_CHECK("[mluOpMaskedIm2colForward]", feature_desc->dims[0] == 1); - PARAM_CHECK("[mluOpMaskedIm2colForward]", mask_h_idx_desc->dim == 1); - PARAM_CHECK("[mluOpMaskedIm2colForward]", mask_w_idx_desc->dim == 1); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - mask_h_idx_desc->dims[0] == mask_w_idx_desc->dims[0]); - PARAM_CHECK("[mluOpMaskedIm2colForward]", data_col_desc->dim == 2); - PARAM_CHECK( - "[mluOpMaskedIm2colForward]", - data_col_desc->dims[0] == feature_desc->dims[1] * kernel_h * kernel_w); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - data_col_desc->dims[1] == mask_h_idx_desc->dims[0]); - PARAM_CHECK("[mluOpMaskedIm2colForward]", kernel_h > 0); - PARAM_CHECK("[mluOpMaskedIm2colForward]", kernel_w > 0); - - const uint64_t feature_element_num = mluOpGetTensorElementNum(feature_desc); - const uint64_t mask_h_idx_element_num = - mluOpGetTensorElementNum(mask_h_idx_desc); - const uint64_t data_col_element_num = mluOpGetTensorElementNum(data_col_desc); - TENSOR_NUM_CHECK("[mluOpMaskedIm2colForward]", feature_element_num, - LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK("[mluOpMaskedIm2colForward]", mask_h_idx_element_num, - LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK("[mluOpMaskedIm2colForward]", data_col_element_num, - LARGE_TENSOR_NUM, ""); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpGetMaskedIm2colForwardWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t feature_desc, - const mluOpTensorDescriptor_t mask_h_idx_desc, - const mluOpTensorDescriptor_t mask_w_idx_desc, const int kernel_h, - const int kernel_w, const mluOpTensorDescriptor_t data_col_desc, - size_t *workspace_size) { - mluOpStatus_t status = MLUOP_STATUS_BAD_PARAM; - PARAM_CHECK("[mluOpMaskedIm2colForward]", workspace_size != NULL); - status = maskedIm2colForwardPreCheck(handle, feature_desc, mask_h_idx_desc, - mask_w_idx_desc, data_col_desc, kernel_h, - kernel_w); - if (MLUOP_STATUS_SUCCESS != status) { - return status; - } - if (mluOpGetTensorElementNum(feature_desc) == 0 || - data_col_desc->dims[0] == 0) { - LOG(ERROR) << "[mluOpMaskedIm2colForward] Zero element tensor failure."; - return MLUOP_STATUS_BAD_PARAM; - } - if (mluOpGetTensorElementNum(mask_h_idx_desc) == 0) { - VLOG(5) << 
"[mluOpMaskedIm2colForward] Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - *workspace_size = feature_desc->total_tensor_size; - *workspace_size += data_col_desc->total_tensor_size; - - cnnlTransposeDescriptor_t trans_desc; - size_t feature_transpose_workspace_size = 0; - int feature_dim = feature_desc->dim; - int feature_permute[4] = {0, 3, 1, 2}; - - CALL_CNNL(cnnlCreateTransposeDescriptor(&trans_desc)); - CALL_CNNL( - cnnlSetTransposeDescriptor(trans_desc, feature_dim, feature_permute)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(feature_desc, cnnl_x_desc); - CALL_CNNL(cnnlGetTransposeWorkspaceSize(cnnl_handle, cnnl_x_desc, - trans_desc, - &feature_transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - if (mluOpGetTensorElementNum(feature_desc) == 0 || - data_col_desc->dims[0] == 0) { - VLOG(5) << "[mluOpMaskedIm2colForward] Zero element tensor failure."; - return MLUOP_STATUS_BAD_PARAM; - } - int data_col_dim = 3; - int data_col_permute[3] = {2, 1, 0}; - int data_col_HWC_dims[3] = {0, 0, 0}; - int data_col_CHW_dims[3] = {0, 0, 0}; - data_col_HWC_dims[0] = mask_h_idx_desc->dims[0]; - data_col_HWC_dims[1] = kernel_h * kernel_w; - data_col_HWC_dims[2] = feature_desc->dims[1]; - for (int i = 0; i < data_col_dim; ++i) { - data_col_CHW_dims[i] = data_col_HWC_dims[data_col_permute[i]]; - } - size_t data_col_transpose_workspace_size = 0; - mluOpTensorDescriptor_t data_col_HWC_desc_tmp; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&data_col_HWC_desc_tmp)); - - PARAM_CHECK("[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor( - data_col_HWC_desc_tmp, MLUOP_LAYOUT_ARRAY, - feature_desc->dtype, data_col_dim, data_col_HWC_dims)); - CALL_CNNL( - cnnlSetTransposeDescriptor(trans_desc, data_col_dim, data_col_permute)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(data_col_HWC_desc_tmp, - cnnl_x_desc); - CALL_CNNL( - cnnlGetTransposeWorkspaceSize(cnnl_handle, cnnl_x_desc, trans_desc, - &data_col_transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - *workspace_size += - data_col_transpose_workspace_size > feature_transpose_workspace_size - ? 
data_col_transpose_workspace_size - : feature_transpose_workspace_size; - PARAM_CHECK("[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == - mluOpDestroyTensorDescriptor(data_col_HWC_desc_tmp)); - CALL_CNNL(cnnlDestroyTransposeDescriptor(trans_desc)); - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t transposeTensor( - mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, - const void *input, const int *permute, - const mluOpTensorDescriptor_t workspace_dst_desc, void *workspace_dst, - void *transpose_workspace) { - int input_dim = input_desc->dim; - cnnlTransposeDescriptor_t trans_desc; - size_t transpose_workspace_size = 0; - CALL_CNNL(cnnlCreateTransposeDescriptor(&trans_desc)); - CALL_CNNL(cnnlSetTransposeDescriptor(trans_desc, input_dim, permute)); - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_x_desc); - CALL_CNNL(cnnlGetTransposeWorkspaceSize( - cnnl_handle, cnnl_x_desc, trans_desc, &transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(workspace_dst_desc, - cnnl_y_desc); - CALL_CNNL(cnnlTranspose_v2(cnnl_handle, trans_desc, cnnl_x_desc, input, - cnnl_y_desc, workspace_dst, transpose_workspace, - transpose_workspace_size)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_y_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - CALL_CNNL(cnnlDestroyTransposeDescriptor(trans_desc)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpMaskedIm2colForward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t feature_desc, - const void *feature, const mluOpTensorDescriptor_t mask_h_idx_desc, - const void *mask_h_idx, const mluOpTensorDescriptor_t mask_w_idx_desc, - const void *mask_w_idx, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, void *workspace, - const size_t workspace_size, const mluOpTensorDescriptor_t data_col_desc, - void *data_col) { - mluOpStatus_t status = MLUOP_STATUS_BAD_PARAM; - status = maskedIm2colForwardPreCheck(handle, feature_desc, mask_h_idx_desc, - mask_w_idx_desc, data_col_desc, kernel_h, - kernel_w); - if (MLUOP_STATUS_SUCCESS != status) { - return status; - } - - if (mluOpGetTensorElementNum(feature_desc) == 0 || - data_col_desc->dims[0] == 0) { - LOG(ERROR) << "[mluOpMaskedIm2colForward] Zero element tensor failure."; - return MLUOP_STATUS_BAD_PARAM; - } - if (mluOpGetTensorElementNum(mask_h_idx_desc) == 0) { - VLOG(5) << "[mluOpMaskedIm2colForward] Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - if (workspace_size > 0) { - PARAM_CHECK("[mluOpMaskedIm2colForward]", workspace != NULL); - } - PARAM_CHECK("[mluOpMaskedIm2colForward]", feature != NULL); - PARAM_CHECK("[mluOpMaskedIm2colForward]", mask_h_idx != NULL); - PARAM_CHECK("[mluOpMaskedIm2colForward]", mask_w_idx != NULL); - PARAM_CHECK("[mluOpMaskedIm2colForward]", data_col != NULL); - - // generate mluOpMaskedIm2colForward prototxt start! 
- if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("masked_im2col_forward"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA(true, "feature", feature, feature_desc, -10, 10); - GEN_CASE_DATA_REAL(true, "mask_h_idx", mask_h_idx, mask_h_idx_desc); - GEN_CASE_DATA_REAL(true, "mask_w_idx", mask_w_idx, mask_w_idx_desc); - GEN_CASE_DATA(false, "data_col", data_col, data_col_desc, 0, 0); - GEN_CASE_OP_PARAM_SINGLE(0, "masked_im2col_forward", "kernel_h", kernel_h); - GEN_CASE_OP_PARAM_SINGLE(1, "masked_im2col_forward", "kernel_w", kernel_w); - GEN_CASE_OP_PARAM_SINGLE(1, "masked_im2col_forward", "pad_h", pad_h); - GEN_CASE_OP_PARAM_SINGLE(2, "masked_im2col_forward", "pad_w", pad_w); - GEN_CASE_TEST_PARAM_NEW(false, false, true, 0, 0, 0); - } - // generate mluOpMaskedIm2colForward prototxt end! - mluOpDataType_t input_dtype = feature_desc->dtype; - void *feature_workspace = workspace; - void *data_col_workspace = - (char *)workspace + feature_desc->total_tensor_size; - void *transpose_workspace = - (char *)data_col_workspace + data_col_desc->total_tensor_size; - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - const int mask_cnt = mask_h_idx_desc->dims[0]; - policyFunc(handle, mask_cnt, &k_dim, &k_type); - - VLOG(5) << "[mluOpMaskedIm2colForward] cnnlFill_v3 start."; - uint64_t fill_value = 0x0; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(data_col_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, data_col_workspace)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - VLOG(5) << "[mluOpMaskedIm2colForward] cnnlTranspose_v2 feature start."; - - int feature_dim = feature_desc->dim; - int feature_permute[4] = {0, 2, 3, 1}; - int feature_tmp_dims[4] = {0, 0, 0, 0}; - - for (int i = 0; i < feature_dim; ++i) { - feature_tmp_dims[i] = feature_desc->dims[feature_permute[i]]; - } - - mluOpTensorDescriptor_t feature_desc_tmp; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&feature_desc_tmp)); - PARAM_CHECK( - "[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(feature_desc_tmp, MLUOP_LAYOUT_ARRAY, - input_dtype, feature_dim, feature_tmp_dims)); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == - transposeTensor(handle, feature_desc, feature, - feature_permute, feature_desc_tmp, - feature_workspace, transpose_workspace)); - - PARAM_CHECK( - "[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == mluOpDestroyTensorDescriptor(feature_desc_tmp)); - - const int channels = feature_desc->dims[1]; - const int height = feature_desc->dims[2]; - const int width = feature_desc->dims[3]; - VLOG(5) << "Launch kernel MLUUnion1MaskedIm2colForward<<<" << k_dim.x << ", " - << k_dim.y << ", " << k_dim.z << ">>>."; - CHECK_RETURN("[mluOpMaskedIm2colForward]", - KernelMaskedIm2colForward( - k_dim, k_type, handle->queue, input_dtype, feature_workspace, - height, width, channels, kernel_h, kernel_w, pad_h, pad_w, - mask_h_idx, mask_w_idx, mask_cnt, data_col_workspace)); - - VLOG(5) << "[mluOpMaskedIm2colForward] cnnlTranspose_v2 data_col start."; - const int data_col_dim = 3; - int data_col_permute[3] = {2, 1, 0}; - int data_col_HWC_dims[3] = {0, 0, 0}; - int data_col_CHW_dims[3] = {0, 0, 0}; - data_col_HWC_dims[0] = mask_cnt; - data_col_HWC_dims[1] = kernel_h * kernel_w; - data_col_HWC_dims[2] = channels; - for (int i = 0; i < data_col_dim; ++i) { - data_col_CHW_dims[i] = data_col_HWC_dims[data_col_permute[i]]; - 
} - - mluOpTensorDescriptor_t data_col_HWC_desc_tmp; - mluOpTensorDescriptor_t data_col_CHW_desc_tmp; - MLUOP_CHECK(mluOpCreateTensorDescriptor(&data_col_HWC_desc_tmp)); - MLUOP_CHECK(mluOpCreateTensorDescriptor(&data_col_CHW_desc_tmp)); - - PARAM_CHECK("[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(data_col_HWC_desc_tmp, - MLUOP_LAYOUT_ARRAY, input_dtype, - data_col_dim, data_col_HWC_dims)); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == - mluOpSetTensorDescriptor(data_col_CHW_desc_tmp, - MLUOP_LAYOUT_ARRAY, input_dtype, - data_col_dim, data_col_CHW_dims)); - - PARAM_CHECK( - "[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == - transposeTensor(handle, data_col_HWC_desc_tmp, data_col_workspace, - data_col_permute, data_col_CHW_desc_tmp, data_col, - transpose_workspace)); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == - mluOpDestroyTensorDescriptor(data_col_HWC_desc_tmp)); - PARAM_CHECK("[mluOpMaskedIm2colForward]", - MLUOP_STATUS_SUCCESS == - mluOpDestroyTensorDescriptor(data_col_CHW_desc_tmp)); - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/masked_im2col_forward/masked_im2col_forward.h b/kernels/masked_im2col_forward/masked_im2col_forward.h deleted file mode 100644 index 30440106a..000000000 --- a/kernels/masked_im2col_forward/masked_im2col_forward.h +++ /dev/null @@ -1,36 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
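The deleted `mluOpMaskedIm2colForward` host code above derives every scratch shape by indexing the source dims with a permute array (NCHW to NHWC for `feature` before the kernel, and the inverse for `data_col` afterwards). A minimal sketch of that bookkeeping, assuming 4-D shapes (the helper name is illustrative):

```cpp
#include <array>

// out[i] = dims[permute[i]], as in the loops of the deleted code.
// Example: an NCHW feature (1, C, H, W) with permute {0, 2, 3, 1}
// yields the NHWC shape (1, H, W, C) that the kernel consumes.
std::array<int, 4> PermuteDims(const std::array<int, 4> &dims,
                               const std::array<int, 4> &permute) {
  std::array<int, 4> out{};
  for (int i = 0; i < 4; ++i) {
    out[i] = dims[permute[i]];
  }
  return out;
}
```

Applying the inverse permute `{0, 3, 1, 2}` restores NCHW, which is why the same pattern appears once per direction in the host code.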
- *************************************************************************/ -#ifndef KERNELS_MASKED_IM2COL_FORWARD_MASKED_IM2COL_FORWARD_H_ -#define KERNELS_MASKED_IM2COL_FORWARD_MASKED_IM2COL_FORWARD_H_ - -#include "mlu_op.h" - -// declare func -mluOpStatus_t MLUOP_WIN_API KernelMaskedIm2colForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const mluOpDataType_t data_dtype, const void *feature, const int height, - const int width, const int channels, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const void *mask_h_idx, - const void *mask_w_idx, const int mask_cnt, void *data_col); - -#endif // KERNELS_MASKED_IM2COL_FORWARD_MASKED_IM2COL_FORWARD_H_ diff --git a/kernels/masked_im2col_forward/masked_im2col_forward_union1.mlu b/kernels/masked_im2col_forward/masked_im2col_forward_union1.mlu deleted file mode 100644 index 3d4203227..000000000 --- a/kernels/masked_im2col_forward/masked_im2col_forward_union1.mlu +++ /dev/null @@ -1,100 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
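The union1 kernel deleted just below copies one clipped kernel window per mask point: the `(kernel_h, kernel_w)` window anchored at `(h_col - pad_h, w_col - pad_w)` is intersected with the image bounds before any channel vectors are moved. A sketch of that clipping arithmetic under the same conventions (the struct and function names are illustrative):

```cpp
#include <algorithm>

// Inclusive bounds of the kernel window after clipping to
// [0, height) x [0, width), matching the checks in the kernel below.
struct ValidWindow {
  int h_begin, h_end, w_begin, w_end;
  bool empty;
};

ValidWindow ClipWindow(int h_col, int w_col, int kernel_h, int kernel_w,
                       int pad_h, int pad_w, int height, int width) {
  const int h_start = h_col - pad_h;
  const int w_start = w_col - pad_w;
  const int h_end = h_start + kernel_h - 1;
  const int w_end = w_start + kernel_w - 1;
  ValidWindow v{std::max(0, h_start), std::min(height - 1, h_end),
                std::max(0, w_start), std::min(width - 1, w_end), false};
  v.empty = (h_start >= height || w_start >= width || h_end < 0 || w_end < 0);
  return v;
}
```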
- *************************************************************************/ -#include "kernels/masked_im2col_forward/masked_im2col_forward.h" - -#include "core/logging.h" -#include "kernels/utils/common.h" - -template -__mlu_func__ void MLUMultiKernelMaskedIm2colForward( - const T *feature, const int height, const int width, const int channels, - const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, - const int32_t *mask_h_idx, const int32_t *mask_w_idx, const int mask_cnt, - T *data_col) { - for (int index = taskId; index < mask_cnt; index += taskDim) { - const int h_col = mask_h_idx[index]; - const int w_col = mask_w_idx[index]; - const int h_offset = h_col - pad_h; - const int w_offset = w_col - pad_w; - int h_start = h_offset; - int h_end = h_offset + kernel_h - 1; - int w_start = w_offset; - int w_end = w_start + kernel_w - 1; - if (h_start >= height || w_start >= width || h_end < 0 || w_end < 0) { - continue; - } else { - int h_start_valid = __mluop_max(0, h_start); - int h_end_valid = __mluop_min(height - 1, h_end); - int w_start_valid = __mluop_max(0, w_start); - int w_end_valid = __mluop_min(width - 1, w_end); - __memcpy( - data_col + index * kernel_h * kernel_w * channels + - ((h_start_valid - h_start) * kernel_w + - (w_start_valid - w_start)) * - channels, - feature + h_start_valid * width * channels + w_start_valid * channels, - (w_end_valid - w_start_valid + 1) * channels * sizeof(T), GDRAM2GDRAM, - kernel_w * channels * sizeof(T), width * channels * sizeof(T), - h_end_valid - h_start_valid); - } - } -} - -__mlu_entry__ void MLUUnion1MaskedIm2colForward( - const mluOpDataType_t data_dtype, const void *feature, const int height, - const int width, const int channels, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const void *mask_h_idx, - const void *mask_w_idx, const int mask_cnt, void *data_col) { - if (__is_mpu()) { - return; - } - - switch (data_dtype) { - case MLUOP_DTYPE_HALF: { - MLUMultiKernelMaskedIm2colForward( - (half *)feature, height, width, channels, kernel_h, kernel_w, pad_h, - pad_w, (int32_t *)mask_h_idx, (int32_t *)mask_w_idx, mask_cnt, - (half *)data_col); - }; break; - case MLUOP_DTYPE_FLOAT: { - MLUMultiKernelMaskedIm2colForward( - (float *)feature, height, width, channels, kernel_h, kernel_w, pad_h, - pad_w, (int32_t *)mask_h_idx, (int32_t *)mask_w_idx, mask_cnt, - (float *)data_col); - }; break; - default: - break; - } -} - -mluOpStatus_t MLUOP_WIN_API KernelMaskedIm2colForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const mluOpDataType_t data_dtype, const void *feature, const int height, - const int width, const int channels, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const void *mask_h_idx, - const void *mask_w_idx, const int mask_cnt, void *data_col) { - KERNEL_CHECK(MLUUnion1MaskedIm2colForward<<>>( - data_dtype, feature, height, width, channels, kernel_h, kernel_w, pad_h, - pad_w, mask_h_idx, mask_w_idx, mask_cnt, data_col)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp b/kernels/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp deleted file mode 100644 index b270359aa..000000000 --- a/kernels/moe_dispatch_backward_data/moe_dispatch_backward_data.cpp +++ /dev/null @@ -1,220 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "moe_dispatch_backward_data.h" - -#include - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" -#include "kernels/utils/cnnl_helper.h" - -// policy function -static void PolicyFunc(const mluOpHandle_t handle, cnrtDim3_t *k_dim, - cnrtFunctionType_t *k_type) { - // union1 policy func - *k_type = CNRT_FUNC_TYPE_UNION1; - // dimx equals to num of MLU Cores in each cluster - k_dim->x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - // dimy equals to num of current available clusters - k_dim->y = mluop::runtime::getClusterLimitCapability(handle); - k_dim->z = 1; -} - -mluOpStatus_t MLUOP_WIN_API mluOpMoeDispatchBackwardData( - mluOpHandle_t handle, const mluOpTensorDescriptor_t gates_desc, - const void *gates, const mluOpTensorDescriptor_t indices_desc, - const void *indices, const mluOpTensorDescriptor_t locations_desc, - const void *locations, const mluOpTensorDescriptor_t dispatch_desc, - const void *dispatch, const int samples, const int capacity, - const int hidden, const int num_experts, - const mluOpTensorDescriptor_t grad_input_desc, void *grad_input) { - // gates: (samples) - // indices: (samples) - // locations: (samples) - // dispatch: (num_experts * capacity, hidden) - // grad_input: (samples, hidden) - - const std::string API = "[mluOpMoeDispatchBackwardData]"; - // check desc - PARAM_CHECK(API, handle != NULL); - // check arch - if (handle->arch < MLUOP_MLU370) { - LOG(ERROR) << API - << "The operator does not match the current architecture."; - return MLUOP_STATUS_ARCH_MISMATCH; - } - PARAM_CHECK(API, gates_desc != NULL); - PARAM_CHECK(API, indices_desc != NULL); - PARAM_CHECK(API, locations_desc != NULL); - PARAM_CHECK(API, dispatch_desc != NULL); - PARAM_CHECK(API, grad_input_desc != NULL); - - // check dim - PARAM_CHECK_EQ(API, gates_desc->dim, 1); - PARAM_CHECK_EQ(API, indices_desc->dim, 1); - PARAM_CHECK_EQ(API, locations_desc->dim, 1); - PARAM_CHECK_EQ(API, dispatch_desc->dim, 2); - PARAM_CHECK_EQ(API, grad_input_desc->dim, 2); - - // check shape - PARAM_CHECK_EQ(API, gates_desc->dims[0], samples); - PARAM_CHECK_EQ(API, indices_desc->dims[0], samples); - PARAM_CHECK_EQ(API, locations_desc->dims[0], samples); - PARAM_CHECK_EQ(API, dispatch_desc->dims[0], (num_experts * capacity)); - 
PARAM_CHECK_EQ(API, dispatch_desc->dims[1], hidden); - PARAM_CHECK_EQ(API, grad_input_desc->dims[0], samples); - PARAM_CHECK_EQ(API, grad_input_desc->dims[1], hidden); - - // check dtype - PARAM_CHECK_V2(API, (gates_desc->dtype == MLUOP_DTYPE_FLOAT), - "Only float are supported in input tensor, but the " - "data type of tensor is " - << mluOpGetNameOfDataType(gates_desc->dtype) << "."); - PARAM_CHECK_V2(API, (indices_desc->dtype == MLUOP_DTYPE_INT32), - "Only int32 are supported in indices tensor, but the data " - "type of tensor is " - << mluOpGetNameOfDataType(indices_desc->dtype) << "."); - PARAM_CHECK_V2(API, (locations_desc->dtype == MLUOP_DTYPE_INT32), - "Only int32 are supported in locations tensor, but the data " - "type of tensor is " - << mluOpGetNameOfDataType(locations_desc->dtype) << "."); - PARAM_CHECK(API, dispatch_desc->dtype == gates_desc->dtype); - PARAM_CHECK(API, grad_input_desc->dtype == gates_desc->dtype); - - // check tensor dim - PARAM_CHECK(API, samples >= 0); - PARAM_CHECK(API, capacity >= 0); - PARAM_CHECK(API, hidden >= 0); - PARAM_CHECK(API, num_experts >= 0); - - const uint64_t gates_element_num = mluOpGetTensorElementNum(gates_desc); - const uint64_t indices_element_num = mluOpGetTensorElementNum(indices_desc); - const uint64_t locations_element_num = - mluOpGetTensorElementNum(locations_desc); - const uint64_t dispatch_element_num = mluOpGetTensorElementNum(dispatch_desc); - const uint64_t grad_input_element_num = - mluOpGetTensorElementNum(grad_input_desc); - - // check large tensor - TENSOR_NUM_CHECK(API, gates_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(API, indices_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(API, locations_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(API, dispatch_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(API, grad_input_element_num, LARGE_TENSOR_NUM, ""); - - // check zero element - if (samples == 0 || hidden == 0) { - VLOG(5) << API << "Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } else { - // Initialize output space - PARAM_CHECK(API, grad_input != NULL); - const size_t grad_input_initial_value = 0x00; - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_input_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, - &grad_input_initial_value, cnnl_output_desc, - grad_input)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - VLOG(5) << API << "Initialize output tensor done."; - } - - // check zero element - if (capacity == 0 || num_experts == 0) { - VLOG(5) << API << "Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - - // check ptr - PARAM_CHECK(API, gates != NULL); - PARAM_CHECK(API, indices != NULL); - PARAM_CHECK(API, locations != NULL); - PARAM_CHECK(API, dispatch != NULL); - PARAM_CHECK(API, grad_input != NULL); - - VLOG(5) << API << "input data shape: " - << "samples = " << samples << ", " - << "capacity = " << capacity << ", " - << "hidden = " << hidden << ", " - << "num_experts = " << num_experts; - - // generate prototxt start! 
- if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("moe_dispatch_backward_data"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA(true, "gates", gates, gates_desc, 100, -100); - GEN_CASE_DATA_REAL(true, "indices", indices, indices_desc); - GEN_CASE_DATA_REAL(true, "locations", locations, locations_desc); - GEN_CASE_DATA(true, "dispatch", dispatch, dispatch_desc, 100, -100); - GEN_CASE_DATA(false, "grad_input", grad_input, grad_input_desc, 0, 0); - GEN_CASE_OP_PARAM_SINGLE(0, "moe_dispatch_backward_data", "samples", - samples); - GEN_CASE_OP_PARAM_SINGLE(1, "moe_dispatch_backward_data", "capacity", - capacity); - GEN_CASE_OP_PARAM_SINGLE(2, "moe_dispatch_backward_data", "hidden", hidden); - GEN_CASE_OP_PARAM_SINGLE(3, "moe_dispatch_backward_data", "num_experts", - num_experts); - GEN_CASE_TEST_PARAM_NEW(false, false, true, 0, 0, 0.0); - } - // generate prototxt end! - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - PolicyFunc(handle, &k_dim, &k_type); - - int core_num_per_cluster = - mluop::runtime::getCoreNumOfEachUnionCapability(handle); - VLOG(5) << API << "Launch Kernel <<>>" - << "core num per cluster: " << core_num_per_cluster; - - mluOpDataType_t data_type = grad_input_desc->dtype; - uint32_t taskNum = k_dim.x * k_dim.y * k_dim.z; - - if (samples <= taskNum) { - VLOG(5) << API << "Launch Kernel KernelMoeDispatchBwdData1()."; - CHECK_RETURN( - "[mluOpMoeDispatchBackwardData1]", - KernelMoeDispatchBwdData1(k_dim, k_type, handle->queue, data_type, - gates, indices, locations, dispatch, samples, - capacity, hidden, num_experts, grad_input)); - VLOG(5) << API << "Finish Kernel KernelMoeDispatchBwdData1."; - } else { - VLOG(5) << API << "Launch Kernel KernelMoeDispatchBwdData2()."; - CHECK_RETURN( - "[mluOpMoeDispatchBackwardData2]", - KernelMoeDispatchBwdData2(k_dim, k_type, handle->queue, data_type, - gates, indices, locations, dispatch, samples, - capacity, hidden, num_experts, grad_input)); - VLOG(5) << API << "Finish Kernel KernelMoeDispatchBwdData2."; - } - - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/moe_dispatch_backward_data/moe_dispatch_backward_data.h b/kernels/moe_dispatch_backward_data/moe_dispatch_backward_data.h deleted file mode 100644 index b392cc217..000000000 --- a/kernels/moe_dispatch_backward_data/moe_dispatch_backward_data.h +++ /dev/null @@ -1,42 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
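For reference, the branch deleted above selects between the two kernels by comparing `samples` with the number of launched cores: with at least one core per sample, several cores cooperate on a single sample's `hidden` vector; otherwise each core walks a contiguous span of samples. A minimal sketch of that rule plus the shape contract from the comments (the enum and function names are illustrative):

```cpp
// Shapes handled by the deleted operator:
//   gates / indices / locations : (samples)
//   dispatch                    : (num_experts * capacity, hidden)
//   grad_input                  : (samples, hidden)
enum class MoeBwdDataKernel {
  kSplitHiddenAcrossCores,   // BwdData1: samples <= taskNum
  kSplitSamplesAcrossCores,  // BwdData2: samples >  taskNum
};

MoeBwdDataKernel PickMoeBwdDataKernel(int samples, int task_num) {
  return samples <= task_num ? MoeBwdDataKernel::kSplitHiddenAcrossCores
                             : MoeBwdDataKernel::kSplitSamplesAcrossCores;
}
```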
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#ifndef KERNELS_MOE_DISPATCH_BACKWARD_DATA_MOE_DISPATCH_BACKWARD_DATA_H -#define KERNELS_MOE_DISPATCH_BACKWARD_DATA_MOE_DISPATCH_BACKWARD_DATA_H - -#include "mlu_op.h" - -mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchBwdData1( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t d_type, const void *gates, const void *indices, - const void *locations, const void *dispatch, const int samples, - const int capacity, const int hidden, const int num_experts, - void *grad_input); - -mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchBwdData2( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - mluOpDataType_t d_type, const void *gates, const void *indices, - const void *locations, const void *dispatch, const int samples, - const int capacity, const int hidden, const int num_experts, - void *grad_input); - -#endif // KERNELS_MOE_DISPATCH_BACKWARD_DATA_MOE_DISPATCH_BACKWARD_DATA_H diff --git a/kernels/moe_dispatch_backward_data/moe_dispatch_backward_data_union1.mlu b/kernels/moe_dispatch_backward_data/moe_dispatch_backward_data_union1.mlu deleted file mode 100644 index 4d2e9d08f..000000000 --- a/kernels/moe_dispatch_backward_data/moe_dispatch_backward_data_union1.mlu +++ /dev/null @@ -1,339 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
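The `lcs()` helper in the union1 kernel deleted just below overlaps DMA with compute through a three-stage load/compute/store ping-pong pipeline over two NRAM buffers. A minimal host-side sketch of that schedule, assuming the usual two-buffer rotation (names are illustrative; the real kernel fences each step with `__sync()`):

```cpp
#include <functional>

// Steady state of step i: load tile i, compute tile i-1, store tile i-2,
// alternating between buffers 0 and 1 so the copy of one tile overlaps
// the arithmetic on the previous one.
void LcsPipeline(int tiles,
                 const std::function<void(int tile, int buf)> &load,
                 const std::function<void(int tile, int buf)> &compute,
                 const std::function<void(int tile, int buf)> &store) {
  for (int step = 0; step < tiles + 2; ++step) {
    if (step < tiles) load(step, step % 2);
    if (step >= 1 && step <= tiles) compute(step - 1, (step - 1) % 2);
    if (step >= 2) store(step - 2, (step - 2) % 2);
  }
}
```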
- *************************************************************************/
-#include "moe_dispatch_backward_data.h"
-
-#include "core/logging.h"
-#include "kernels/debug.h"
-#include "kernels/kernel.h"
-#include "kernels/utils/common.h"
-
-__nram__ char nram_buffer[MAX_NRAM_SIZE];
-
-#if __BANG_ARCH__ >= 372
-template <typename T>
-static __mlu_func__ void load(T *dispatch_addr, T *nram_dispatch,
-                              const int deal_num, const int pingpong_num,
-                              const int pi) {
-  int offset = (pi % 2) * pingpong_num;
-  T *nram_dispatch_p = nram_dispatch + offset;
-  __memcpy_async(nram_dispatch_p, dispatch_addr, deal_num * sizeof(T),
-                 GDRAM2NRAM);
-}
-
-template <typename T>
-static __mlu_func__ void compute(T *nram_grad_input, T *nram_dispatch,
-                                 const T gates_value, const int deal_num,
-                                 const int pingpong_num, const int pi) {
-  int offset = (pi % 2) * pingpong_num;
-  T *nram_grad_input_p = nram_grad_input + offset;
-  T *nram_dispatch_p = nram_dispatch + offset;
-  __bang_mul_scalar(nram_grad_input_p, nram_dispatch_p, gates_value, deal_num);
-}
-
-template <typename T>
-static __mlu_func__ void store(T *grad_input_addr, T *nram_grad_input,
-                               const int deal_num, const int pingpong_num,
-                               const int pi) {
-  int offset = (pi % 2) * pingpong_num;
-  T *nram_grad_input_p = nram_grad_input + offset;
-  __memcpy_async(grad_input_addr, nram_grad_input_p, deal_num * sizeof(T),
-                 NRAM2GDRAM);
-}
-
-template <typename T>
-static __mlu_func__ void lcs(T *base_grad_input_addr, T *base_dispatch_addr,
-                             T *nram_grad_input, T *nram_dispatch,
-                             const T gates_value, const int repeat_num,
-                             const int rem_num, const int deal_num,
-                             const int pingpong_num) {
-  if (repeat_num > 0) {
-    // L[0]
-    T *dispatch_addr = base_dispatch_addr;
-    load(dispatch_addr, nram_dispatch, deal_num, pingpong_num, 0);
-    __sync();
-  }
-  if (repeat_num > 1) {
-    // L[1]
-    T *dispatch_addr = base_dispatch_addr + deal_num;
-    load(dispatch_addr, nram_dispatch, deal_num, pingpong_num, 1);
-    // C[0]
-    compute(nram_grad_input, nram_dispatch, gates_value, deal_num, pingpong_num,
-            0);
-    __sync();
-  }
-  for (int n_iter = 0; n_iter < repeat_num - 2; ++n_iter) {
-    // S[n_iter]
-    T *grad_input_addr = base_grad_input_addr + n_iter * deal_num;
-    store(grad_input_addr, nram_grad_input, deal_num, pingpong_num, n_iter);
-    // L[n_iter + 2]
-    T *dispatch_addr = base_dispatch_addr + (n_iter + 2) * deal_num;
-    load(dispatch_addr, nram_dispatch, deal_num, pingpong_num, n_iter + 2);
-    // C[n_iter + 1]
-    compute(nram_grad_input, nram_dispatch, gates_value, deal_num, pingpong_num,
-            n_iter + 1);
-    __sync();
-  }
-  if (repeat_num >= 2) {
-    // S[repeat_num - 2]
-    T *grad_input_addr = base_grad_input_addr + (repeat_num - 2) * deal_num;
-    store(grad_input_addr, nram_grad_input, deal_num, pingpong_num,
-          repeat_num - 2);
-  }
-  if (rem_num > 0) {
-    // L[repeat_num]
-    T *dispatch_addr = base_dispatch_addr + repeat_num * deal_num;
-    load(dispatch_addr, nram_dispatch, rem_num, pingpong_num, repeat_num);
-  }
-  if (repeat_num > 0) {
-    // C[repeat_num - 1]
-    compute(nram_grad_input, nram_dispatch, gates_value, deal_num, pingpong_num,
-            repeat_num - 1);
-  }
-  __sync();
-  if (repeat_num > 0) {
-    // S[repeat_num - 1]
-    T *grad_input_addr = base_grad_input_addr + (repeat_num - 1) * deal_num;
-    store(grad_input_addr, nram_grad_input, deal_num, pingpong_num,
-          repeat_num - 1);
-  }
-  if (rem_num > 0) {
-    // C[repeat_num]
-    compute(nram_grad_input, nram_dispatch, gates_value, rem_num, pingpong_num,
-            repeat_num);
-    __sync();
-    // S[repeat_num]
-    T *grad_input_addr = base_grad_input_addr + repeat_num * deal_num;
-    store(grad_input_addr, nram_grad_input, rem_num, pingpong_num, repeat_num);
-  }
-}
-#endif
-
-template <typename T>
-__mlu_entry__ void MLUKernelMoeDispatchBwdData1(
-    const T *gates, const int *indices, const int *locations, const T *dispatch,
-    const int samples, const int capacity, const int hidden,
-    const int num_experts, T *grad_input) {
-  // gates: (samples)
-  // indices: (samples)
-  // locations: (samples)
-  // dispatch: (num_experts * capacity, hidden)
-  // grad_input: (samples, hidden)
-#if __BANG_ARCH__ >= 372
-  if (__is_mpu()) {
-    return;
-  }
-  int one_sample_task_num = taskDim / samples;
-  int rem_task = taskDim % samples;
-  int sample_idx = 0;
-  if ((rem_task > 0) && (taskId < (one_sample_task_num + 1) * rem_task)) {
-    sample_idx = (int)(taskId / (one_sample_task_num + 1));
-    one_sample_task_num = one_sample_task_num + 1;
-  } else {
-    sample_idx = (int)((taskId - rem_task) / one_sample_task_num);
-  }
-  int indices_value = indices[sample_idx];
-  int location_value = locations[sample_idx];
-  if (indices_value < 0 || indices_value >= num_experts || location_value < 0 ||
-      location_value >= capacity) {
-    return;
-  }
-  T gates_si_value = gates[sample_idx];
-  int logic_tid = taskId % one_sample_task_num;
-  int hidden_per_task = hidden / one_sample_task_num;
-  int rem_hidden_num = hidden % one_sample_task_num;
-  int hidden_seg_num = hidden_per_task + (int)(logic_tid < rem_hidden_num);
-  if (hidden_seg_num == 0) {
-    return;
-  }
-  int hidden_data_offset =
-      logic_tid * hidden_per_task +
-      ((logic_tid < rem_hidden_num) ? logic_tid : rem_hidden_num);
-  // | nram space partition    | data num |
-  // | ------------------------ | -------- |
-  // | nram_grad_input ping     | deal_h   |
-  // | nram_dispatch ping       | deal_h   |
-  // | nram_grad_input pong     | deal_h   |
-  // | nram_dispatch pong       | deal_h   |
-  const int max_nram_num = MAX_NRAM_SIZE / sizeof(T);
-  const int deal_h = max_nram_num / 4;
-  const int pingpong_num = 2 * deal_h;
-  T *nram_grad_input = (T *)nram_buffer;
-  T *nram_dispatch = nram_grad_input + deal_h;
-  int grad_input_addr_offset = sample_idx * hidden + hidden_data_offset;
-  T *base_grad_input_addr = (T *)grad_input + grad_input_addr_offset;
-  int dispatch_idx_offset =
-      (indices_value * capacity + location_value) * hidden;
-  T *base_dispatch_addr =
-      (T *)dispatch + dispatch_idx_offset + hidden_data_offset;
-  int repeat_h = hidden_seg_num / deal_h;
-  int rem_h = hidden_seg_num % deal_h;
-  lcs(base_grad_input_addr, base_dispatch_addr, nram_grad_input, nram_dispatch,
-      gates_si_value, repeat_h, rem_h, deal_h, pingpong_num);
-#endif
-}
-
-template <typename T>
-__mlu_entry__ void MLUKernelMoeDispatchBwdData2(
-    const T *gates, const int *indices, const int *locations, const T *dispatch,
-    const int samples, const int capacity, const int hidden,
-    const int num_experts, T *grad_input) {
-  // gates: (samples)
-  // indices: (samples)
-  // locations: (samples)
-  // dispatch: (num_experts * capacity, hidden)
-  // grad_input: (samples, hidden)
-#if __BANG_ARCH__ >= 372
-  if (__is_mpu()) {
-    return;
-  }
-  int per_task_sample_num = samples / taskDim;
-  int rem_sample_num = samples % taskDim;
-  int samples_num = per_task_sample_num + (int)((taskId < rem_sample_num));
-  int sample_idx = taskId * per_task_sample_num +
-                   ((taskId < rem_sample_num) ? taskId : rem_sample_num);
-  int max_deal_h =
-      (MAX_NRAM_SIZE - 4 * sizeof(int) - 1 * sizeof(T)) / 2 / sizeof(T);
-  int deal_h = 0;
-  int deal_s = 0;
-  if (hidden > max_deal_h) {
-    deal_s = 1;
-    deal_h = max_deal_h;
-  } else {
-    deal_h = hidden;
-    deal_s = (MAX_NRAM_SIZE - 2 * deal_h * sizeof(T)) /
-             (1 * sizeof(T) + 4 * sizeof(int));
-  }
-  // | nram space partition     | data num |
-  // | ------------------------ | -------- |
-  // | nram_gates               | deal_s   |
-  // | nram_dispatch_idx_offset | deal_s   |
-  // | nram_mask                | deal_s   |
-  // | nram_indices             | deal_s   |
-  // | nram_locations           | deal_s   |
-  // | nram_grad_input          | deal_h   |
-  // | nram_dispatch            | deal_h   |
-  T *nram_gates = (T *)nram_buffer;
-  int *nram_dispatch_idx_offset = (int *)(nram_gates + deal_s);
-  int *nram_mask = nram_dispatch_idx_offset + deal_s;
-  int *nram_indices = nram_mask + deal_s;
-  int *nram_locations = nram_indices + deal_s;
-  T *nram_grad_input = (T *)(nram_locations + deal_s);
-  T *nram_dispatch = nram_grad_input + deal_h;
-  int repeat_s = samples_num / deal_s;
-  int rem_s = samples_num % deal_s;
-  int repeat_h = hidden / deal_h;
-  int rem_h = hidden % deal_h;
-  // get gdram input gates indices locations offset
-  T *base_gates = (T *)gates + sample_idx;
-  int *base_indices = (int *)indices + sample_idx;
-  int *base_locations = (int *)locations + sample_idx;
-  // get gdram output grad_input offset
-  int grad_input_offset = sample_idx * hidden;
-  T *base_grad_input = (T *)grad_input + grad_input_offset;
-  for (int s_iter = 0; s_iter <= repeat_s; ++s_iter) {
-    int deal_s_num = (s_iter == repeat_s) ? rem_s : deal_s;
-    if (deal_s_num == 0) {
-      break;
-    }
-    // load gates indices locations
-    T *base_gates_s = base_gates + s_iter * deal_s;
-    int *base_indices_s = base_indices + s_iter * deal_s;
-    int *base_locations_s = base_locations + s_iter * deal_s;
-    __memcpy(nram_gates, base_gates_s, deal_s_num * sizeof(T), GDRAM2NRAM);
-    __memcpy(nram_indices, base_indices_s, deal_s_num * sizeof(int),
-             GDRAM2NRAM);
-    __memcpy(nram_locations, base_locations_s, deal_s_num * sizeof(int),
-             GDRAM2NRAM);
-    // dispatch idx = (nram_indices * capacity + nram_locations) * hidden
-    __bang_mul_scalar(nram_dispatch_idx_offset, nram_indices, capacity,
-                      deal_s_num);
-    __bang_add(nram_dispatch_idx_offset, nram_dispatch_idx_offset,
-               nram_locations, deal_s_num);
-    __bang_mul_scalar(nram_dispatch_idx_offset, nram_dispatch_idx_offset,
-                      hidden, deal_s_num);
-    // 0 <= nram_locations < capacity
-    __bang_ge_scalar(nram_mask, nram_locations, (int)0, deal_s_num);
-    __bang_lt_scalar(nram_locations, nram_locations, capacity, deal_s_num);
-    __bang_and(nram_locations, nram_locations, nram_mask, deal_s_num);
-    // 0 <= nram_indices < num_experts
-    __bang_ge_scalar(nram_mask, nram_indices, (int)0, deal_s_num);
-    __bang_lt_scalar(nram_indices, nram_indices, num_experts, deal_s_num);
-    __bang_and(nram_indices, nram_indices, nram_mask, deal_s_num);
-    __bang_and(nram_mask, nram_indices, nram_locations, deal_s_num);
-    // get output grad_input s offset
-    T *base_grad_input_s = base_grad_input + s_iter * deal_s * hidden;
-    for (int si = 0; si < deal_s_num; ++si) {
-      if (nram_mask[si] != 1) {
-        continue;
-      }
-      T *base_dispatch_si = (T *)dispatch + nram_dispatch_idx_offset[si];
-      T *base_grad_input_s_si = base_grad_input_s + si * hidden;
-      for (int h_iter = 0; h_iter <= repeat_h; ++h_iter) {
-        int deal_h_num = (h_iter == repeat_h) ? rem_h : deal_h;
-        if (deal_h_num == 0) {
-          break;
-        }
-        // get input dispatch h offset
-        T *base_dispatch_si_h = base_dispatch_si + h_iter * deal_h;
-        // get output grad_input s h offset
-        T *base_grad_input_s_si_h = base_grad_input_s_si + h_iter * deal_h;
-        __memcpy(nram_dispatch, base_dispatch_si_h, deal_h_num * sizeof(T),
-                 GDRAM2NRAM);
-        __bang_mul_scalar(nram_grad_input, nram_dispatch, nram_gates[si],
-                          deal_h_num);
-        // store grad_input
-        __memcpy(base_grad_input_s_si_h, nram_grad_input,
-                 deal_h_num * sizeof(T), NRAM2GDRAM);
-      }  // repeat h
-    }    // repeat deal_s_num
-  }      // repeat s
-#endif
-}
-
-mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchBwdData1(
-    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-    mluOpDataType_t d_type, const void *gates, const void *indices,
-    const void *locations, const void *dispatch, const int samples,
-    const int capacity, const int hidden, const int num_experts,
-    void *grad_input) {
-  /* Only float data type is supported in host-side CPP file
-     fool-proof processing.*/
-  KERNEL_CHECK(MLUKernelMoeDispatchBwdData1<float><<<k_dim, k_type, queue>>>(
-      (float *)gates, (int *)indices, (int *)locations, (float *)dispatch,
-      samples, capacity, hidden, num_experts, (float *)grad_input));
-  return MLUOP_STATUS_SUCCESS;
-}
-
-mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchBwdData2(
-    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-    mluOpDataType_t d_type, const void *gates, const void *indices,
-    const void *locations, const void *dispatch, const int samples,
-    const int capacity, const int hidden, const int num_experts,
-    void *grad_input) {
-  /* Only float data type is supported in host-side CPP file
-     fool-proof processing.*/
-  KERNEL_CHECK(MLUKernelMoeDispatchBwdData2<float><<<k_dim, k_type, queue>>>(
-      (float *)gates, (int *)indices, (int *)locations, (float *)dispatch,
-      samples, capacity, hidden, num_experts, (float *)grad_input));
-  return MLUOP_STATUS_SUCCESS;
-} diff --git a/kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp b/kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp deleted file mode 100644 index cbcd6c21f..000000000 --- a/kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate.cpp +++ /dev/null @@ -1,260 +0,0 @@
-/*************************************************************************
- * Copyright (C) [2022] by Cambricon, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
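The taskId-to-sample decomposition used by MLUKernelMoeDispatchBwdData1 above (and reused by the gate kernel further down) hands each sample a contiguous group of tasks, with the first taskDim % samples samples receiving one extra task. The following is a minimal host-side C++ model of that arithmetic for checking the mapping; TaskSplit and decompose_task are illustrative names, not identifiers from this patch.

#include <cstdio>

// Host model of the mapping: when taskDim >= samples, the first
// rem_task = taskDim % samples samples own (taskDim / samples) + 1 tasks,
// and the remaining samples own taskDim / samples tasks.
struct TaskSplit {
  int sample_idx;  // sample this task works on
  int group_size;  // number of tasks cooperating on that sample
  int logic_tid;   // rank of this task inside its group
};

static TaskSplit decompose_task(int taskId, int taskDim, int samples) {
  int one_sample_task_num = taskDim / samples;
  int rem_task = taskDim % samples;
  int sample_idx;
  if (rem_task > 0 && taskId < (one_sample_task_num + 1) * rem_task) {
    sample_idx = taskId / (one_sample_task_num + 1);
    one_sample_task_num += 1;
  } else {
    sample_idx = (taskId - rem_task) / one_sample_task_num;
  }
  return {sample_idx, one_sample_task_num, taskId % one_sample_task_num};
}

int main() {
  // Example: 10 tasks over 4 samples -> samples 0 and 1 get 3 tasks each,
  // samples 2 and 3 get 2 tasks each.
  for (int tid = 0; tid < 10; ++tid) {
    TaskSplit ts = decompose_task(tid, 10, 4);
    std::printf("task %d -> sample %d (group %d, rank %d)\n", tid,
                ts.sample_idx, ts.group_size, ts.logic_tid);
  }
  return 0;
}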
- *************************************************************************/
-#include "moe_dispatch_backward_gate.h"
-
-#include <string>
-
-#include "core/context.h"
-#include "core/gen_case.h"
-#include "core/logging.h"
-#include "core/runtime/device.h"
-#include "core/tensor.h"
-#include "core/type.h"
-#include "kernels/kernel.h"
-#include "kernels/utils/cnnl_helper.h"
-
-static void policyFunc(const mluOpHandle_t handle, const int samples,
-                       cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) {
-  int max_core_num = mluop::runtime::getCoreNumOfJobLimitCapability(handle);
-  k_dim->x = max_core_num;
-  k_dim->y = 1;
-  k_dim->z = 1;
-  if (samples > max_core_num) {
-    *k_type = CNRT_FUNC_TYPE_UNION1;
-  } else {
-    *k_type = mluop::runtime::getJobLimitCapabilityCnrtFuncType(handle);
-  }
-}
-
-mluOpStatus_t MLUOP_WIN_API mluOpGetMoeDispatchBackwardGateWorkspaceSize(
-    mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc,
-    size_t *workspace_size) {
-  PARAM_CHECK("[mluOpMoeDispatchBackwardGate]", handle != NULL);
-  // platform check
-  if (handle->arch < MLUOP_MLU370) {
-    LOG(ERROR) << "[mluOpMoeDispatchBackwardGate] Only mlu300 and above "
-                  "devices are supported. "
-               << "Please check the device version!";
-    return MLUOP_STATUS_ARCH_MISMATCH;
-  }
-  PARAM_CHECK("[mluOpMoeDispatchBackwardGate]", input_desc != NULL);
-  PARAM_CHECK("[mluOpMoeDispatchBackwardGate]", workspace_size != NULL);
-
-  int samples = input_desc->dims[0];
-  *workspace_size = 0;
-  cnrtDim3_t k_dim;
-  cnrtFunctionType_t k_type;
-  policyFunc(handle, samples, &k_dim, &k_type);
-  int taskNum = k_dim.x * k_dim.y * k_dim.z;
-  if ((samples > 0) && (samples < taskNum)) {
-    *workspace_size = taskNum * mluop::getSizeOfDataType(input_desc->dtype);
-  }
-
-  return MLUOP_STATUS_SUCCESS;
-}
-
-static mluOpStatus_t moeDispatchBackwardGateParamCheck(
-    const std::string &op_name, const mluOpHandle_t handle,
-    const mluOpTensorDescriptor_t indices_desc, const void *indices,
-    const mluOpTensorDescriptor_t locations_desc, const void *locations,
-    const mluOpTensorDescriptor_t input_desc, const void *input,
-    const mluOpTensorDescriptor_t dispatch_desc, const void *dispatch,
-    const int samples, const int capacity, const int hidden,
-    const int num_experts, void *workspace, const size_t workspace_size,
-    const mluOpTensorDescriptor_t grad_gates_desc, const void *grad_gates,
-    bool *zero_element) {
-  // check descriptor and data
-  PARAM_CHECK(op_name, handle != NULL);
-  // platform check
-  if (handle->arch < MLUOP_MLU370) {
-    LOG(ERROR) << op_name << "Only mlu300 and above devices are supported.
" - << "Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - - PARAM_CHECK(op_name, indices_desc != NULL); - PARAM_CHECK(op_name, locations_desc != NULL); - PARAM_CHECK(op_name, input_desc != NULL); - PARAM_CHECK(op_name, dispatch_desc != NULL); - PARAM_CHECK(op_name, grad_gates_desc != NULL); - - // check shape - PARAM_CHECK(op_name, indices_desc->dim == 1); - PARAM_CHECK(op_name, locations_desc->dim == 1); - PARAM_CHECK(op_name, input_desc->dim == 2); - PARAM_CHECK(op_name, dispatch_desc->dim == 2); - PARAM_CHECK(op_name, grad_gates_desc->dim == 1); - - // check data type - PARAM_CHECK_V2(op_name, (indices_desc->dtype == MLUOP_DTYPE_INT32), - "Only int32 are supported in indices tensor, but the data " - "type of tensor is " - << mluOpGetNameOfDataType(indices_desc->dtype) << "."); - PARAM_CHECK_V2(op_name, (locations_desc->dtype == MLUOP_DTYPE_INT32), - "Only int32 are supported in locations tensor, but the data " - "type of tensor is " - << mluOpGetNameOfDataType(locations_desc->dtype) << "."); - - // check tensor datatype, support float32 - PARAM_CHECK_V2(op_name, (input_desc->dtype == MLUOP_DTYPE_FLOAT), - "Only float are supported in input tensor, but the " - "data type of tensor is " - << mluOpGetNameOfDataType(input_desc->dtype) << "."); - PARAM_CHECK(op_name, input_desc->dtype == dispatch_desc->dtype); - PARAM_CHECK(op_name, input_desc->dtype == grad_gates_desc->dtype); - - // check dim - PARAM_CHECK(op_name, samples >= 0); - PARAM_CHECK(op_name, capacity >= 0); - PARAM_CHECK(op_name, hidden >= 0); - PARAM_CHECK(op_name, num_experts >= 0); - PARAM_CHECK(op_name, (samples == indices_desc->dims[0])); - PARAM_CHECK(op_name, (samples == locations_desc->dims[0])); - PARAM_CHECK(op_name, (samples == input_desc->dims[0])); - PARAM_CHECK(op_name, (samples == grad_gates_desc->dims[0])); - PARAM_CHECK(op_name, ((num_experts * capacity) == dispatch_desc->dims[0])); - PARAM_CHECK(op_name, (hidden == input_desc->dims[1])); - PARAM_CHECK(op_name, (hidden == dispatch_desc->dims[1])); - - const size_t indices_element_num = mluOpGetTensorElementNum(indices_desc); - const size_t locations_element_num = mluOpGetTensorElementNum(locations_desc); - const size_t input_element_num = mluOpGetTensorElementNum(input_desc); - const size_t dispatch_element_num = mluOpGetTensorElementNum(dispatch_desc); - const size_t grad_gates_element_num = - mluOpGetTensorElementNum(grad_gates_desc); - - // check large tensor - TENSOR_NUM_CHECK(op_name, indices_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(op_name, locations_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(op_name, input_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(op_name, dispatch_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(op_name, grad_gates_element_num, LARGE_TENSOR_NUM, ""); - - // check element num zero - if (indices_element_num == 0 || locations_element_num == 0 || - input_element_num == 0 || dispatch_element_num == 0 || - grad_gates_element_num == 0) { - *zero_element = true; - return MLUOP_STATUS_SUCCESS; - } - - // check workspace ptr - if (workspace_size > 0) { - PARAM_CHECK(op_name, workspace != NULL); - } - - // input and output ptr check null - PARAM_CHECK(op_name, indices != NULL); - PARAM_CHECK(op_name, locations != NULL); - PARAM_CHECK(op_name, input != NULL); - PARAM_CHECK(op_name, dispatch != NULL); - PARAM_CHECK(op_name, grad_gates != NULL); - - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpMoeDispatchBackwardGate( - mluOpHandle_t handle, const 
mluOpTensorDescriptor_t indices_desc, - const void *indices, const mluOpTensorDescriptor_t locations_desc, - const void *locations, const mluOpTensorDescriptor_t input_desc, - const void *input, const mluOpTensorDescriptor_t dispatch_desc, - const void *dispatch, const int samples, const int capacity, - const int hidden, const int num_experts, void *workspace, - const size_t workspace_size, const mluOpTensorDescriptor_t grad_gates_desc, - void *grad_gates) { - // check params - bool zero_element = false; - mluOpStatus_t param_check = moeDispatchBackwardGateParamCheck( - "[mluOpMoeDispatchBackwardGate]", handle, indices_desc, indices, - locations_desc, locations, input_desc, input, dispatch_desc, dispatch, - samples, capacity, hidden, num_experts, workspace, workspace_size, - grad_gates_desc, grad_gates, &zero_element); - if (param_check != MLUOP_STATUS_SUCCESS) { - return param_check; - } - - // check zero element - if (zero_element == true) { - VLOG(5) << "[mluOpMoeDispatchBackwardGate] Skip zero element tensor."; - if (samples > 0) { - VLOG(5) << "cnnlFill_v3 start."; - const size_t fill_value = 0x0; - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_gates_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, grad_gates)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - VLOG(5) << "cnnlFill_v3 end."; - } - return MLUOP_STATUS_SUCCESS; - } - - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("moe_dispatch_backward_gate"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA_REAL(true, "indices", indices, indices_desc); - GEN_CASE_DATA_REAL(true, "locations", locations, locations_desc); - GEN_CASE_DATA(true, "input", input, input_desc, 0, 0); - GEN_CASE_DATA(true, "dispatch", dispatch, dispatch_desc, 0, 0); - GEN_CASE_DATA(false, "grad_gates", grad_gates, grad_gates_desc, 0, 0); - GEN_CASE_OP_PARAM_SINGLE(0, "moe_dispatch_backward_gate", "samples", - samples); - GEN_CASE_OP_PARAM_SINGLE(1, "moe_dispatch_backward_gate", "capacity", - capacity); - GEN_CASE_OP_PARAM_SINGLE(2, "moe_dispatch_backward_gate", "hidden", hidden); - GEN_CASE_OP_PARAM_SINGLE(3, "moe_dispatch_backward_gate", "num_experts", - num_experts); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); - } - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - policyFunc(handle, samples, &k_dim, &k_type); - VLOG(5) << "Launch Kernel mluOpMoeDispatchBackwardGate<<>>"; - mluOpDataType_t data_type = input_desc->dtype; - uint32_t taskNum = k_dim.x * k_dim.y * k_dim.z; - if (samples <= taskNum) { - VLOG(5) << "[mluOpMoeDispatchBackwardGate] launch " - "KernelMoeDispatchBwdGate1"; - CHECK_RETURN("[mluOpMoeDispatchBackwardGate1]", - KernelMoeDispatchBwdGate1(k_dim, k_type, handle->queue, - data_type, indices, locations, input, - dispatch, samples, capacity, hidden, - num_experts, workspace, grad_gates)); - } else { - VLOG(5) << "[mluOpMoeDispatchBackwardGate] launch " - "KernelMoeDispatchBwdGate2"; - CHECK_RETURN( - "[mluOpMoeDispatchBackwardGate2]", - KernelMoeDispatchBwdGate2(k_dim, k_type, handle->queue, data_type, - indices, locations, input, dispatch, samples, - capacity, hidden, num_experts, grad_gates)); - } - - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate.h b/kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate.h deleted file mode 100644 index b3429884d..000000000 --- 
a/kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate.h +++ /dev/null @@ -1,42 +0,0 @@
-/*************************************************************************
- * Copyright (C) [2022] by Cambricon, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *************************************************************************/
-#ifndef KERNELS_MOE_DISPATCH_BACKWARD_GATE_MOE_DISPATCH_BACKWARD_GATE_H
-#define KERNELS_MOE_DISPATCH_BACKWARD_GATE_MOE_DISPATCH_BACKWARD_GATE_H
-
-#include "mlu_op.h"
-
-mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchBwdGate1(
-    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-    mluOpDataType_t d_type, const void *indices, const void *locations,
-    const void *input, const void *dispatch, const int samples,
-    const int capacity, const int hidden, const int num_experts,
-    void *workspace, void *grad_gates);
-
-mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchBwdGate2(
-    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-    mluOpDataType_t d_type, const void *indices, const void *locations,
-    const void *input, const void *dispatch, const int samples,
-    const int capacity, const int hidden, const int num_experts,
-    void *grad_gates);
-
-#endif  // KERNELS_MOE_DISPATCH_BACKWARD_GATE_MOE_DISPATCH_BACKWARD_GATE_H diff --git a/kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu b/kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu deleted file mode 100644 index 383c97d0a..000000000 --- a/kernels/moe_dispatch_backward_gate/moe_dispatch_backward_gate_union1.mlu +++ /dev/null @@ -1,387 +0,0 @@
-/*************************************************************************
- * Copyright (C) [2022] by Cambricon, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
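The lcs() helper deleted earlier, and the gate variant that follows, realize a three-deep software pipeline: in steady state, iteration n stores tile n, loads tile n+2, and computes tile n+1, with pi % 2 selecting the ping or pong half of NRAM so the asynchronous copies overlap the compute (the gate variant drops the store stage and accumulates a scalar instead). The host-side C++ sketch below only prints the issue order between __sync() points, which makes the prologue and epilogue branches easy to verify; it is an illustration, not device code.

#include <cstdio>

// Prints the L (load) / C (compute) / S (store) issue order of the deleted
// lcs() helpers; each printed line corresponds to one __sync() interval.
static void print_lcs_schedule(int repeat_num, int rem_num) {
  if (repeat_num > 0) std::printf("L[0]\n");                   // prologue
  if (repeat_num > 1) std::printf("L[1] C[0]\n");
  for (int n = 0; n < repeat_num - 2; ++n)                     // steady state
    std::printf("S[%d] L[%d] C[%d]\n", n, n + 2, n + 1);
  if (repeat_num >= 2) std::printf("S[%d] ", repeat_num - 2);  // epilogue
  if (rem_num > 0) std::printf("L[%d] ", repeat_num);          // remainder tile
  if (repeat_num > 0) std::printf("C[%d]", repeat_num - 1);
  std::printf("\n");
  if (repeat_num > 0) std::printf("S[%d]\n", repeat_num - 1);
  if (rem_num > 0) std::printf("C[%d] S[%d]\n", repeat_num, repeat_num);
}

int main() {
  print_lcs_schedule(4, 1);  // four full tiles plus one remainder tile
  return 0;
}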
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *************************************************************************/
-#include "moe_dispatch_backward_gate.h"
-
-#include "core/logging.h"
-#include "kernels/debug.h"
-#include "kernels/kernel.h"
-#include "kernels/utils/common.h"
-
-__nram__ char nram_buffer[MAX_NRAM_SIZE];
-
-#if __BANG_ARCH__ >= 372
-template <typename T>
-static __mlu_func__ void load(const T *input_addr, const T *dispatch_addr,
-                              T *nram_input, T *nram_dispatch,
-                              const int deal_num, const int pingpong_num,
-                              const int pi) {
-  int offset = (pi % 2) * pingpong_num;
-  T *nram_input_p = nram_input + offset;
-  T *nram_dispatch_p = nram_dispatch + offset;
-  __memcpy_async(nram_input_p, input_addr, deal_num * sizeof(T), GDRAM2NRAM);
-  __memcpy_async(nram_dispatch_p, dispatch_addr, deal_num * sizeof(T),
-                 GDRAM2NRAM);
-}
-
-template <typename T>
-static __mlu_func__ void compute(T *nram_input, T *nram_dispatch, T *grad_gates,
-                                 const int deal_num, const int pingpong_num,
-                                 const int pi) {
-  int offset = (pi % 2) * pingpong_num;
-  T *nram_input_p = nram_input + offset;
-  T *nram_dispatch_p = nram_dispatch + offset;
-  __bang_mul(nram_input_p, nram_input_p, nram_dispatch_p, deal_num);
-  if (deal_num > 1) {
-    __bang_sumpool(nram_input_p, nram_input_p, 1, 1, deal_num, 1, deal_num, 1,
-                   1);
-  }
-  *grad_gates += nram_input_p[0];
-}
-
-template <typename T>
-static __mlu_func__ void lcs(T *base_input_addr, T *base_dispatch_addr,
-                             T *nram_input, T *nram_dispatch, T *grad_gates,
-                             const int repeat_num, const int rem_num,
-                             const int deal_num, const int pingpong_num) {
-  if (repeat_num > 0) {
-    // L
-    T *input_addr = base_input_addr;
-    T *dispatch_addr = base_dispatch_addr;
-    load(input_addr, dispatch_addr, nram_input, nram_dispatch, deal_num,
-         pingpong_num, 0);
-    __sync();
-  }
-
-  if (repeat_num > 1) {
-    // L
-    T *input_addr = base_input_addr + deal_num;
-    T *dispatch_addr = base_dispatch_addr + deal_num;
-    load(input_addr, dispatch_addr, nram_input, nram_dispatch, deal_num,
-         pingpong_num, 1);
-
-    // C
-    compute(nram_input, nram_dispatch, grad_gates, deal_num, pingpong_num, 0);
-    __sync();
-  }
-
-  for (int n_iter = 0; n_iter < repeat_num - 2; n_iter++) {
-    // L
-    T *input_addr = base_input_addr + (n_iter + 2) * deal_num;
-    T *dispatch_addr = base_dispatch_addr + (n_iter + 2) * deal_num;
-    load(input_addr, dispatch_addr, nram_input, nram_dispatch, deal_num,
-         pingpong_num, n_iter + 2);
-
-    // C
-    compute(nram_input, nram_dispatch, grad_gates, deal_num, pingpong_num,
-            n_iter + 1);
-    __sync();
-  }
-
-  if (rem_num > 0) {
-    // L
-    T *input_addr = base_input_addr + repeat_num * deal_num;
-    T *dispatch_addr = base_dispatch_addr + repeat_num * deal_num;
-    load(input_addr, dispatch_addr, nram_input, nram_dispatch, rem_num,
-         pingpong_num, repeat_num);
-  }
-  if (repeat_num > 0) {
-    // C
-    compute(nram_input, nram_dispatch, grad_gates, deal_num, pingpong_num,
-            repeat_num - 1);
-  }
-  __sync();
-
-  if (rem_num > 0) {
-    // C
-    compute(nram_input, nram_dispatch, grad_gates, rem_num, pingpong_num,
-            repeat_num);
-    __sync();
-  }
-}
-#endif
-
-template <typename T>
-__mlu_global__ void MLUKernelMoeDispatchBwdGate1(
-    const int *indices, const int *locations, const T *input, const T *dispatch,
-    const int samples, const int capacity, const int hidden,
-    const int num_experts, T *workspace, T *grad_gates) {
-#if __BANG_ARCH__ >= 372
-  if (__is_mpu()) {
-    return;
-  }
-
-  int one_sample_task_num = taskDim / samples;
-  int rem_task = taskDim % samples;
-  int sample_idx = 0;
-  if ((rem_task > 0) && (taskId < (one_sample_task_num + 1) * rem_task)) {
-    sample_idx = (int)(taskId / (one_sample_task_num + 1));
-    one_sample_task_num = one_sample_task_num + 1;
-  } else {
-    sample_idx = (int)((taskId - rem_task) / one_sample_task_num);
-  }
-
-  int indice = indices[sample_idx];
-  int location = locations[sample_idx];
-  T grad_gates_temp = (T)0.0;
-
-  if (location >= 0 && location < capacity && indice >= 0 &&
-      indice < num_experts) {
-    int logic_tid = taskId % one_sample_task_num;
-    int hidden_per_task = hidden / one_sample_task_num;
-    int rem_hidden_num = hidden % one_sample_task_num;
-    int hidden_seg_num = hidden_per_task + (int)(logic_tid < rem_hidden_num);
-    int hidden_data_offset =
-        logic_tid * hidden_per_task +
-        ((logic_tid < rem_hidden_num) ? logic_tid : rem_hidden_num);
-
-    if (hidden_seg_num > 0) {
-      // nram space
-      // ping/pong: |nram_input|nram_dispatch|
-      int max_nram_num = MAX_NRAM_SIZE / sizeof(T);
-      int deal_h = max_nram_num / 4;
-      int pingpong_num = 2 * deal_h;
-
-      T *nram_input = (T *)nram_buffer;
-      T *nram_dispatch = nram_input + deal_h;
-
-      int input_addr_offset = sample_idx * hidden + hidden_data_offset;
-      T *base_input_addr = (T *)input + input_addr_offset;
-      int idx = (indice * capacity + location) * hidden;
-      T *base_dispatch_addr = (T *)dispatch + idx + hidden_data_offset;
-
-      int repeat_h = hidden_seg_num / deal_h;
-      int rem_h = hidden_seg_num % deal_h;
-      lcs(base_input_addr, base_dispatch_addr, nram_input, nram_dispatch,
-          &grad_gates_temp, repeat_h, rem_h, deal_h, pingpong_num);
-    }
-  }
-
-  if (samples == taskDim) {
-    grad_gates[sample_idx] = grad_gates_temp;
-    return;
-  } else {
-    workspace[taskId] = grad_gates_temp;
-  }
-  __sync_all_ipu();
-
-  if ((samples < taskDim) && (taskId == 0)) {
-    T *nram_grad_gates = (T *)nram_buffer;
-    __bang_write_zero(nram_grad_gates, samples);
-
-    if (samples > 1) {
-      int one_sample_task_num = taskDim / samples;
-      int rem_task = taskDim % samples;
-      int sample_idx = 0;
-      for (int ti = 0; ti < taskDim; ti++) {
-        if ((rem_task > 0) && (ti < (one_sample_task_num + 1) * rem_task)) {
-          sample_idx = (int)(ti / (one_sample_task_num + 1));
-        } else {
-          sample_idx = (int)((ti - rem_task) / one_sample_task_num);
-        }
-        nram_grad_gates[sample_idx] += workspace[ti];
-      }
-    } else {
-      __memcpy(nram_grad_gates, workspace, taskDim * sizeof(T), GDRAM2NRAM);
-      __bang_sumpool(nram_grad_gates, nram_grad_gates, 1, 1, taskDim, 1,
-                     taskDim, 1, 1);
-    }
-    // store
-    __memcpy(grad_gates, nram_grad_gates, samples * sizeof(T), NRAM2GDRAM);
-  }
-#endif
-}
-
-template <typename T>
-__mlu_global__ void MLUKernelMoeDispatchBwdGate2(
-    const int *indices, const int *locations, const T *input, const T *dispatch,
-    const int samples, const int capacity, const int hidden,
-    const int num_experts, T *grad_gates) {
-#if __BANG_ARCH__ >= 372
-  if (__is_mpu()) {
-    return;
-  }
-  int per_task_sample_num = samples / taskDim;
-  int rem_sample_num = samples % taskDim;
-  int samples_num = per_task_sample_num + (int)((taskId < rem_sample_num));
-  int sample_idx = taskId * per_task_sample_num +
-                   ((taskId < rem_sample_num) ?
taskId : rem_sample_num); - // nram space - // |nram_indices|nram_location|nram_idx|nram_mask| - // ping/pong:|nram_input|nram_dispatch| - int max_deal_h = (MAX_NRAM_SIZE - 4 * sizeof(int)) / (4 * sizeof(T)); - int pingpong_num = 0; - int deal_h = 0; - int deal_s = 0; - if (hidden > max_deal_h) { - deal_s = 1; - deal_h = max_deal_h; - } else { - deal_h = hidden; - deal_s = (MAX_NRAM_SIZE - 4 * deal_h * sizeof(T)) / (4 * sizeof(int)); - } - - int *nram_indices = (int *)nram_buffer; - int *nram_location = nram_indices + deal_s; - int *nram_idx = nram_location + deal_s; - int *nram_mask = nram_idx + deal_s; - // ping/pong - pingpong_num = 2 * deal_h; - T *nram_input = (T *)(nram_mask + deal_s); - T *nram_dispatch = nram_input + deal_h; - - int repeat_s = samples_num / deal_s; - int rem_s = samples_num % deal_s; - int repeat_h = hidden / deal_h; - int rem_h = hidden % deal_h; - - int *base_indices = (int *)indices + sample_idx; - int *base_locations = (int *)locations + sample_idx; - int input_addr_offset = sample_idx * hidden; - T *base_input = (T *)input + input_addr_offset; - T *base_grad_gates = (T *)grad_gates + sample_idx; - - for (int s_iter = 0; s_iter < repeat_s + 1; s_iter++) { - int deal_s_num = (s_iter < repeat_s) ? deal_s : rem_s; - if (deal_s_num == 0) { - break; - } - - T *base_input_addr = base_input + s_iter * deal_s * hidden; - int *indices_addr = base_indices + s_iter * deal_s; - int *locations_addr = base_locations + s_iter * deal_s; - __memcpy(nram_indices, indices_addr, deal_s_num * sizeof(int), GDRAM2NRAM); - __memcpy(nram_location, locations_addr, deal_s_num * sizeof(int), - GDRAM2NRAM); - - // idx = (nram_indices * capacity + nram_location) * hidden - __bang_mul_scalar(nram_idx, nram_indices, capacity, deal_s_num); - __bang_add(nram_idx, nram_idx, nram_location, deal_s_num); - __bang_mul_scalar(nram_idx, nram_idx, hidden, deal_s_num); - - // 0 <= nram_location < capacity - __bang_ge_scalar(nram_mask, nram_location, (int)0, deal_s_num); - __bang_lt_scalar(nram_location, nram_location, capacity, deal_s_num); - __bang_and(nram_mask, nram_mask, nram_location, deal_s_num); - - // 0 <= nram_indices < num_experts - __bang_ge_scalar(nram_location, nram_indices, (int)0, deal_s_num); - __bang_lt_scalar(nram_indices, nram_indices, num_experts, deal_s_num); - __bang_and(nram_mask, nram_mask, nram_location, deal_s_num); - __bang_and(nram_mask, nram_mask, nram_indices, deal_s_num); - - T *nram_grad_gates = (T *)nram_indices; - __bang_write_zero(nram_grad_gates, deal_s_num); - - if (deal_s_num > 1) { - T *base_dispatch_addr = (T *)dispatch; - - // L(si=0) - if (nram_mask[0] == 1) { - T *input_addr = base_input_addr; - T *dispatch_addr = base_dispatch_addr + nram_idx[0]; - load(input_addr, dispatch_addr, nram_input, nram_dispatch, deal_h, - pingpong_num, 0); - __sync(); - } - - // L(si=1) - if (nram_mask[1] == 1) { - T *input_addr = base_input_addr + hidden; - T *dispatch_addr = base_dispatch_addr + nram_idx[1]; - load(input_addr, dispatch_addr, nram_input, nram_dispatch, deal_h, - pingpong_num, 1); - } - - // C(si=0) - if (nram_mask[0] == 1) { - compute(nram_input, nram_dispatch, nram_grad_gates, deal_h, - pingpong_num, 0); - } - __sync(); - - for (int si = 0; si < deal_s_num - 2; si++) { - // L(si+2) - if (nram_mask[si + 2] == 1) { - T *input_addr = base_input_addr + (si + 2) * hidden; - T *dispatch_addr = base_dispatch_addr + nram_idx[si + 2]; - load(input_addr, dispatch_addr, nram_input, nram_dispatch, deal_h, - pingpong_num, si + 2); - } - - // C(si+1) - if (nram_mask[si + 1] == 
1) {
-          compute(nram_input, nram_dispatch, nram_grad_gates + (si + 1), deal_h,
-                  pingpong_num, si + 1);
-        }
-        __sync();
-      }
-
-      // C(si=deal_s_num - 1)
-      if (nram_mask[deal_s_num - 1] == 1) {
-        compute(nram_input, nram_dispatch, nram_grad_gates + (deal_s_num - 1),
-                deal_h, pingpong_num, deal_s_num - 1);
-        __sync();
-      }
-    } else {
-      // si = sample_idx + s_iter
-      if (nram_mask[0] == 1) {
-        T *base_dispatch_addr = (T *)dispatch + nram_idx[0];
-        lcs(base_input_addr, base_dispatch_addr, nram_input, nram_dispatch,
-            nram_grad_gates, repeat_h, rem_h, deal_h, pingpong_num);
-      }
-    }
-    // store:
-    __memcpy(base_grad_gates + s_iter * deal_s, nram_grad_gates,
-             deal_s_num * sizeof(T), NRAM2GDRAM);
-  }
-#endif
-}
-
-mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchBwdGate1(
-    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-    mluOpDataType_t d_type, const void *indices, const void *locations,
-    const void *input, const void *dispatch, const int samples,
-    const int capacity, const int hidden, const int num_experts,
-    void *workspace, void *grad_gates) {
-  /* Only float data type is supported in host-side CPP file
-     fool-proof processing.*/
-  KERNEL_CHECK(MLUKernelMoeDispatchBwdGate1<float><<<k_dim, k_type, queue>>>(
-      (int *)indices, (int *)locations, (float *)input, (float *)dispatch,
-      samples, capacity, hidden, num_experts, (float *)workspace,
-      (float *)grad_gates));
-  return MLUOP_STATUS_SUCCESS;
-}
-
-mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchBwdGate2(
-    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-    mluOpDataType_t d_type, const void *indices, const void *locations,
-    const void *input, const void *dispatch, const int samples,
-    const int capacity, const int hidden, const int num_experts,
-    void *grad_gates) {
-  /* Only float data type is supported in host-side CPP file
-     fool-proof processing.*/
-  KERNEL_CHECK(MLUKernelMoeDispatchBwdGate2<float><<<k_dim, k_type, queue>>>(
-      (int *)indices, (int *)locations, (float *)input, (float *)dispatch,
-      samples, capacity, hidden, num_experts, (float *)grad_gates));
-  return MLUOP_STATUS_SUCCESS;
-} diff --git a/kernels/moe_dispatch_forward/moe_dispatch_forward.cpp b/kernels/moe_dispatch_forward/moe_dispatch_forward.cpp deleted file mode 100644 index 1ea54e088..000000000 --- a/kernels/moe_dispatch_forward/moe_dispatch_forward.cpp +++ /dev/null @@ -1,200 +0,0 @@
-/*************************************************************************
- * Copyright (C) [2022] by Cambricon, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
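Both gate kernels above, like the data kernels earlier, reduce row addressing to two steps: a flattened dispatch offset and a validity mask. Below is a scalar host-side reference for what the __bang_mul_scalar/__bang_add/__bang_ge_scalar/__bang_lt_scalar/__bang_and sequence computes per sample; dispatch_row is an illustrative name, not an identifier from this patch.

#include <cstdio>

// Scalar reference: returns whether sample (indice, location) is valid and,
// if so, writes the flattened dispatch offset
//   idx = (indice * capacity + location) * hidden.
static bool dispatch_row(int indice, int location, int capacity,
                         int num_experts, int hidden, long *idx_out) {
  bool valid = location >= 0 && location < capacity && indice >= 0 &&
               indice < num_experts;
  if (valid) {
    *idx_out = (static_cast<long>(indice) * capacity + location) *
               static_cast<long>(hidden);
  }
  return valid;
}

int main() {
  long idx = 0;
  // Expert 2, capacity slot 5, capacity 8, 4 experts, hidden 16:
  if (dispatch_row(2, 5, 8, 4, 16, &idx)) {
    std::printf("offset = %ld\n", idx);  // (2 * 8 + 5) * 16 = 336
  }
  return 0;
}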
- *************************************************************************/
-#include "moe_dispatch_forward.h"
-
-#include <string>
-
-#include "core/context.h"
-#include "core/gen_case.h"
-#include "core/logging.h"
-#include "core/runtime/device.h"
-#include "core/tensor.h"
-#include "core/type.h"
-
-static void policyFunc(const mluOpHandle_t handle, cnrtDim3_t *k_dim,
-                       cnrtFunctionType_t *k_type) {
-  // block policy func
-  *k_type = CNRT_FUNC_TYPE_BLOCK;
-  // dimx equals to num of mlu cores in each cluster
-  k_dim->x = mluop::runtime::getCoreNumOfEachUnionCapability(handle);
-  // dimy equals to num of current available clusters
-  k_dim->y = mluop::runtime::getClusterLimitCapability(handle);
-  k_dim->z = 1;
-}
-
-static mluOpStatus_t MoeDispatchForwardParamCheck(
-    const std::string &op_name, const mluOpHandle_t handle,
-    const mluOpTensorDescriptor_t gates_desc, const void *gates,
-    const mluOpTensorDescriptor_t indices_desc, const void *indices,
-    const mluOpTensorDescriptor_t locations_desc, const void *locations,
-    const mluOpTensorDescriptor_t input_desc, const void *input,
-    const int samples, const int capacity, const int hidden,
-    const int num_experts, const mluOpTensorDescriptor_t dispatch_desc,
-    void *dispatch, bool *zero_element) {
-  // check descriptor and data
-  PARAM_CHECK(op_name, handle != NULL);
-  // platform check
-  if (handle->arch < MLUOP_MLU370) {
-    LOG(ERROR) << op_name << "Only mlu300 and above devices are supported."
-               << "Please check the device version!";
-    return MLUOP_STATUS_ARCH_MISMATCH;
-  }
-
-  PARAM_CHECK(op_name, gates_desc != NULL);
-  PARAM_CHECK(op_name, indices_desc != NULL);
-  PARAM_CHECK(op_name, locations_desc != NULL);
-  PARAM_CHECK(op_name, input_desc != NULL);
-  PARAM_CHECK(op_name, dispatch_desc != NULL);
-
-  // check shape
-  PARAM_CHECK(op_name, gates_desc->dim == 1);
-  PARAM_CHECK(op_name, indices_desc->dim == 1);
-  PARAM_CHECK(op_name, locations_desc->dim == 1);
-  PARAM_CHECK(op_name, input_desc->dim == 2);
-  PARAM_CHECK(op_name, dispatch_desc->dim == 2);
-
-  // check data type
-  PARAM_CHECK_V2(op_name, (indices_desc->dtype == MLUOP_DTYPE_INT32),
-                 "Only int32 are supported in indices tensor, but the data "
-                 "type of tensor is "
-                     << mluOpGetNameOfDataType(indices_desc->dtype) << ".");
-  PARAM_CHECK_V2(op_name, (locations_desc->dtype == MLUOP_DTYPE_INT32),
-                 "Only int32 are supported in locations tensor, but the data "
-                 "type of tensor is "
-                     << mluOpGetNameOfDataType(locations_desc->dtype) << ".");
-
-  // check tensor datatype, support float32
-  PARAM_CHECK_V2(op_name, input_desc->dtype == MLUOP_DTYPE_FLOAT,
-                 "Only float are supported in input tensor, but the "
-                 "data type of tensor is "
-                     << mluOpGetNameOfDataType(input_desc->dtype) << ".");
-  PARAM_CHECK(op_name, input_desc->dtype == dispatch_desc->dtype);
-  PARAM_CHECK(op_name, input_desc->dtype == gates_desc->dtype);
-
-  // check dim
-  PARAM_CHECK(op_name, samples >= 0);
-  PARAM_CHECK(op_name, capacity >= 0);
-  PARAM_CHECK(op_name, hidden >= 0);
-  PARAM_CHECK(op_name, num_experts >= 0);
-  PARAM_CHECK(op_name, (samples == gates_desc->dims[0]));
-  PARAM_CHECK(op_name, (samples == indices_desc->dims[0]));
-  PARAM_CHECK(op_name, (samples == locations_desc->dims[0]));
-  PARAM_CHECK(op_name, (samples == input_desc->dims[0]));
-  PARAM_CHECK(op_name, ((num_experts * capacity) == dispatch_desc->dims[0]));
-  PARAM_CHECK(op_name, (hidden == input_desc->dims[1]));
-  PARAM_CHECK(op_name, (hidden == dispatch_desc->dims[1]));
-
-  // check correlation of parameters
-  PARAM_CHECK_V2(op_name, samples <=
(num_experts * capacity), - "The samples must be less than or equal to the " - "multiplication result of the capacity and num_experts"); - - const size_t indices_element_num = mluOpGetTensorElementNum(indices_desc); - const size_t locations_element_num = mluOpGetTensorElementNum(locations_desc); - const size_t input_element_num = mluOpGetTensorElementNum(input_desc); - const size_t dispatch_element_num = mluOpGetTensorElementNum(dispatch_desc); - const size_t gates_element_num = mluOpGetTensorElementNum(gates_desc); - - // check large tensor - TENSOR_NUM_CHECK(op_name, indices_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(op_name, locations_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(op_name, input_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(op_name, dispatch_element_num, LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK(op_name, gates_element_num, LARGE_TENSOR_NUM, ""); - - // check element num zero - if (indices_element_num == 0 || locations_element_num == 0 || - input_element_num == 0 || dispatch_element_num == 0 || - gates_element_num == 0) { - *zero_element = true; - return MLUOP_STATUS_SUCCESS; - } - - // input and output ptr check null - PARAM_CHECK(op_name, indices != NULL); - PARAM_CHECK(op_name, locations != NULL); - PARAM_CHECK(op_name, input != NULL); - PARAM_CHECK(op_name, dispatch != NULL); - PARAM_CHECK(op_name, gates != NULL); - - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpMoeDispatchForward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t gates_desc, - const void *gates, const mluOpTensorDescriptor_t indices_desc, - const void *indices, const mluOpTensorDescriptor_t locations_desc, - const void *locations, const mluOpTensorDescriptor_t input_desc, - const void *input, const int samples, const int capacity, const int hidden, - const int num_experts, const mluOpTensorDescriptor_t dispatch_desc, - void *dispatch) { - // check params - bool zero_element = false; - mluOpStatus_t param_check = MoeDispatchForwardParamCheck( - "[mluOpMoeDispatchForward]", handle, gates_desc, gates, indices_desc, - indices, locations_desc, locations, input_desc, input, samples, capacity, - hidden, num_experts, dispatch_desc, dispatch, &zero_element); - if (param_check != MLUOP_STATUS_SUCCESS) { - return param_check; - } - - // check zero element - if (zero_element == true) { - VLOG(5) << "[mluOpMoeDispatchForward] Skip zero element tensor."; - return MLUOP_STATUS_SUCCESS; - } - - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("moe_dispatch_forward"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA(true, "gates", gates, gates_desc, 0, 1); - GEN_CASE_DATA_REAL(true, "indices", indices, indices_desc); - GEN_CASE_DATA_REAL(true, "locations", locations, locations_desc); - GEN_CASE_DATA(true, "input", input, input_desc, -100, 100); - GEN_CASE_DATA(true, "dispatch", dispatch, dispatch_desc, -100, 100); - GEN_CASE_OP_PARAM_SINGLE(0, "moe_dispatch_forward", "samples", samples); - GEN_CASE_OP_PARAM_SINGLE(1, "moe_dispatch_forward", "capacity", capacity); - GEN_CASE_OP_PARAM_SINGLE(2, "moe_dispatch_forward", "hidden", hidden); - GEN_CASE_OP_PARAM_SINGLE(3, "moe_dispatch_forward", "num_experts", - num_experts); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); - } - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - policyFunc(handle, &k_dim, &k_type); - VLOG(5) << "Launch kernel mluOpMoeDispatchForward<<>>"; - - mluOpDataType_t data_type = input_desc->dtype; - VLOG(5) << "[mluOpMoeDispatchForward] launch " - "KernelMoeDispatchForward"; - 
CHECK_RETURN(
-      "[mluOpMoeDispatchForward]",
-      KernelMoeDispatchForward(k_dim, k_type, handle->queue, data_type, gates,
-                               indices, locations, input, samples, capacity,
-                               hidden, num_experts, dispatch));
-
-  GEN_CASE_END();
-  return MLUOP_STATUS_SUCCESS;
-} diff --git a/kernels/moe_dispatch_forward/moe_dispatch_forward.h b/kernels/moe_dispatch_forward/moe_dispatch_forward.h deleted file mode 100644 index b8bba1971..000000000 --- a/kernels/moe_dispatch_forward/moe_dispatch_forward.h +++ /dev/null @@ -1,35 +0,0 @@
-/*************************************************************************
- * Copyright (C) [2022] by Cambricon, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *************************************************************************/
-#ifndef KERNELS_MOE_DISPATCH_FORWARD_MOE_DISPATCH_FORWARD_H
-#define KERNELS_MOE_DISPATCH_FORWARD_MOE_DISPATCH_FORWARD_H
-
-#include "mlu_op.h"
-
-mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchForward(
-    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-    mluOpDataType_t d_type, const void *gates, const void *indices,
-    const void *locations, const void *input, const int samples,
-    const int capacity, const int hidden, const int num_experts,
-    void *dispatch);
-
-#endif  // KERNELS_MOE_DISPATCH_FORWARD_MOE_DISPATCH_FORWARD_H diff --git a/kernels/moe_dispatch_forward/moe_dispatch_forward_block.mlu b/kernels/moe_dispatch_forward/moe_dispatch_forward_block.mlu deleted file mode 100644 index 8e3cb502b..000000000 --- a/kernels/moe_dispatch_forward/moe_dispatch_forward_block.mlu +++ /dev/null @@ -1,155 +0,0 @@ -
-/*************************************************************************
- * Copyright (C) [2022] by Cambricon, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
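For orientation, the block kernel that follows is semantically a gather-scale-scatter; it is equivalent to the CPU reference below under the shapes documented in this file (gates: (samples), dispatch: (num_experts * capacity, hidden)), with out-of-range rows skipped exactly as the kernel's mask path does. This is a sketch for reading the kernel, not the shipped implementation.

#include <vector>

// CPU reference for moe_dispatch_forward semantics:
//   dispatch[indices[s] * capacity + locations[s], :] = gates[s] * input[s, :]
static void moe_dispatch_forward_ref(
    const std::vector<float> &gates, const std::vector<int> &indices,
    const std::vector<int> &locations, const std::vector<float> &input,
    int samples, int capacity, int hidden, int num_experts,
    std::vector<float> *dispatch) {
  for (int s = 0; s < samples; ++s) {
    int ind = indices[s];
    int loc = locations[s];
    if (ind < 0 || ind >= num_experts || loc < 0 || loc >= capacity) {
      continue;  // invalid sample: contributes nothing
    }
    long row = static_cast<long>(ind) * capacity + loc;
    for (int h = 0; h < hidden; ++h) {
      (*dispatch)[row * hidden + h] =
          gates[s] * input[static_cast<long>(s) * hidden + h];
    }
  }
}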
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *************************************************************************/
-#include "moe_dispatch_forward.h"
-
-#include "core/logging.h"
-#include "kernels/debug.h"
-#include "kernels/kernel.h"
-#include "kernels/utils/common.h"
-
-__nram__ char nram_buffer[MAX_NRAM_SIZE];
-
-template <typename T>
-__mlu_global__ void MLUKernelMoeDispatchFwd(
-    const T *gates, const int *indices, const int *locations, const T *input,
-    const int samples, const int capacity, const int hidden,
-    const int num_experts, T *dispatch) {
-#if __BANG_ARCH__ >= 372
-  if (__is_mpu()) {
-    return;
-  }
-
-  int max_deal_h = (MAX_NRAM_SIZE - 4 * sizeof(int) - sizeof(T)) / (sizeof(T));
-  int hidden_per_task = hidden / taskDim;
-  int hidden_rem = hidden % taskDim;
-  hidden_per_task += (taskId < hidden_rem) ? 1 : 0;
-  int deal_h = 0;
-  int deal_s = 0;
-  if (hidden_per_task > max_deal_h) {
-    deal_h = max_deal_h;
-    deal_s = 1;
-  } else {
-    deal_h = hidden_per_task;
-    deal_s =
-        (MAX_NRAM_SIZE - deal_h * sizeof(T)) / (4 * sizeof(int) + sizeof(T));
-    deal_s = deal_s < samples ? deal_s : samples;
-  }
-
-  // | nram space partition     | data num |
-  // | ------------------------ | -------- |
-  // | nram_input               | deal_h   |
-  // | nram_gates               | deal_s   |
-  // | nram_indices             | deal_s   |
-  // | nram_location            | deal_s   |
-  // | nram_idx                 | deal_s   |
-  // | nram_mask                | deal_s   |
-
-  T *nram_input = (T *)nram_buffer;
-  T *nram_gates = nram_input + deal_h;
-  int *nram_indices = (int *)nram_gates + deal_s;
-  int *nram_locations = nram_indices + deal_s;
-  int *nram_idx = nram_locations + deal_s;
-  int *nram_mask = nram_idx + deal_s;
-
-  int repeat_s = samples / deal_s;
-  int rem_s = samples % deal_s;
-  int repeat_h = hidden_per_task / deal_h;
-  int rem_h = hidden_per_task % deal_h;
-
-  for (int s_iter = 0; s_iter <= repeat_s; ++s_iter) {
-    int deal_s_num = (s_iter == repeat_s) ? rem_s : deal_s;
-    if (deal_s_num == 0) {
-      break;
-    }
-
-    // load gates indices locations
-    T *base_gates = (T *)gates + s_iter * deal_s_num;
-    int *base_indices = (int *)indices + s_iter * deal_s_num;
-    int *base_locations = (int *)locations + s_iter * deal_s_num;
-
-    __memcpy(nram_gates, base_gates, deal_s_num * sizeof(T), GDRAM2NRAM);
-    __memcpy(nram_indices, base_indices, deal_s_num * sizeof(int), GDRAM2NRAM);
-    __memcpy(nram_locations, base_locations, deal_s_num * sizeof(int),
-             GDRAM2NRAM);
-
-    // compute dispatch idx = (nram_indices * capacity + nram_locations)
-    __bang_mul_scalar(nram_idx, nram_indices, capacity, deal_s_num);
-    __bang_add(nram_idx, nram_idx, nram_locations, deal_s_num);
-
-    // 0 <= nram_locations < capacity
-    __bang_ge_scalar(nram_mask, nram_locations, (int)0, deal_s_num);
-    __bang_lt_scalar(nram_locations, nram_locations, capacity, deal_s_num);
-    __bang_and(nram_locations, nram_locations, nram_mask, deal_s_num);
-
-    // 0 <= nram_indices < num_experts
-    __bang_ge_scalar(nram_mask, nram_indices, (int)0, deal_s_num);
-    __bang_lt_scalar(nram_indices, nram_indices, num_experts, deal_s_num);
-    __bang_and(nram_indices, nram_indices, nram_mask, deal_s_num);
-    __bang_and(nram_mask, nram_indices, nram_locations, deal_s_num);
-
-    T *base_input = (T *)input + s_iter * deal_s_num * hidden;
-    for (int ds_iter = 0; ds_iter < deal_s_num; ++ds_iter) {
-      if (nram_mask[ds_iter] == 1) {
-        T *base_input_s = base_input + ds_iter * hidden;
-        T *base_dispatch_s = dispatch + nram_idx[ds_iter] * hidden;
-
-        for (int h_iter = 0; h_iter <= repeat_h; ++h_iter) {
-          int deal_h_num = (h_iter == repeat_h) ? rem_h : deal_h;
-          if (deal_h_num == 0) {
-            break;
-          }
-          int input_rem_num = (taskId < hidden_rem ? taskId : hidden_rem);
-          int input_offset = (hidden / taskDim) * taskId + input_rem_num;
-          T *base_input_h = base_input_s + input_offset + h_iter * deal_h;
-          T *base_dispatch_h = base_dispatch_s + input_offset + h_iter * deal_h;
-          __memcpy(nram_input, base_input_h, deal_h_num * sizeof(T),
-                   GDRAM2NRAM);
-
-          // dispatch = input * gates
-          __bang_mul_scalar(nram_input, nram_input, nram_gates[ds_iter],
-                            deal_h_num);
-
-          // store dispatch to GDRAM
-          __memcpy(base_dispatch_h, nram_input, deal_h_num * sizeof(T),
-                   NRAM2GDRAM);
-        }  // repeat h
-      }
-    }  // deal s
-  }    // repeat s
#endif
-}
-
-mluOpStatus_t MLUOP_WIN_API KernelMoeDispatchForward(
-    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-    mluOpDataType_t d_type, const void *gates, const void *indices,
-    const void *locations, const void *input, const int samples,
-    const int capacity, const int hidden, const int num_experts,
-    void *dispatch) {
-  /* Only float data type is supported in host-side CPP file
-     fool-proof processing.*/
-  KERNEL_CHECK(MLUKernelMoeDispatchFwd<float><<<k_dim, k_type, queue>>>(
-      (float *)gates, (int *)indices, (int *)locations, (float *)input, samples,
-      capacity, hidden, num_experts, (float *)dispatch));
-  return MLUOP_STATUS_SUCCESS;
-} diff --git a/kernels/ms_deform_attn_backward/ms_deform_attn_backward.cpp b/kernels/ms_deform_attn_backward/ms_deform_attn_backward.cpp deleted file mode 100644 index 0bb07a68a..000000000 --- a/kernels/ms_deform_attn_backward/ms_deform_attn_backward.cpp +++ /dev/null @@ -1,446 +0,0 @@
-/*************************************************************************
- * Copyright (C) [2022] by Cambricon, Inc.
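The forward block kernel above splits the hidden dimension across tasks rather than splitting samples: the first hidden % taskDim tasks take one extra element, and each task's start offset compensates for the longer segments before it. A host-side model of that segmentation follows, with illustrative names:

#include <cstdio>

// Host model of the hidden split in MLUKernelMoeDispatchFwd: task t covers
// [offset, offset + len) along the hidden dimension.
struct HiddenSeg {
  int offset;
  int len;
};

static HiddenSeg hidden_segment(int taskId, int taskDim, int hidden) {
  int per = hidden / taskDim;
  int rem = hidden % taskDim;
  int len = per + (taskId < rem ? 1 : 0);
  int offset = per * taskId + (taskId < rem ? taskId : rem);
  return {offset, len};
}

int main() {
  // Example: hidden = 10 over 4 tasks -> [0,3) [3,6) [6,8) [8,10).
  for (int t = 0; t < 4; ++t) {
    HiddenSeg seg = hidden_segment(t, 4, 10);
    std::printf("task %d: [%d, %d)\n", t, seg.offset, seg.offset + seg.len);
  }
  return 0;
}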
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "ms_deform_attn_backward.h" - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/tool.h" -#include "core/type.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" - -char API[] = "[mluOpMsDeformAttnBackward]"; - -#define MIN(a, b) (((a) < (b)) ? (a) : (b)) - -/*! - * @brief Describes the kernel policy of ms_deform_attn_backward. - */ -typedef enum { - MLUOP_MS_DEFORM_ATTN_BACKWARD_DEFAULT = 0, - /*!< Returns the default policy. */ - MLUOP_MS_DEFORM_ATTN_BACKWARD_SMALL_CHANNEL = 1, - /*!< Returns the small channel policy. */ - MLUOP_MS_DEFORM_ATTN_BACKWARD_FAST = 2, - /*!< Returns the fast policy. */ -} mluOpDeformAttnBackwardKernelPolicy_t; - -static void policyFunc(mluOpHandle_t handle, const int32_t batch, - const int32_t num_query, const int32_t num_heads, - const int32_t num_levels, cnrtFunctionType_t *k_type, - cnrtDim3_t *k_dim, - mluOpDeformAttnBackwardKernelPolicy_t kernelPolicy) { - size_t cluster_limit = mluop::runtime::getClusterLimitCapability(handle); - size_t core_limit = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - k_dim->x = core_limit; - int32_t total_num = batch * num_query * num_heads * num_levels; - if (kernelPolicy == MLUOP_MS_DEFORM_ATTN_BACKWARD_SMALL_CHANNEL) { - total_num = batch * num_query; - } - size_t total_num_align = CEIL_ALIGN(total_num, core_limit); - k_dim->y = (total_num_align / core_limit) > cluster_limit - ? 
cluster_limit - : (total_num_align / core_limit); - k_dim->z = 1; - *k_type = CNRT_FUNC_TYPE_UNION1; -} - -mluOpDeformAttnBackwardKernelPolicy_t msDeformAttnBackwardPolicyFunc( - const mluOpHandle_t handle, const int channels, const int num_levels, - const int num_points, const int num_heads) { - const int num_hlp = num_heads * num_levels * num_points; - int num_per_time_theory = (MAX_NRAM_SIZE - num_levels * sizeof(float) - - 3 * num_levels * sizeof(int32_t)) / - sizeof(float) / (8 * PAD_UP(channels, 32) + 28) / - PAD_UP((num_hlp), 32); - int32_t nlp = num_levels * num_points; - int32_t nlpc = num_levels * num_points * channels; - - if ((handle->arch == MLUOP_MLU590) && (nlp <= FAST_KERNEL_MAX_NLP) && - (nlpc <= FAST_KERNEL_MAX_NLPC)) { - return MLUOP_MS_DEFORM_ATTN_BACKWARD_FAST; - } else if (num_per_time_theory >= 1) { - return MLUOP_MS_DEFORM_ATTN_BACKWARD_SMALL_CHANNEL; - } - return MLUOP_MS_DEFORM_ATTN_BACKWARD_DEFAULT; -} - -/* check user entrance param in mluOpMsDeformAttnBackward */ -static mluOpStatus_t msDeformAttnBackwardParamCheck( - mluOpHandle_t handle, const mluOpTensorDescriptor_t value_desc, - const void *value, const mluOpTensorDescriptor_t spatial_shapes_desc, - const void *spatial_shapes, - const mluOpTensorDescriptor_t level_start_index_desc, - const void *level_start_index, - const mluOpTensorDescriptor_t sampling_loc_desc, const void *sampling_loc, - const mluOpTensorDescriptor_t attn_weight_desc, const void *attn_weight, - const mluOpTensorDescriptor_t grad_output_desc, const void *grad_output, - const int32_t im2col_step, const mluOpTensorDescriptor_t grad_value_desc, - void *grad_value, const mluOpTensorDescriptor_t grad_sampling_loc_desc, - void *grad_sampling_loc, - const mluOpTensorDescriptor_t grad_attn_weight_desc, void *grad_attn_weight, - bool *calc_grad_loc_weight_flag, bool *calc_grad_value_flag, - bool *calc_grad_value_loc_weight_flag) { - // check desc - PARAM_CHECK(API, handle != NULL); - PARAM_CHECK(API, value_desc != NULL); - PARAM_CHECK(API, spatial_shapes_desc != NULL); - PARAM_CHECK(API, level_start_index_desc != NULL); - PARAM_CHECK(API, sampling_loc_desc != NULL); - PARAM_CHECK(API, attn_weight_desc != NULL); - PARAM_CHECK(API, grad_output_desc != NULL); - PARAM_CHECK(API, grad_value_desc != NULL); - PARAM_CHECK(API, grad_sampling_loc_desc != NULL); - PARAM_CHECK(API, grad_attn_weight_desc != NULL); - - // check dim - PARAM_CHECK(API, value_desc->dim == 4); - PARAM_CHECK(API, spatial_shapes_desc->dim == 2); - PARAM_CHECK(API, level_start_index_desc->dim == 1); - PARAM_CHECK(API, sampling_loc_desc->dim == 6); - PARAM_CHECK(API, attn_weight_desc->dim == 5); - PARAM_CHECK(API, grad_output_desc->dim == 4); - PARAM_CHECK(API, grad_value_desc->dim == 4); - PARAM_CHECK(API, grad_sampling_loc_desc->dim == 6); - PARAM_CHECK(API, grad_attn_weight_desc->dim == 5); - - // check datatype - PARAM_CHECK(API, (value_desc->dtype == MLUOP_DTYPE_FLOAT && - spatial_shapes_desc->dtype == MLUOP_DTYPE_INT32 && - level_start_index_desc->dtype == MLUOP_DTYPE_INT32 && - sampling_loc_desc->dtype == MLUOP_DTYPE_FLOAT && - attn_weight_desc->dtype == MLUOP_DTYPE_FLOAT && - grad_output_desc->dtype == MLUOP_DTYPE_FLOAT && - grad_value_desc->dtype == MLUOP_DTYPE_FLOAT && - grad_sampling_loc_desc->dtype == MLUOP_DTYPE_FLOAT && - grad_attn_weight_desc->dtype == MLUOP_DTYPE_FLOAT)); - - const int32_t num_key = value_desc->dims[1]; - const int32_t channels = value_desc->dims[3]; - const int32_t batch = attn_weight_desc->dims[0]; - const int32_t num_query = 
attn_weight_desc->dims[1]; - const int32_t num_heads = attn_weight_desc->dims[2]; - const int32_t num_levels = attn_weight_desc->dims[3]; - const int32_t num_points = attn_weight_desc->dims[4]; - // check input param - const int32_t im2col_step_ = MIN(batch, im2col_step); - PARAM_CHECK(API, im2col_step_ > 0); - PARAM_CHECK(API, batch % im2col_step_ == 0); - - // check all the input relationship - for (int32_t i = 0; i < value_desc->dim; ++i) { - if (value_desc->dims[i] != grad_value_desc->dims[i]) { - LOG(ERROR) << "[mluOpMsDeformAttnBackward] The shape of value should be " - "the same as grad_value." - << " But now value_desc->dims[" << i << "] is " - << value_desc->dims[i] << ", and grad_value_desc->dims[" << i - << "] is " << grad_value_desc->dims[i] << "."; - return MLUOP_STATUS_BAD_PARAM; - } - } - for (int32_t i = 0; i < sampling_loc_desc->dim; ++i) { - if (sampling_loc_desc->dims[i] != grad_sampling_loc_desc->dims[i]) { - LOG(ERROR) << "[mluOpMsDeformAttnBackward] The shape of " - "sampling_loc_desc should be the " - "same as grad_sampling_loc_desc." - << " But now sampling_loc_desc->dims[" << i << "] is " - << sampling_loc_desc->dims[i] - << ", and grad_sampling_loc_desc->dims[" << i << "] is " - << grad_sampling_loc_desc->dims[i] << "."; - return MLUOP_STATUS_BAD_PARAM; - } - } - for (int32_t i = 0; i < attn_weight_desc->dim; ++i) { - if (attn_weight_desc->dims[i] != grad_attn_weight_desc->dims[i]) { - LOG(ERROR) << "[mluOpMsDeformAttnBackward] The shape of " - "attn_weight_desc should be the " - "same as grad_attn_weight_desc." - << " But now attn_weight_desc->dims[" << i << "] is " - << attn_weight_desc->dims[i] - << ", and grad_attn_weight_desc->dims[" << i << "] is " - << grad_attn_weight_desc->dims[i] << "."; - return MLUOP_STATUS_BAD_PARAM; - } - } - PARAM_CHECK_EQ(API, value_desc->dims[0], batch); - PARAM_CHECK_EQ(API, value_desc->dims[2], num_heads); - - PARAM_CHECK_EQ(API, spatial_shapes_desc->dims[0], num_levels); - PARAM_CHECK_EQ(API, spatial_shapes_desc->dims[1], 2); - - PARAM_CHECK_EQ(API, level_start_index_desc->dims[0], num_levels); - - PARAM_CHECK_EQ(API, sampling_loc_desc->dims[0], batch); - PARAM_CHECK_EQ(API, sampling_loc_desc->dims[1], num_query); - PARAM_CHECK_EQ(API, sampling_loc_desc->dims[2], num_heads); - PARAM_CHECK_EQ(API, sampling_loc_desc->dims[3], num_levels); - PARAM_CHECK_EQ(API, sampling_loc_desc->dims[4], num_points); - PARAM_CHECK_EQ(API, sampling_loc_desc->dims[5], 2); - - PARAM_CHECK_EQ(API, grad_output_desc->dims[0], batch); - PARAM_CHECK_EQ(API, grad_output_desc->dims[1], num_query); - PARAM_CHECK_EQ(API, grad_output_desc->dims[2], num_heads); - PARAM_CHECK_EQ(API, grad_output_desc->dims[3], channels); - - TENSOR_NUM_CHECK(API, mluOpGetTensorElementNum(value_desc), LARGE_TENSOR_NUM, - ""); - TENSOR_NUM_CHECK(API, mluOpGetTensorElementNum(sampling_loc_desc), - LARGE_TENSOR_NUM, ""); - - // check zero - if (batch * channels * num_heads * num_query == 0) { - LOG(ERROR) << "[mluOpMsDeformAttnBackward] The batch, channels, num_key, " - "num_heads or " - "num_query of the input is zero."; - return MLUOP_STATUS_BAD_PARAM; - } - if ((num_levels == 0) || ((num_points == 0) && num_key == 0)) { - *calc_grad_value_loc_weight_flag = true; - return MLUOP_STATUS_SUCCESS; - } - if ((num_points == 0) && (num_key != 0)) { - *calc_grad_loc_weight_flag = true; - return MLUOP_STATUS_SUCCESS; - } - if ((num_key == 0) && (num_points != 0)) { - *calc_grad_value_flag = true; - return MLUOP_STATUS_SUCCESS; - } - - PARAM_CHECK(API, value != NULL); - PARAM_CHECK(API, 
spatial_shapes != NULL); - PARAM_CHECK(API, level_start_index != NULL); - PARAM_CHECK(API, sampling_loc != NULL); - PARAM_CHECK(API, attn_weight != NULL); - PARAM_CHECK(API, grad_output != NULL); - PARAM_CHECK(API, grad_value != NULL); - PARAM_CHECK(API, grad_sampling_loc != NULL); - PARAM_CHECK(API, grad_attn_weight != NULL); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpMsDeformAttnBackward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t value_desc, - const void *value, const mluOpTensorDescriptor_t spatial_shapes_desc, - const void *spatial_shapes, - const mluOpTensorDescriptor_t level_start_index_desc, - const void *level_start_index, - const mluOpTensorDescriptor_t sampling_loc_desc, const void *sampling_loc, - const mluOpTensorDescriptor_t attn_weight_desc, const void *attn_weight, - const mluOpTensorDescriptor_t grad_output_desc, const void *grad_output, - const int32_t im2col_step, const mluOpTensorDescriptor_t grad_value_desc, - void *grad_value, const mluOpTensorDescriptor_t grad_sampling_loc_desc, - void *grad_sampling_loc, - const mluOpTensorDescriptor_t grad_attn_weight_desc, - void *grad_attn_weight) { - // entrance param check - bool calc_grad_value_flag = false; - bool calc_grad_loc_weight_flag = false; - bool calc_grad_value_loc_weight_flag = false; - mluOpStatus_t param_check_status = msDeformAttnBackwardParamCheck( - handle, value_desc, value, spatial_shapes_desc, spatial_shapes, - level_start_index_desc, level_start_index, sampling_loc_desc, - sampling_loc, attn_weight_desc, attn_weight, grad_output_desc, - grad_output, im2col_step, grad_value_desc, grad_value, - grad_sampling_loc_desc, grad_sampling_loc, grad_attn_weight_desc, - grad_attn_weight, &calc_grad_loc_weight_flag, &calc_grad_value_flag, - &calc_grad_value_loc_weight_flag); - - if (MLUOP_GEN_CASE_ON_NEW) { - GEN_CASE_START("ms_deform_attn_backward"); - GEN_CASE_HANDLE(handle); - GEN_CASE_DATA_REAL(true, "value", value, value_desc); - GEN_CASE_DATA_REAL(true, "spatial_shapes", spatial_shapes, - spatial_shapes_desc); - GEN_CASE_DATA_REAL(true, "level_start_index", level_start_index, - level_start_index_desc); - GEN_CASE_DATA_REAL(true, "sampling_loc", sampling_loc, sampling_loc_desc); - GEN_CASE_DATA_REAL(true, "attn_weight", attn_weight, attn_weight_desc); - GEN_CASE_DATA_REAL(true, "grad_output", grad_output, grad_output_desc); - GEN_CASE_DATA(false, "grad_value", grad_value, grad_value_desc, 0, 0); - GEN_CASE_DATA(false, "grad_sampling_loc", grad_sampling_loc, - grad_sampling_loc_desc, 0, 0); - GEN_CASE_DATA(false, "grad_attn_weight", grad_attn_weight, - grad_attn_weight_desc, 0, 0); - GEN_CASE_OP_PARAM_SINGLE(0, "ms_deform_attn_backward", "im2col_step", - im2col_step); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); - } - if (MLUOP_STATUS_SUCCESS != param_check_status) { - return param_check_status; - } - - if (calc_grad_loc_weight_flag) { - uint64_t fill_value = 0x0; - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_value_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, grad_value)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; - } - if (calc_grad_value_flag) { - uint64_t fill_value = 0x0; - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_sampling_loc_desc, - cnnl_output_desc); - 
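// Reached when num_key == 0 (see msDeformAttnBackwardParamCheck): with no
// keys to sample from, grad_sampling_loc and grad_attn_weight are
// identically zero, so both outputs are cleared below before the early return.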
CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, grad_sampling_loc)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_attn_weight_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, grad_attn_weight)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; - } - if (calc_grad_value_loc_weight_flag) { - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; - } - VLOG(5) << "[mluOpMsDeformAttnBackward] cnnlFill_v3 start."; - uint64_t fill_value = 0x0; - - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_value_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, grad_value)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_sampling_loc_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, grad_sampling_loc)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grad_attn_weight_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, grad_attn_weight)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - - VLOG(5) << "[mluOpMsDeformAttnBackward] cnnlFill_v3 end."; - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - const int32_t spatial_size = value_desc->dims[1]; - const int32_t batch = attn_weight_desc->dims[0]; - const int32_t channels = value_desc->dims[3]; - const int32_t num_query = attn_weight_desc->dims[1]; - const int32_t num_heads = attn_weight_desc->dims[2]; - const int32_t num_levels = attn_weight_desc->dims[3]; - const int32_t num_points = attn_weight_desc->dims[4]; - // generate mluOpMsDeformAttnBackward prototxt start! 
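The dispatch that follows relies on the NRAM-occupancy estimate computed by msDeformAttnBackwardPolicyFunc earlier in this file. Below is a minimal standalone sketch of that arithmetic, for reference only: kMaxNramSize is an assumed stand-in for the build-time MAX_NRAM_SIZE constant, pad_up mirrors the PAD_UP macro, and the shape values are hypothetical.

#include <cstdint>
#include <cstdio>

// Round x up to the next multiple of n (mirrors the PAD_UP macro).
static int32_t pad_up(int32_t x, int32_t n) { return (x + n - 1) / n * n; }

int main() {
  const int32_t kMaxNramSize = 384 * 1024;  // assumed NRAM budget in bytes
  const int32_t channels = 32, num_heads = 8, num_levels = 4, num_points = 4;
  const int32_t num_hlp = num_heads * num_levels * num_points;
  // Same expression as msDeformAttnBackwardPolicyFunc: bytes left after the
  // per-level tables, divided by the per-query footprint.
  const int32_t num_per_time_theory =
      (kMaxNramSize - num_levels * (int32_t)sizeof(float) -
       3 * num_levels * (int32_t)sizeof(int32_t)) /
      (int32_t)sizeof(float) / (8 * pad_up(channels, 32) + 28) /
      pad_up(num_hlp, 32);
  // num_per_time_theory >= 1 means at least one query's worth of data fits
  // in NRAM, which makes the small-channels kernel selectable.
  std::printf("num_per_time_theory = %d\n", num_per_time_theory);
  return 0;
}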
- - VLOG(5) << "[mluOpMsDeformAttnBackward] batch: " << batch; - VLOG(5) << "[mluOpMsDeformAttnBackward] channels: " << channels; - VLOG(5) << "[mluOpMsDeformAttnBackward] num_query: " << num_query; - VLOG(5) << "[mluOpMsDeformAttnBackward] num_heads: " << num_heads; - VLOG(5) << "[mluOpMsDeformAttnBackward] num_levels: " << num_levels; - VLOG(5) << "[mluOpMsDeformAttnBackward] num_points: " << num_points; - VLOG(5) << "[mluOpMsDeformAttnBackward] spatial_size: " << spatial_size; - - mluOpDeformAttnBackwardKernelPolicy_t kernelPolicy = - msDeformAttnBackwardPolicyFunc(handle, channels, num_levels, num_points, - num_heads); - - policyFunc(handle, batch, num_query, num_heads, num_levels, &k_type, &k_dim, - kernelPolicy); - switch (kernelPolicy) { - case MLUOP_MS_DEFORM_ATTN_BACKWARD_FAST: { - VLOG(5) << "Launch Kernel MsDeformAttnBackwardFast<<>>"; - CHECK_RETURN( - "[MsDeformAttnBackwardFast]", - KernelMsDeformAttnBackwardFast( - k_dim, k_type, handle->queue, (float *)value, - (int32_t *)spatial_shapes, (int32_t *)level_start_index, - (float *)sampling_loc, (float *)attn_weight, (float *)grad_output, - batch, spatial_size, num_heads, channels, num_levels, num_query, - num_points, (float *)grad_value, (float *)grad_sampling_loc, - (float *)grad_attn_weight)); - } break; - case MLUOP_MS_DEFORM_ATTN_BACKWARD_DEFAULT: { - VLOG(5) << "Launch Kernel MsDeformAttnBackwardDefault<<>>"; - CHECK_RETURN( - "[MsDeformAttnBackwardDefault]", - KernelMsDeformAttnBackwardDefault( - k_dim, k_type, handle->queue, (float *)value, - (int32_t *)spatial_shapes, (int32_t *)level_start_index, - (float *)sampling_loc, (float *)attn_weight, (float *)grad_output, - batch, spatial_size, num_heads, channels, num_levels, num_query, - num_points, (float *)grad_value, (float *)grad_sampling_loc, - (float *)grad_attn_weight)); - } break; - case MLUOP_MS_DEFORM_ATTN_BACKWARD_SMALL_CHANNEL: { - VLOG(5) << "Launch Kernel MsDeformAttnBackwardSmallChannels<<>>"; - CHECK_RETURN( - "[MsDeformAttnBackwardSmallChannels]", - KernelMsDeformAttnBackwardSmallChannels( - k_dim, k_type, handle->queue, (float *)value, - (int32_t *)spatial_shapes, (int32_t *)level_start_index, - (float *)sampling_loc, (float *)attn_weight, (float *)grad_output, - batch, spatial_size, num_heads, channels, num_levels, num_query, - num_points, (float *)grad_value, (float *)grad_sampling_loc, - (float *)grad_attn_weight)); - } break; - default: { - VLOG(5) << "Not Implemented."; - } - } - - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/ms_deform_attn_backward/ms_deform_attn_backward.h b/kernels/ms_deform_attn_backward/ms_deform_attn_backward.h deleted file mode 100644 index 64237ff6d..000000000 --- a/kernels/ms_deform_attn_backward/ms_deform_attn_backward.h +++ /dev/null @@ -1,61 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software.
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#ifndef KERNELS_MS_DEFORM_ATTN_BACKWARD_MS_DEFORM_ATTN_BACKWARD_H -#define KERNELS_MS_DEFORM_ATTN_BACKWARD_MS_DEFORM_ATTN_BACKWARD_H - -#include "mlu_op.h" - -#define FAST_KERNEL_MAX_NLP (128) -#define FAST_KERNEL_MAX_NLPC (16384) - -mluOpStatus_t MLUOP_WIN_API KernelMsDeformAttnBackwardSmallChannels( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const float *data_value, const int32_t *spatial_shapes, - const int32_t *data_level_start_index, const float *data_sampling_loc, - const float *data_attn_weight, const float *grad_output, - const int32_t batch, const int32_t spatial_size, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_query, - const int32_t num_points, float *grad_value, float *grad_sampling_loc, - float *grad_attn_weight); - -mluOpStatus_t MLUOP_WIN_API KernelMsDeformAttnBackwardDefault( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const float *data_value, const int32_t *spatial_shapes, - const int32_t *data_level_start_index, const float *data_sampling_loc, - const float *data_attn_weight, const float *grad_output, - const int32_t batch, const int32_t spatial_size, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_query, - const int32_t num_points, float *grad_value, float *grad_sampling_loc, - float *grad_attn_weight); - -mluOpStatus_t MLUOP_WIN_API KernelMsDeformAttnBackwardFast( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const float *data_value, const int32_t *spatial_shapes, - const int32_t *data_level_start_index, const float *data_sampling_loc, - const float *data_attn_weight, const float *grad_output, - const int32_t batch, const int32_t spatial_size, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_query, - const int32_t num_points, float *grad_value, float *grad_sampling_loc, - float *grad_attn_weight); - -#endif // KERNELS_MS_DEFORM_ATTN_BACKWARD_MS_DEFORM_ATTN_BACKWARD_H diff --git a/kernels/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu b/kernels/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu deleted file mode 100644 index 7b4a97527..000000000 --- a/kernels/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu +++ /dev/null @@ -1,626 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ - -#include "kernels/ms_deform_attn_backward/ms_deform_attn_backward.h" -#include "kernels/ms_deform_attn_forward/ms_deform_attn_utils.h" - -#include "core/logging.h" - -#if (__BANG_ARCH__ == 592) - -#define MAX_MEMCPY_SEGNUM (65536) -#define NRAM_REMAIN_SIZE (48 * 1024) -#define SRAM_REMAIN_SIZE (32 * 1024) -#define NRAM_AVALIABLE_SIZE (__MLU_NRAM_SIZE__ * 1024 - NRAM_REMAIN_SIZE) -#define WRAM_AVALIABLE_SIZE (__MLU_WRAM_SIZE__ * 1024) -#define SRAM_AVALIABLE_SIZE (__MLU_SRAM_SIZE__ * 1024 - SRAM_REMAIN_SIZE) - -__nram__ char nram_buffer[NRAM_AVALIABLE_SIZE]; -__mlu_shared__ char sram_buffer[SRAM_AVALIABLE_SIZE]; -__wram__ char wram_buffer[WRAM_AVALIABLE_SIZE]; - -__mlu_func__ void loadNram2Gpr(int32_t& v1, int32_t& v2, int32_t& v3, - int32_t& v4, const int32_t* p1, - const int32_t* p2, const int32_t* p3, - const int32_t* p4) { - v1 = __load_nram(p1); - v2 = __load_nram(p2); - v3 = __load_nram(p3); - v4 = __load_nram(p4); -} - -template -__mlu_func__ void memPolicyBackward( - int32_t*& seq_nram, T*& zeros_nram, int32_t*& data_offset_nram, - T*& weight_polation_nram, T*& cond_point_polation_nram, - T*& cond_point_valid_nram, T*& delta_xy_nram, T*& loc_nram, T*& buf_nram, - T*& buf_nram_end, int8_t*& mask_x_nram, int8_t*& mask_y_nram, - T*& spatial_offset_bd_nram, T*& spatial_w_bd_nram, T*& spatial_h_bd_nram, - int32_t*& spatial_offset_nram, int32_t*& spatial_hw_nram, - T*& compute_buffer, // (5, deal_n, num_levels, num_points, channels) - T*& weight_polation_nram_stg2, T*& weight_attn_nram_stg2, - int32_t*& offset_nram_stg2, T*& grad_output_nram, - int8_t*& bit_cond_nram, // (4, pad_points / 8) - int8_t*& bit_cond_reverse_nram, // (4, pad_points / 8) - T*& cond_nram_stg2, - T*& compute_buffer_nram_stg3, // (4, max_deal_n, num_levels, num_points) - T*& delta_xy_nram_stg3, // (4, max_deal_n, num_levels, num_points) - T*& grad_wp_nram_stg3, // (4, total_deal_n, num_levels, num_points) - int32_t*& data_offset_sram, T*& weight_polation_sram, T*& grad_wp_sram, - T*& weight_attn_sram, T*& cond_point_polation_sram, T*& delta_xy_sram, - char* nram_buffer, char* sram_buffer, int32_t& max_cached_n, - int32_t& stage_1_max_deal_n, int32_t& stage_2_max_deal_n, - int32_t& stage_3_max_deal_n, int32_t& mask_size, - const int32_t nram_avaliable_size, const int32_t sram_avaliable_size, - const int32_t batch_size, const int32_t num_keys, const int32_t 
num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points) { - const int32_t num_points_levels = num_levels * num_points; - const int32_t spatial_info_size = - PAD_UP(3 * num_levels * sizeof(int32_t), WRAM_ALIGN_SIZE); - const int32_t spatial_info_bd_size = - PAD_UP(3 * num_points_levels * sizeof(T), WRAM_ALIGN_SIZE); - const int32_t zeros_size = PAD_UP(channels * sizeof(T), WRAM_ALIGN_SIZE); - const int32_t seq_size = BACKWARD_MAX_NQ_NL_NP * sizeof(int32_t); - const int32_t fix_space_size = spatial_info_size + - 2 * BIT_COLLECT_PAD * sizeof(T) + - spatial_info_bd_size + zeros_size + seq_size; - const int32_t left_space_size = nram_avaliable_size - fix_space_size; - stage_1_max_deal_n = left_space_size / (24 * num_points_levels * sizeof(T)); - const int32_t total_points = stage_1_max_deal_n * num_points_levels; - const int32_t total_coord_pad = PAD_UP(total_points * 2, BIT_COLLECT_PAD); - mask_size = PAD_UP(total_coord_pad / BIT_COLLECT_PAD, WRAM_ALIGN_SIZE); - stage_2_max_deal_n = - (left_space_size - 2 * mask_size - 7 * WRAM_ALIGN_SIZE) / - ((5 * num_points_levels * channels + 20 * num_points_levels + channels) * - sizeof(T)); - stage_2_max_deal_n = - std::min(BACKWARD_MAX_NQ_NL_NP / num_points_levels, stage_2_max_deal_n); - stage_3_max_deal_n = (left_space_size - 2 * mask_size - 2 * WRAM_ALIGN_SIZE) / - (12 * num_points_levels * sizeof(T)); - // fix nram space - seq_nram = (int32_t*)(nram_buffer); - zeros_nram = (T*)(seq_nram + seq_size / sizeof(int32_t)); - spatial_offset_nram = (int32_t*)(zeros_nram + zeros_size / sizeof(T)); - spatial_hw_nram = spatial_offset_nram + num_levels; - spatial_offset_bd_nram = - (T*)((int8_t*)spatial_offset_nram + spatial_info_size); - spatial_w_bd_nram = spatial_offset_bd_nram + num_points_levels; - spatial_h_bd_nram = spatial_w_bd_nram + num_points_levels; - mask_x_nram = (int8_t*)spatial_offset_bd_nram + spatial_info_bd_size; - mask_y_nram = mask_x_nram + mask_size; - // stage1 nram space - // 4 + 4 + 4 + 4 + 1 + 6 - data_offset_nram = (int32_t*)(mask_y_nram + mask_size); - delta_xy_nram = (T*)(data_offset_nram + 4 * total_points); - weight_polation_nram = delta_xy_nram + 4 * total_points; - cond_point_polation_nram = weight_polation_nram + 4 * total_points; - cond_point_valid_nram = cond_point_polation_nram + 4 * total_points; - buf_nram = cond_point_valid_nram + total_points; - loc_nram = buf_nram + 4 * total_points; - buf_nram_end = buf_nram + 6 * total_points + total_coord_pad; - // stage2 nram space - const int32_t total_points_stg2 = stage_2_max_deal_n * num_points_levels; - const int32_t compute_buffer_size_pad = - 5 * PAD_UP(total_points_stg2 * channels * sizeof(T), WRAM_ALIGN_SIZE); - const int32_t bit_cond_pad_size = - PAD_UP(PAD_UP(total_points_stg2, BIT_COLLECT_PAD) / BIT_COLLECT_PAD * 5, - WRAM_ALIGN_SIZE); - cond_nram_stg2 = (T*)(mask_y_nram + mask_size); - bit_cond_nram = (int8_t*)cond_nram_stg2 + - PAD_UP(5 * total_points_stg2 * sizeof(T), WRAM_ALIGN_SIZE); - bit_cond_reverse_nram = bit_cond_nram + bit_cond_pad_size; - compute_buffer = (T*)(bit_cond_reverse_nram + bit_cond_pad_size); - grad_output_nram = compute_buffer + compute_buffer_size_pad / sizeof(T); - weight_polation_nram_stg2 = grad_output_nram + stage_2_max_deal_n * channels; - weight_attn_nram_stg2 = weight_polation_nram_stg2 + 4 * total_points_stg2; - offset_nram_stg2 = (int32_t*)(weight_attn_nram_stg2 + total_points_stg2); - // stage3 nram space - const int32_t total_points_stg3 = stage_3_max_deal_n * 
num_points_levels; - compute_buffer_nram_stg3 = (T*)(mask_y_nram + mask_size); - delta_xy_nram_stg3 = compute_buffer_nram_stg3 + 4 * total_points_stg3; - grad_wp_nram_stg3 = delta_xy_nram_stg3 + 4 * total_points_stg3; - // sram space: 4 + 4 + 1 + 5 + 4 - const int32_t polation_info_size = 18 * num_points_levels * sizeof(T); - const int32_t avg_sram_size = sram_avaliable_size / coreDim; - max_cached_n = avg_sram_size / polation_info_size; - const int32_t max_cached_points = max_cached_n * num_points_levels; - T* sram_buf_base = (T*)(sram_buffer + avg_sram_size * coreId); - data_offset_sram = (int32_t*)sram_buf_base; - weight_polation_sram = (T*)(data_offset_sram + 4 * max_cached_points); - weight_attn_sram = (T*)(weight_polation_sram + 4 * max_cached_points); - cond_point_polation_sram = (T*)(weight_attn_sram + max_cached_points); - delta_xy_sram = (T*)(cond_point_polation_sram + 5 * max_cached_points); - grad_wp_sram = weight_polation_sram; // reuse -} - -template -__mlu_func__ void backwardStageTwoLoop( - int32_t* seq_nram, T* compute_buffer_nram, T* zeros_nram, - T* weight_polation_nram, T* weight_attn_nram, int32_t* offset_nram, - T* cond_nram, int8_t* bit_cond_nram, int8_t* bit_cond_reverse_nram, - T* grad_output_nram, T* delta_xy_nram, int32_t* data_offset_sram, - T* weight_polation_sram, T* grad_wp_sram, T* weight_attn_sram, - T* cond_point_polation_sram, T* delta_xy_sram, T* data_value_gdram, - T* grad_output_gdram, T* grad_value_gdram, T* grad_attn_weight_gdram, - char* wram_buffer, const int32_t total_deal_n, const int32_t max_deal_n, - const int32_t input_stride_2, const int32_t input_stride_3, - const int32_t output_stride_2, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, - const int32_t num_points) { - const int32_t num_levels_points = num_levels * num_points; - const int32_t loop_num = (total_deal_n + max_deal_n - 1) / max_deal_n; - int32_t* offset_zero_nram_stg2 = - offset_nram + 4 * max_deal_n * num_levels_points; - const int32_t src_stride = total_deal_n * num_levels_points * sizeof(T); - for (int i = 0; i < loop_num; i++) { - int32_t deal_n = std::min(total_deal_n - i * max_deal_n, max_deal_n); - int32_t copy_size_1 = deal_n * num_levels_points * sizeof(T); - int32_t copy_size_2 = deal_n * num_levels_points * sizeof(int32_t); - int32_t sram_src_offset = i * max_deal_n * num_levels_points; - int32_t nq_nl_np_c = deal_n * num_levels_points * channels; - int32_t nq_nl_np = deal_n * num_levels_points; - int32_t nq_nl_np_4 = 4 * deal_n * num_levels_points; - - __memcpy_async(grad_output_nram, - grad_output_gdram + i * max_deal_n * output_stride_2, - channels * sizeof(T), GDRAM2NRAM, channels * sizeof(T), - output_stride_2 * sizeof(T), deal_n - 1); - __memcpy_async(offset_nram, data_offset_sram + sram_src_offset, copy_size_2, - SRAM2NRAM, copy_size_2, src_stride, 3); - __memcpy_async(cond_nram, cond_point_polation_sram + sram_src_offset, - copy_size_1, SRAM2NRAM, copy_size_1, src_stride, 4); - __memcpy_async(weight_attn_nram, weight_attn_sram + sram_src_offset, - copy_size_1, SRAM2NRAM); - __memcpy_async(weight_polation_nram, weight_polation_sram + sram_src_offset, - copy_size_1, SRAM2NRAM, copy_size_1, src_stride, 3); - __bang_write_value(offset_zero_nram_stg2, nq_nl_np_4, (int32_t)0); - __sync_move(); - - T* tmp_zero = (T*)offset_zero_nram_stg2; - int32_t nq_nl_np_pad8 = PAD_UP(nq_nl_np, BIT_COLLECT_PAD); - int32_t bit_cond_stride = nq_nl_np_pad8 / BIT_COLLECT_PAD; - if (nq_nl_np_pad8 == nq_nl_np) { - int32_t bit_cond_stride_4 = 4 * 
bit_cond_stride; - __bang_gt_bitindex((T*)bit_cond_nram, cond_nram, tmp_zero, nq_nl_np_4); - __bang_bnot((char*)bit_cond_reverse_nram, (char*)bit_cond_nram, - 4 * bit_cond_stride); - __bang_gt_bitindex((T*)(bit_cond_nram + bit_cond_stride_4), - cond_nram + nq_nl_np_4, tmp_zero, nq_nl_np); - __bang_bnot((char*)(bit_cond_reverse_nram + bit_cond_stride_4), - (char*)(bit_cond_nram + bit_cond_stride_4), bit_cond_stride); - } else { - for (int j = 0; j < 5; j++) { - __bang_gt_bitindex((T*)((int8_t*)bit_cond_nram + j * bit_cond_stride), - cond_nram + j * nq_nl_np, tmp_zero, nq_nl_np_pad8); - __bang_bnot((char*)bit_cond_reverse_nram + j * bit_cond_stride, - (char*)bit_cond_nram + j * bit_cond_stride, - bit_cond_stride); - } - } - - __sync_io_move_compute(); - - int32_t buffer_size_pad = PAD_UP(nq_nl_np_c * sizeof(T), WRAM_ALIGN_SIZE); - int32_t buffer_data_num = buffer_size_pad / sizeof(T); - T* inter_grad = compute_buffer_nram; - T* v_ping = inter_grad + buffer_data_num; - T* v_pong = v_ping + buffer_data_num; - T* value_wp = v_pong + buffer_data_num; - T* buffer = value_wp + buffer_data_num; - - for (int j = 0; j < 5; j++) { - T* tmp_wp = weight_polation_nram + (j - 1) * nq_nl_np; - if (j < 4) { - gatherAsync(v_ping, zeros_nram, (unsigned int*)offset_zero_nram_stg2, - bit_cond_reverse_nram + j * bit_cond_stride, - channels * sizeof(T), NRAM2NRAM, channels * sizeof(T), - nq_nl_np); - gatherAsync(v_ping, data_value_gdram, - (unsigned int*)offset_nram + j * nq_nl_np, - bit_cond_nram + j * bit_cond_stride, channels * sizeof(T), - GDRAM2NRAM, channels * sizeof(T), nq_nl_np); - } - - if (j == 0) { - // (n, c) => (n, nl, np, c) - __memcpy_async(buffer, grad_output_nram, channels * sizeof(T), - NRAM2NRAM, channels * sizeof(T), num_levels_points - 1, - num_levels_points * channels * sizeof(T), deal_n - 1, 0, - num_levels_points - 1, channels * sizeof(T), deal_n - 1); - gatherAsync(buffer, zeros_nram, (unsigned int*)offset_zero_nram_stg2, - bit_cond_reverse_nram + 4 * bit_cond_stride, - channels * sizeof(T), NRAM2NRAM, channels * sizeof(T), - nq_nl_np); - __bang_write_value(value_wp, nq_nl_np_c, (T)0); // clear value*wp - __sync_move(); - // (n, nl, np, c) => (c, n, nl, np) - __bang_transpose(v_pong, buffer, nq_nl_np, channels); - __sync_compute(); - // (c, n, nl, np) * (n, nl, np) - __bang_cycle_mul(inter_grad, v_pong, weight_attn_nram, nq_nl_np_c, - nq_nl_np); - __memcpy_async(wram_buffer, v_pong, buffer_size_pad, NRAM2WRAM); - } - - if (j == 4) { - __memcpy_async(v_ping, wram_buffer, buffer_size_pad, WRAM2NRAM); - } - - if (j > 0) { - // (n, nl, np, c) => (c, n, nl, np) - __bang_transpose(buffer, v_pong, nq_nl_np, channels); - // (c, n, nl, np) * (n, nl, np) - __bang_cycle_mul(v_pong, buffer, tmp_wp, nq_nl_np_c, nq_nl_np); - __bang_add(value_wp, value_wp, v_pong, nq_nl_np_c); - __bang_mul(v_pong, buffer, inter_grad, nq_nl_np_c); - // (c, nq, nl, np) => (nq, nl, np) - __bang_sumpool(buffer, v_pong, nq_nl_np, channels, 1, channels, 1, 1, - 1); - __bang_float2int32((int32_t*)v_pong, cond_nram + (j - 1) * nq_nl_np, - nq_nl_np, 0); - __bang_mul_scalar((int32_t*)v_pong, (int32_t*)v_pong, - (int32_t)0xffffffff, nq_nl_np); - __bang_band((char*)buffer, (char*)buffer, (char*)v_pong, - nq_nl_np * sizeof(T)); - // (nq, nl, np) => (Nq, nl, np) - __sync_compute(); - __memcpy_async(grad_wp_sram + sram_src_offset + - (j - 1) * total_deal_n * num_levels_points, - buffer, nq_nl_np * sizeof(T), NRAM2SRAM); - } - - T* tmp = v_ping; - v_ping = v_pong; - v_pong = tmp; - - __sync_io_move_compute(); - } - - // compute 
grad_attn_weight - T* grad_output_bd = v_pong; - __bang_mul(v_ping, value_wp, grad_output_bd, nq_nl_np_c); - // (c, nq, nl, np) => (nq, nl, np) - __bang_sumpool(buffer, v_ping, nq_nl_np, channels, 1, channels, 1, 1, 1); - __memcpy(grad_attn_weight_gdram + i * max_deal_n * input_stride_3, buffer, - num_levels_points * sizeof(T), NRAM2GDRAM, - input_stride_3 * sizeof(T), num_levels_points * sizeof(T), - deal_n - 1); - - // compute grad_value - T* grad_value_buffer = inter_grad + buffer_data_num; - int32_t neighbor_order[4] = {1, 3, 0, 2}; - for (int k = 0; k < 4; k++) { - int neighbor_idx = neighbor_order[k]; - T* grad_value_tmp = k < 3 ? buffer : inter_grad; - __bang_cycle_mul(grad_value_tmp, inter_grad, - weight_polation_nram + neighbor_idx * nq_nl_np, - nq_nl_np_c, nq_nl_np); - __bang_transpose(grad_value_buffer + k * buffer_data_num, grad_value_tmp, - channels, nq_nl_np); - } - - // store all valid point - T* cond_all_valid = cond_nram + nq_nl_np_4; - __bang_and(cond_all_valid, cond_nram, cond_nram + nq_nl_np, nq_nl_np); - __bang_and(cond_all_valid, cond_all_valid, cond_nram + 2 * nq_nl_np, - nq_nl_np); - __bang_and(cond_all_valid, cond_all_valid, cond_nram + 3 * nq_nl_np, - nq_nl_np); - int32_t all_valid_count = __bang_sum(cond_all_valid, nq_nl_np); - int32_t* dst_offset = (int32_t*)offset_zero_nram_stg2; - for (int i = 0; i < 4; i++) { - __bang_collect((T*)dst_offset + i * nq_nl_np, - (T*)offset_nram + i * nq_nl_np, cond_all_valid, nq_nl_np); - } - int32_t* src_offset = (int32_t*)inter_grad; - int32_t* stride_4_2 = dst_offset + 3 * nq_nl_np; - int32_t* stride_1_2 = dst_offset; - __bang_collect((T*)src_offset, (T*)seq_nram, cond_all_valid, nq_nl_np); - __bang_mul_scalar(src_offset, src_offset, channels * sizeof(T), nq_nl_np); - __bang_sub(stride_4_2, stride_4_2, dst_offset + nq_nl_np, nq_nl_np); - __bang_sub(stride_1_2, stride_1_2, dst_offset + nq_nl_np, nq_nl_np); - int src_stride_1 = buffer_size_pad; - int src_stride_2 = src_stride_1 * 2; - int32_t* dst_offset_base = dst_offset + nq_nl_np; - int32_t dst_offset_2, src_offset_2, dst_stride_4_2, dst_stride_1_2; - for (int s = 0; s < all_valid_count; s++) { - loadNram2Gpr(dst_offset_2, src_offset_2, dst_stride_4_2, dst_stride_1_2, - dst_offset_base + s, src_offset + s, stride_4_2 + s, - stride_1_2 + s); - __bang_atomic_reduce_add((T*)((int8_t*)grad_value_gdram + dst_offset_2), - (T*)((int8_t*)grad_value_buffer + src_offset_2), - channels, 1, 1, dst_stride_4_2, dst_stride_1_2, - src_stride_1, src_stride_2); - } - - // store partial valid point - __bang_not(cond_all_valid, cond_all_valid, nq_nl_np); - __bang_cycle_and(cond_nram, cond_nram, cond_all_valid, nq_nl_np_4, - nq_nl_np); - for (int k = 0; k < 4; k++) { - int32_t offset = neighbor_order[k] * nq_nl_np; - T* grad_value_tmp = grad_value_buffer + k * buffer_data_num; - T* tmp_cond = cond_nram + offset; - int32_t* tmp_dst_offset = offset_nram + offset; - int32_t* tmp_src_offset = (int32_t*)inter_grad; - int32_t valid_count = __bang_sum(tmp_cond, nq_nl_np); - if (valid_count > 0) { - __bang_collect((T*)tmp_dst_offset, (T*)tmp_dst_offset, tmp_cond, - nq_nl_np); - __bang_collect((T*)tmp_src_offset, (T*)seq_nram, tmp_cond, nq_nl_np); - __bang_mul_scalar(tmp_src_offset, tmp_src_offset, channels * sizeof(T), - valid_count); - for (int p = 0; p < valid_count; p++) { - __bang_atomic_reduce_add( - (T*)((int8_t*)grad_value_gdram + tmp_dst_offset[p]), - (T*)((int8_t*)grad_value_tmp + tmp_src_offset[p]), channels); - } - } - } - __sync_io_move_compute(); - } -} - -template -__mlu_func__ void 
backwardStageThreeLoop( - T* compute_buffer_nram, T* delta_xy_nram, T* grad_wp_nram, - T* spatial_h_bd_nram, T* spatial_w_bd_nram, T* delta_xy_sram, - T* grad_wp_sram, T* grad_loc_gdram, const int32_t total_deal_n, - const int32_t max_deal_n, const int32_t input_stride_2, - const int32_t input_stride_3, const int32_t output_stride_2, - const int32_t num_heads, const int32_t channels, const int32_t num_levels, - const int32_t num_points) { - const int32_t loop_num = (total_deal_n + max_deal_n - 1) / max_deal_n; - const int32_t num_levels_points = num_levels * num_points; - const int32_t src_stride = total_deal_n * num_levels_points * sizeof(T); - /* - grad_dx = (grad_w3-grad_w1)*dy + (grad_w4-grad_w2)*(1-dy) - grad_loc_x = grad_dx * W - grad_dy = (grad_w3-grad_w4)*dx + (grad_w1-grad_w2)*(1-dx) - grad_loc_y = grad_dy * H - */ - for (int i = 0; i < loop_num; i++) { - int32_t deal_n = std::min(total_deal_n - i * max_deal_n, max_deal_n); - int32_t copy_size = deal_n * num_levels_points * sizeof(T); - int32_t sram_src_offset = i * max_deal_n * num_levels_points; - int32_t nq_nl_np = deal_n * num_levels_points; - T* grad_wp_1 = grad_wp_nram; - T* grad_wp_2 = grad_wp_nram + nq_nl_np; - T* grad_wp_3 = grad_wp_nram + 2 * nq_nl_np; - T* grad_wp_4 = grad_wp_nram + 3 * nq_nl_np; - T* dx = delta_xy_nram; - T* dx_1 = delta_xy_nram + nq_nl_np; - T* dy = delta_xy_nram + 2 * nq_nl_np; - T* dy_1 = delta_xy_nram + 3 * nq_nl_np; - T* buf_1 = compute_buffer_nram; - T* buf_2 = compute_buffer_nram + nq_nl_np; - T* buf_3 = compute_buffer_nram + 2 * nq_nl_np; - __memcpy(delta_xy_nram, delta_xy_sram + sram_src_offset, copy_size, - SRAM2NRAM, copy_size, src_stride, 3); - __memcpy(grad_wp_nram, grad_wp_sram + sram_src_offset, copy_size, SRAM2NRAM, - copy_size, src_stride, 3); - __bang_fusion(FUSION_FSM, buf_1, grad_wp_3, grad_wp_1, dy, nq_nl_np, - nq_nl_np); - __bang_fusion(FUSION_FSM, buf_2, grad_wp_4, grad_wp_2, dy_1, nq_nl_np, - nq_nl_np); - __bang_add(buf_1, buf_1, buf_2, nq_nl_np); - __bang_cycle_mul(buf_1, buf_1, spatial_w_bd_nram, nq_nl_np, - num_levels_points); - __bang_fusion(FUSION_FSM, buf_2, grad_wp_3, grad_wp_4, dx, nq_nl_np, - nq_nl_np); - __bang_fusion(FUSION_FSM, buf_3, grad_wp_1, grad_wp_2, dx_1, nq_nl_np, - nq_nl_np); - __bang_add(buf_2, buf_2, buf_3, nq_nl_np); - __bang_cycle_mul(buf_2, buf_2, spatial_h_bd_nram, nq_nl_np, - num_levels_points); - __bang_transpose(buf_3, buf_1, 2, - nq_nl_np); // (2, nq_nl_np) -> (nq_nl_np, 2) - __memcpy(grad_loc_gdram + i * max_deal_n * input_stride_3 * 2, buf_3, - input_stride_2 * 2 * sizeof(T), NRAM2GDRAM, - input_stride_3 * 2 * sizeof(T), input_stride_2 * 2 * sizeof(T), - deal_n - 1); - } -} - -#endif - -__mlu_global__ void MLUUnion1KernelMsDeformAttnBackwardFastKernel( - const float* data_value, const int32_t* spatial_shapes, - const int32_t* data_level_start_index, const float* data_sampling_loc, - const float* data_attn_weight, const float* grad_output, - const int32_t batch, const int32_t spatial_size, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_query, - const int32_t num_points, float* grad_value, float* grad_sampling_loc, - float* grad_attn_weight) { -#if (__BANG_ARCH__ == 592) - using T = float; - const int32_t num_keys = spatial_size; - const int32_t input_stride_4 = - num_query * num_heads * num_levels * num_points; - const int32_t input_stride_3 = num_heads * num_levels * num_points; - const int32_t input_stride_2 = num_levels * num_points; - const int32_t output_stride_3 = num_query * num_heads * 
channels; - const int32_t output_stride_2 = num_heads * channels; - const int32_t data_value_stride_3 = num_keys * num_heads * channels; - - int32_t* seq_nram = nullptr; // (1024) - T* zeros_nram = nullptr; // (channels) - int32_t* data_offset_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* weight_polation_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* cond_point_polation_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* cond_point_valid_nram = nullptr; // (deal_n, num_levels, num_points) - T* loc_nram = nullptr; // (deal_n, num_levels, num_points, 2) - T* buf_nram = nullptr; // (6, deal_n, num_levels, num_points) - T* buf_nram_end = nullptr; - int8_t* mask_x_nram = nullptr; // (deal_n, num_levels, num_points, 2) / 8 - int8_t* mask_y_nram = nullptr; // (deal_n, num_levels, num_points, 2) / 8 - T* spatial_offset_bd_nram = nullptr; // (num_levels, num_points) - T* spatial_w_bd_nram = nullptr; // (num_levels, num_points) - T* spatial_h_bd_nram = nullptr; // (num_levels, num_points) - int32_t* spatial_offset_nram = nullptr; // (num_levels) - int32_t* spatial_hw_nram = nullptr; // (num_levels, 2) - T* compute_buffer_nram_stg2 = - nullptr; // (deal_n, num_levels, num_points, channels) - T* weight_polation_nram_stg2 = - nullptr; // (4, deal_n, num_levels, num_points) - T* delta_xy_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* weight_attn_nram_stg2 = nullptr; // (1, deal_n, num_levels, num_points) - int32_t* offset_nram_stg2 = nullptr; // (4, deal_n, num_levels, num_points) - T* grad_output_nram = nullptr; // (deal_n, channels) - T* cond_nram_stg2 = nullptr; // (4, deal_n, num_levels, num_points) - T* compute_buffer_nram_stg3 = - nullptr; // (4, max_deal_n, num_levels, num_points) - T* delta_xy_nram_stg3 = nullptr; // (4, max_deal_n, num_levels, num_points) - T* grad_wp_nram_stg3 = nullptr; - T* value_sram = nullptr; // (num_keys, channels) - int32_t* data_offset_sram = nullptr; - T* weight_polation_sram = nullptr; - T* grad_wp_sram = nullptr; - T* weight_attn_sram = nullptr; - T* cond_point_polation_sram = nullptr; - T* delta_xy_sram = nullptr; - int8_t* bit_cond_nram = nullptr; // (4, pad_points / 8) - int8_t* bit_cond_reverse_nram = nullptr; // (4, pad_points / 8) - int32_t stage_1_max_deal_n = 0; - int32_t stage_2_max_deal_n = 0; - int32_t stage_3_max_deal_n = 0; - int32_t max_cached_n = 0; - int32_t mask_size = 0; - memPolicyBackward( - seq_nram, zeros_nram, data_offset_nram, weight_polation_nram, - cond_point_polation_nram, cond_point_valid_nram, delta_xy_nram, loc_nram, - buf_nram, buf_nram_end, mask_x_nram, mask_y_nram, spatial_offset_bd_nram, - spatial_w_bd_nram, spatial_h_bd_nram, spatial_offset_nram, - spatial_hw_nram, compute_buffer_nram_stg2, weight_polation_nram_stg2, - weight_attn_nram_stg2, offset_nram_stg2, grad_output_nram, bit_cond_nram, - bit_cond_reverse_nram, cond_nram_stg2, compute_buffer_nram_stg3, - delta_xy_nram_stg3, grad_wp_nram_stg3, data_offset_sram, - weight_polation_sram, grad_wp_sram, weight_attn_sram, - cond_point_polation_sram, delta_xy_sram, nram_buffer, sram_buffer, - max_cached_n, stage_1_max_deal_n, stage_2_max_deal_n, stage_3_max_deal_n, - mask_size, NRAM_AVALIABLE_SIZE, SRAM_AVALIABLE_SIZE, batch, num_keys, - num_heads, channels, num_levels, num_query, num_points); - - if (stage_1_max_deal_n <= 0 || stage_2_max_deal_n <= 0) { - return; - } - - int32_t cluster_begin_batch_head = 0; - int32_t cluster_act_batch_head = 0; - int32_t cluster_end_batch_head = 0; - int32_t core_begin_query = 0; - int32_t 
core_act_query = 0; - int32_t core_loop_num = 0; - int32_t core_step_query = 0; - splitTaskV2(cluster_begin_batch_head, cluster_act_batch_head, - cluster_end_batch_head, core_begin_query, core_act_query, - core_loop_num, core_step_query, max_cached_n, batch, num_keys, - num_heads, channels, num_levels, num_query, num_points); - - prepareLoopV2(seq_nram, zeros_nram, spatial_offset_nram, spatial_hw_nram, - mask_x_nram, mask_y_nram, spatial_offset_bd_nram, - spatial_h_bd_nram, spatial_w_bd_nram, value_sram, - data_level_start_index, spatial_shapes, num_keys, num_levels, - num_points, stage_1_max_deal_n, mask_size, channels); - - for (int32_t bh_idx = cluster_begin_batch_head; - bh_idx < cluster_end_batch_head; bh_idx++) { - int32_t b = bh_idx / num_heads; - int32_t head_idx = bh_idx % num_heads; - size_t output_base_offset = - (size_t)b * output_stride_3 + head_idx * channels; - size_t attn_weight_base_offset = - (size_t)b * input_stride_4 + head_idx * input_stride_2; - size_t data_value_base_offset = - (size_t)b * data_value_stride_3 + head_idx * channels; - - for (int32_t i = 0; __is_ipu() && i < core_loop_num; i++) { - int32_t deal_n = - std::min(core_act_query - core_step_query * i, core_step_query); - int32_t core_query_offset = i * core_step_query; - size_t attn_weight_offset = - attn_weight_base_offset + - (core_begin_query + core_query_offset) * input_stride_3; - size_t loc_offset = attn_weight_offset * 2; - size_t output_offset = - output_base_offset + - (core_begin_query + core_query_offset) * output_stride_2; - - // compute offset/cond/wp - stageOneLoop((T*)data_sampling_loc + loc_offset, - (T*)data_attn_weight + attn_weight_offset, data_offset_nram, - delta_xy_nram, weight_polation_nram, - cond_point_polation_nram, cond_point_valid_nram, loc_nram, - buf_nram, buf_nram_end, mask_x_nram, mask_y_nram, - spatial_offset_bd_nram, spatial_w_bd_nram, spatial_h_bd_nram, - spatial_offset_nram, spatial_hw_nram, data_offset_sram, - delta_xy_sram, weight_polation_sram, weight_attn_sram, - cond_point_polation_sram, true, true, deal_n, - stage_1_max_deal_n, num_heads, channels, num_levels, - num_points, input_stride_2, input_stride_3); - - // compute grad_value/grad_attn_w - backwardStageTwoLoop( - seq_nram, compute_buffer_nram_stg2, zeros_nram, - weight_polation_nram_stg2, weight_attn_nram_stg2, offset_nram_stg2, - cond_nram_stg2, bit_cond_nram, bit_cond_reverse_nram, - grad_output_nram, delta_xy_nram, data_offset_sram, - weight_polation_sram, grad_wp_sram, weight_attn_sram, - cond_point_polation_sram, delta_xy_sram, - (T*)data_value + data_value_base_offset, - (T*)grad_output + output_offset, - (T*)grad_value + data_value_base_offset, - (T*)grad_attn_weight + attn_weight_offset, wram_buffer, deal_n, - stage_2_max_deal_n, input_stride_2, input_stride_3, output_stride_2, - num_heads, channels, num_levels, num_points); - - // compute grad_loc - backwardStageThreeLoop( - compute_buffer_nram_stg3, delta_xy_nram_stg3, grad_wp_nram_stg3, - spatial_h_bd_nram, spatial_w_bd_nram, delta_xy_sram, grad_wp_sram, - (T*)grad_sampling_loc + loc_offset, deal_n, stage_3_max_deal_n, - input_stride_2, input_stride_3, output_stride_2, num_heads, channels, - num_levels, num_points); - } - } -#endif -} - -mluOpStatus_t MLUOP_WIN_API KernelMsDeformAttnBackwardFast( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, - const float* data_value, const int32_t* spatial_shapes, - const int32_t* data_level_start_index, const float* data_sampling_loc, - const float* data_attn_weight, const float* grad_output,
- const int32_t batch, const int32_t spatial_size, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_query, - const int32_t num_points, float* grad_value, float* grad_sampling_loc, - float* grad_attn_weight) { - KERNEL_CHECK( - MLUUnion1KernelMsDeformAttnBackwardFastKernel<<>>( - data_value, spatial_shapes, data_level_start_index, data_sampling_loc, - data_attn_weight, grad_output, batch, spatial_size, num_heads, - channels, num_levels, num_query, num_points, grad_value, - grad_sampling_loc, grad_attn_weight)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu b/kernels/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu deleted file mode 100644 index 1237cfc53..000000000 --- a/kernels/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu +++ /dev/null @@ -1,1012 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "ms_deform_attn_backward.h" - -#include "core/logging.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -#define ALIGN_NUM 32 - -void __mlu_func__ computeGridMaskAndOffset( - float *nram_grad_output_tl, float *nram_grad_output_tr, float *nram_loc_w, - float *nram_loc_h, float *nram_h_stride, int32_t *nram_spatial_shapes, - float *nram_w_low_temp, float *nram_h_high_temp, float *nram_w_low, - float *nram_h_low, float *nram_h_high, float *nram_w_high, float *nram_lh, - float *nram_lw, float *nram_hh, float *nram_hw, - float *nram_h_low_ptr_offset, float *nram_h_high_ptr_offset, - float *nram_w_low_ptr_offset, float *nram_w_high_ptr_offset, float *nram_w1, - float *nram_w2, float *nram_w3, float *nram_w4, float *nram_offset_temp, - float *nram_offset1, float *nram_offset2, float *nram_offset3, - float *nram_offset4, float *nram_base_ptr, float *nram_h_low_temp, - const int32_t &num_deal_grid, const int32_t &num_per_time_real, - const int32_t &num_heads, const int32_t &num_levels, - const int32_t &num_points, const int32_t &w_stride, - const int32_t &qid_stride, float *grad_temp1) { - // [num_levels, 2] --> [2, num_levels] -#if __BANG_ARCH__ >= 372 - __bang_transpose(nram_grad_output_tl, nram_loc_w, num_deal_grid, - 2); // 2 * xhlp - __bang_transpose(nram_loc_w, nram_grad_output_tl, - num_per_time_real * num_heads * num_levels, num_points); - __bang_transpose(nram_loc_h, nram_grad_output_tl + num_deal_grid, - num_per_time_real * num_heads * num_levels, num_points); - - __bang_transpose((int32_t *)nram_grad_output_tr, - (int32_t *)nram_spatial_shapes, num_levels, 2); - __bang_mul_scalar((int32_t *)nram_h_stride, - (int32_t *)((int32_t *)nram_grad_output_tr + num_levels), - w_stride, num_levels); - - __memcpy_async((int32_t *)nram_spatial_shapes, (int32_t *)nram_grad_output_tr, - num_levels * 2 * sizeof(int32_t), NRAM2NRAM); - __bang_int322float((float *)nram_spatial_shapes, - (int32_t *)nram_spatial_shapes, num_levels * 2, 0); - __bang_cycle_mul((float *)nram_loc_w, (float *)nram_loc_w, - (float *)(nram_spatial_shapes + num_levels), num_deal_grid, - num_levels); - __bang_cycle_mul((float *)nram_loc_h, (float *)nram_loc_h, - (float *)(nram_spatial_shapes), num_deal_grid, num_levels); - __bang_sub_scalar((float *)nram_loc_w, (float *)nram_loc_w, 0.5, - num_deal_grid); - __bang_sub_scalar((float *)nram_loc_h, (float *)nram_loc_h, 0.5, - num_deal_grid); - - // get mask. 
(h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) - __bang_cycle_lt((float *)nram_w_low_temp, (float *)nram_loc_w, - (float *)(nram_spatial_shapes + num_levels), num_deal_grid, - num_levels); - __bang_cycle_lt((float *)nram_h_high_temp, (float *)nram_loc_h, - (float *)(nram_spatial_shapes), num_deal_grid, num_levels); - - __bang_and((float *)nram_w_low_temp, (float *)nram_w_low_temp, - (float *)nram_h_high_temp, num_deal_grid); - __bang_gt_scalar((float *)nram_h_high_temp, (float *)nram_loc_h, -1, - num_deal_grid); - __bang_and((float *)nram_h_high_temp, (float *)nram_h_high_temp, - (float *)nram_w_low_temp, num_deal_grid); - __bang_gt_scalar((float *)nram_w_low_temp, (float *)nram_loc_w, -1, - num_deal_grid); - __bang_and((float *)nram_h_high_temp, (float *)nram_h_high_temp, - (float *)nram_w_low_temp, num_deal_grid); - - __bang_transpose((float *)nram_w_low_temp, (float *)nram_h_high_temp, - num_points, num_per_time_real * num_heads * num_levels); - __memcpy_async((float *)nram_h_high_temp, (float *)nram_w_low_temp, - num_deal_grid * sizeof(float), NRAM2NRAM); - - __bang_transpose((float *)nram_grad_output_tl, (float *)nram_loc_w, - num_points, num_per_time_real * num_heads * num_levels); - __memcpy_async((float *)nram_loc_w, (float *)nram_grad_output_tl, - num_deal_grid * sizeof(float), NRAM2NRAM); - __bang_transpose((float *)nram_grad_output_tl, (float *)nram_loc_h, - num_points, num_per_time_real * num_heads * num_levels); - __memcpy_async((float *)nram_loc_h, (float *)nram_grad_output_tl, - num_deal_grid * sizeof(float), NRAM2NRAM); - - __bang_floor(nram_w_low, nram_loc_w, num_deal_grid); - __bang_floor(nram_h_low, nram_loc_h, num_deal_grid); - __bang_sub((float *)nram_lh, (float *)nram_loc_h, (float *)nram_h_low, - num_deal_grid); - __bang_sub((float *)nram_lw, (float *)nram_loc_w, (float *)nram_w_low, - num_deal_grid); - __bang_fusion(FUSION_FMA, nram_hh, nram_lh, (float)(-1), 1, num_deal_grid); - __bang_fusion(FUSION_FMA, nram_hw, nram_lw, (float)(-1), 1, num_deal_grid); - __bang_float2int32((int32_t *)nram_w_low, nram_w_low, num_deal_grid, 0); - __bang_float2int32((int32_t *)nram_h_low, nram_h_low, num_deal_grid, 0); - - __bang_add_scalar((int32_t *)nram_h_high, (int32_t *)nram_h_low, 1, - num_deal_grid); - __bang_add_scalar((int32_t *)nram_w_high, (int32_t *)nram_w_low, 1, - num_deal_grid); - - __bang_transpose((int32_t *)nram_h_low_ptr_offset, (int32_t *)nram_h_low, - num_per_time_real * num_heads * num_levels, num_points); - __bang_cycle_mul((int32_t *)nram_h_low_ptr_offset, - (int32_t *)nram_h_low_ptr_offset, (int32_t *)nram_h_stride, - num_deal_grid, num_levels); - __bang_cycle_add((int32_t *)nram_h_high_ptr_offset, - (int32_t *)nram_h_low_ptr_offset, (int32_t *)nram_h_stride, - num_deal_grid, num_levels); - - __bang_transpose((int32_t *)nram_w_low_ptr_offset, - (int32_t *)nram_h_low_ptr_offset, num_points, - num_per_time_real * num_heads * num_levels); - - __memcpy_async((int32_t *)nram_h_low_ptr_offset, - (int32_t *)nram_w_low_ptr_offset, - num_deal_grid * sizeof(int32_t), NRAM2NRAM); - __bang_transpose((int32_t *)nram_w_low_ptr_offset, - (int32_t *)nram_h_high_ptr_offset, num_points, - num_per_time_real * num_heads * num_levels); - __memcpy_async((int32_t *)nram_h_high_ptr_offset, - (int32_t *)nram_w_low_ptr_offset, - num_deal_grid * sizeof(int32_t), NRAM2NRAM); - __bang_mul_scalar((int32_t *)nram_w_low_ptr_offset, (int32_t *)nram_w_low, - qid_stride, num_deal_grid); - __bang_add_scalar((int32_t *)nram_w_high_ptr_offset, - (int32_t *)nram_w_low_ptr_offset, 
qid_stride, - num_deal_grid); - - __bang_add((int32_t *)nram_offset1, (int32_t *)nram_h_low_ptr_offset, - (int32_t *)nram_w_low_ptr_offset, num_deal_grid); - - __bang_transpose((int32_t *)nram_offset_temp, (int32_t *)nram_offset1, - num_per_time_real * num_heads, num_levels * num_points); - __bang_cycle_add((int32_t *)nram_offset_temp, (int32_t *)nram_offset_temp, - (int32_t *)nram_base_ptr, num_deal_grid, num_heads); - - __bang_transpose((int32_t *)nram_offset1, (int32_t *)nram_offset_temp, - num_levels * num_points, num_per_time_real * num_heads); - - __bang_add((int32_t *)nram_offset2, (int32_t *)nram_h_low_ptr_offset, - (int32_t *)nram_w_high_ptr_offset, num_deal_grid); - __bang_transpose((int32_t *)nram_offset_temp, (int32_t *)nram_offset2, - num_per_time_real * num_heads, num_levels * num_points); - __bang_cycle_add((int32_t *)nram_offset_temp, (int32_t *)nram_offset_temp, - (int32_t *)nram_base_ptr, num_deal_grid, num_heads); - __bang_transpose((int32_t *)nram_offset2, (int32_t *)nram_offset_temp, - num_levels * num_points, num_per_time_real * num_heads); - - __bang_add((int32_t *)nram_offset3, (int32_t *)nram_h_high_ptr_offset, - (int32_t *)nram_w_low_ptr_offset, num_deal_grid); - __bang_transpose((int32_t *)nram_offset_temp, (int32_t *)nram_offset3, - num_per_time_real * num_heads, num_levels * num_points); - __bang_cycle_add((int32_t *)nram_offset_temp, (int32_t *)nram_offset_temp, - (int32_t *)nram_base_ptr, num_deal_grid, num_heads); - __bang_transpose((int32_t *)nram_offset3, (int32_t *)nram_offset_temp, - num_levels * num_points, num_per_time_real * num_heads); - __bang_add((int32_t *)nram_offset4, (int32_t *)nram_h_high_ptr_offset, - (int32_t *)nram_w_high_ptr_offset, num_deal_grid); - __bang_transpose((int32_t *)nram_offset_temp, (int32_t *)nram_offset4, - num_per_time_real * num_heads, num_levels * num_points); - __bang_cycle_add((int32_t *)nram_offset_temp, (int32_t *)nram_offset_temp, - (int32_t *)nram_base_ptr, num_deal_grid, num_heads); - __bang_transpose((int32_t *)nram_offset4, (int32_t *)nram_offset_temp, - num_levels * num_points, num_per_time_real * num_heads); - - // h_low >= 0 && w_low >= 0 mask2 - float *mask1 = nram_h_low_ptr_offset; - float *mask2 = nram_h_high_ptr_offset; - float *mask3 = nram_w_low_ptr_offset; - float *mask4 = nram_w_high_ptr_offset; - - __bang_int322float(nram_w_low, (int32_t *)nram_w_low, num_deal_grid, 0); - __bang_int322float(nram_h_low, (int32_t *)nram_h_low, num_deal_grid, 0); - - __bang_ge_scalar(mask1, nram_h_low, 0, num_deal_grid); - __bang_ge_scalar(mask2, nram_w_low, 0, num_deal_grid); - __bang_and(mask2, mask1, mask2, num_deal_grid); - - __bang_and(mask2, nram_h_high_temp, mask2, num_deal_grid); - - // h_low >= 0 && w_high <= width - 1 mask1 - __bang_int322float(nram_w_high, (int32_t *)nram_w_high, num_deal_grid, 0); - - __bang_transpose(mask3, nram_w_high, - num_per_time_real * num_heads * num_levels, num_points); - - __bang_sub_scalar((float *)nram_spatial_shapes, (float *)nram_spatial_shapes, - 1, num_levels * 2); - - __bang_cycle_le((float *)mask3, (float *)mask3, - (float *)(nram_spatial_shapes + num_levels), num_deal_grid, - num_levels); - __bang_transpose(mask4, mask3, num_points, - num_per_time_real * num_heads * num_levels); - __bang_and(mask1, mask1, mask4, num_deal_grid); - __bang_and(mask1, nram_h_high_temp, mask1, num_deal_grid); - - // h_high <= height - 1 && w_high <= width - 1 mask3 - __bang_int322float(nram_h_high, (int32_t *)nram_h_high, num_deal_grid, 0); - __bang_transpose(mask3, nram_h_high, - 
num_per_time_real * num_heads * num_levels, num_points); - - __bang_cycle_le((float *)mask3, (float *)mask3, - (float *)(nram_spatial_shapes), num_deal_grid, num_levels); - - __bang_transpose(nram_h_low_temp, mask3, num_points, - num_per_time_real * num_heads * num_levels); - __bang_and(mask4, mask4, nram_h_low_temp, num_deal_grid); - __bang_and(mask3, mask4, nram_h_high_temp, num_deal_grid); - - // h_high <= height - 1 && w_low >= 0 mask4 - __bang_ge_scalar(nram_w_low_temp, nram_w_low, 0, num_deal_grid); - __bang_and(mask4, nram_h_low_temp, nram_w_low_temp, num_deal_grid); - __bang_and(mask4, mask4, nram_h_high_temp, num_deal_grid); - __bang_int322float(nram_offset1, (int32_t *)nram_offset1, num_deal_grid, 0); - __bang_gt_scalar(grad_temp1, nram_offset1, 0, num_deal_grid); - - __bang_mul(nram_offset1, nram_offset1, grad_temp1, num_deal_grid); - __bang_mul(nram_offset1, nram_offset1, mask2, num_deal_grid); - __bang_float2int32((int32_t *)nram_offset1, nram_offset1, num_deal_grid, 0); - - __bang_int322float((float *)nram_offset2, (int32_t *)nram_offset2, - num_deal_grid, 0); - __bang_gt_scalar((float *)grad_temp1, (float *)nram_offset2, 0, - num_deal_grid); - - __bang_mul(nram_offset2, nram_offset2, grad_temp1, num_deal_grid); - __bang_mul(nram_offset2, nram_offset2, mask1, num_deal_grid); - __bang_float2int32((int32_t *)nram_offset2, nram_offset2, num_deal_grid, 0); - - __bang_int322float((float *)nram_offset3, (int32_t *)nram_offset3, - num_deal_grid, 0); - __bang_gt_scalar((float *)grad_temp1, (float *)nram_offset3, 0, - num_deal_grid); - - __bang_mul(nram_offset3, nram_offset3, grad_temp1, num_deal_grid); - __bang_mul(nram_offset3, nram_offset3, mask4, num_deal_grid); - __bang_float2int32((int32_t *)nram_offset3, nram_offset3, num_deal_grid, 0); - - __bang_int322float((float *)nram_offset4, (int32_t *)nram_offset4, - num_deal_grid, 0); - __bang_gt_scalar((float *)grad_temp1, (float *)nram_offset4, 0, - num_deal_grid); - - __bang_mul(nram_offset4, nram_offset4, grad_temp1, num_deal_grid); - __bang_mul(nram_offset4, nram_offset4, mask3, num_deal_grid); - __bang_float2int32((int32_t *)nram_offset4, nram_offset4, num_deal_grid, 0); - __sync_io_move_compute(); - - __bang_mul(nram_w1, nram_hh, nram_hw, num_deal_grid); - __bang_mul(nram_w2, nram_hh, nram_lw, num_deal_grid); - __bang_mul(nram_w3, nram_lh, nram_hw, num_deal_grid); - __bang_mul(nram_w4, nram_lh, nram_lw, num_deal_grid); -#endif -} - -void __mlu_func__ loadValue( - float *nram_grad_output_tl, float *nram_grad_output_tr, - float *nram_grad_output_bl, float *nram_grad_output_br, - const float *data_value, float *grad_temp1, float *grad_temp3, float *mask1, - float *mask2, float *mask3, float *mask4, float *nram_offset1, - float *nram_offset2, float *nram_offset3, float *nram_offset4, - float *nram_grad_weight, int32_t *nram_level_start_index, - const int32_t &offset_nram, const int32_t &num_heads, - const int32_t &deal_num_real, const int32_t &num_deal_grid, - const int32_t &num_query, const int32_t &num_levels, - const int32_t &num_points, const int32_t &grid_offset, - const int32_t &spatial_size, const int32_t &qid_stride) { -#if __BANG_ARCH__ >= 372 - int32_t value_offset_temp = 0; - -#if __BANG_ARCH__ >= 592 - for (int i = 0; i < num_deal_grid; ++i) { - int32_t b_col = - (grid_offset + i) / num_query / num_heads / num_levels / num_points; - int32_t l_col = (grid_offset + i) / num_points % num_levels; - int32_t level_start_id = nram_level_start_index[l_col]; - value_offset_temp = - b_col * spatial_size * qid_stride + 
level_start_id * qid_stride; - ((int32_t *)grad_temp1)[i] = value_offset_temp; - } - - __bang_add((int32_t *)grad_temp3, (int32_t *)grad_temp1, - (int32_t *)nram_offset1, num_deal_grid); - __bang_add((int32_t *)(grad_temp3 + num_deal_grid), (int32_t *)grad_temp1, - (int32_t *)nram_offset2, num_deal_grid); - __bang_add((int32_t *)(grad_temp3 + 2 * num_deal_grid), (int32_t *)grad_temp1, - (int32_t *)nram_offset3, num_deal_grid); - __bang_add((int32_t *)(grad_temp3 + 3 * num_deal_grid), (int32_t *)grad_temp1, - (int32_t *)nram_offset4, num_deal_grid); - __bang_mul_scalar((int32_t *)grad_temp3, (int32_t *)grad_temp3, - sizeof(int32_t), 4 * num_deal_grid); - __sync_io_move_compute(); - - __gather_async((void *)nram_grad_output_tl, (void *)data_value, - (unsigned int *)grad_temp3, deal_num_real * sizeof(float), - GDRAM2NRAM, deal_num_real * sizeof(float), num_deal_grid); - - __gather_async((void *)nram_grad_output_tr, (void *)data_value, - (unsigned int *)(grad_temp3 + num_deal_grid), - deal_num_real * sizeof(float), GDRAM2NRAM, - deal_num_real * sizeof(float), num_deal_grid); - - __gather_async((void *)nram_grad_output_bl, (void *)data_value, - (unsigned int *)(grad_temp3 + 2 * num_deal_grid), - deal_num_real * sizeof(float), GDRAM2NRAM, - deal_num_real * sizeof(float), num_deal_grid); - - __gather_async((void *)nram_grad_output_br, (void *)data_value, - (unsigned int *)(grad_temp3 + 3 * num_deal_grid), - deal_num_real * sizeof(float), GDRAM2NRAM, - deal_num_real * sizeof(float), num_deal_grid); - __sync_io_move_compute(); - -#else - int32_t b_col = - (grid_offset) / num_query / num_heads / num_levels / num_points; - int32_t l_col = (grid_offset) / num_points % num_levels; - int32_t level_start_id = nram_level_start_index[l_col]; - value_offset_temp = - b_col * spatial_size * qid_stride + level_start_id * qid_stride; - for (int32_t loop = 0; loop < num_deal_grid; ++loop) { - __memcpy_async((void *)(nram_grad_output_tl + loop * deal_num_real), - (void *)(data_value + value_offset_temp + - (((int32_t *)nram_offset1)[loop])), - deal_num_real * sizeof(float), GDRAM2NRAM, - offset_nram * sizeof(float), - ((((int32_t *)nram_offset2)[loop]) - - (((int32_t *)nram_offset1)[loop])) * - sizeof(float), - mask1[loop]); - b_col = (grid_offset + loop + 1) / num_query / num_heads / num_levels / - num_points; - l_col = (grid_offset + loop + 1) / num_points % num_levels; - level_start_id = nram_level_start_index[l_col]; - - __memcpy_async((void *)(nram_grad_output_bl + loop * deal_num_real), - (void *)(data_value + value_offset_temp + - (((int32_t *)nram_offset3)[loop])), - deal_num_real * sizeof(float), GDRAM2NRAM, - offset_nram * sizeof(float), - ((((int32_t *)nram_offset4)[loop]) - - (((int32_t *)nram_offset3)[loop])) * - sizeof(float), - mask3[loop]); - value_offset_temp = - b_col * spatial_size * qid_stride + level_start_id * qid_stride; - } -#endif - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); - __bang_cycle_add(grad_temp1, grad_temp1, mask2, deal_num_real * num_deal_grid, - num_deal_grid); - __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); - __nram__ int32_t table[64] = {0, (int32_t)0xffffffff}; - __bang_float2int32((int32_t *)grad_temp3, grad_temp3, - num_deal_grid * deal_num_real, 0); - __bang_lut_s32((int32_t *)grad_temp3, (int32_t *)grad_temp3, (int32_t *)table, - num_deal_grid * deal_num_real, 64); - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); - __bang_cycle_add(grad_temp1, grad_temp1, mask1, deal_num_real * num_deal_grid, - num_deal_grid); - 
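// What follows turns each {0.0f, 1.0f} validity mask into a full 32-bit
// pattern: the per-point mask is broadcast across the channel dimension,
// converted to int32, then mapped through the two-entry LUT {0x0, 0xffffffff}
// so that __bang_band can zero out every gathered corner value whose
// bilinear neighbor fell outside its feature map.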
__sync_io_move_compute(); - - __bang_band((char *)nram_grad_output_tl, (char *)nram_grad_output_tl, - (char *)grad_temp3, - num_deal_grid * deal_num_real * sizeof(float)); - __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); - - __bang_float2int32((int32_t *)grad_temp3, grad_temp3, - num_deal_grid * deal_num_real, 0); - __bang_lut_s32((int32_t *)grad_temp3, (int32_t *)grad_temp3, (int32_t *)table, - num_deal_grid * deal_num_real, 64); - __bang_band((char *)nram_grad_output_tr, (char *)nram_grad_output_tr, - (char *)grad_temp3, - num_deal_grid * deal_num_real * sizeof(float)); - - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); - __bang_cycle_add(grad_temp1, grad_temp1, mask4, deal_num_real * num_deal_grid, - num_deal_grid); - __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); - - __bang_float2int32((int32_t *)grad_temp3, grad_temp3, - num_deal_grid * deal_num_real, 0); - __bang_lut_s32((int32_t *)grad_temp3, (int32_t *)grad_temp3, (int32_t *)table, - num_deal_grid * deal_num_real, 64); - __bang_band((char *)nram_grad_output_bl, (char *)nram_grad_output_bl, - (char *)grad_temp3, - num_deal_grid * deal_num_real * sizeof(float)); - - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); - __bang_cycle_add(grad_temp1, grad_temp1, mask3, deal_num_real * num_deal_grid, - num_deal_grid); - __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); - - __bang_float2int32((int32_t *)grad_temp3, grad_temp3, - num_deal_grid * deal_num_real, 0); - __bang_lut_s32((int32_t *)grad_temp3, (int32_t *)grad_temp3, (int32_t *)table, - num_deal_grid * deal_num_real, 64); - __bang_band((char *)nram_grad_output_br, (char *)nram_grad_output_br, - (char *)grad_temp3, - num_deal_grid * deal_num_real * sizeof(float)); -#endif -} - -void __mlu_func__ computeGradValue( - float *grad_temp1, float *grad_temp2, float *grad_temp3, float *grad_temp4, - float *mask1, float *mask2, float *mask3, float *mask4, float *nram_offset1, - float *nram_offset2, float *nram_offset3, float *nram_offset4, - int32_t *nram_level_start_index, int32_t deal_num_real, - const float *grad_value, float *nram_w1, float *nram_w2, float *nram_w3, - float *nram_w4, const int32_t &num_per_time_real, const int32_t &num_heads, - const int32_t &num_levels, const int32_t &num_points, - const int32_t &num_query, const int32_t &num_deal_grid, - const int32_t &grid_offset, const int32_t &spatial_size, - const int32_t &qid_stride, float *nram_grid_offset1, - float *nram_grid_offset2, const int32_t &batch, float *nram_grad_output_tl, - float *nram_grad_output_tr, float *nram_grad_output_bl, - float *nram_grad_output_br, float *nram_grad_weight) { -#if __BANG_ARCH__ >= 372 - __bang_write_zero(grad_temp1, deal_num_real * num_deal_grid); - __bang_cycle_add(grad_temp1, grad_temp1, nram_grad_weight, - deal_num_real * num_deal_grid, num_deal_grid); - __bang_transpose(grad_temp3, grad_temp1, - deal_num_real * num_per_time_real * num_heads, - num_levels * num_points); - __bang_transpose(grad_temp1, grad_temp2, num_per_time_real * num_heads, - deal_num_real); - __bang_cycle_mul(grad_temp3, grad_temp3, grad_temp1, - num_deal_grid * deal_num_real, - deal_num_real * num_per_time_real * num_heads); - __bang_transpose(grad_temp4, grad_temp3, num_levels * num_points, - deal_num_real * num_per_time_real * num_heads); - - int32_t temp_res = num_query * num_heads * num_levels * num_points; - for (int32_t loop = 0; loop < num_deal_grid; ++loop) { - ((int32_t *)nram_grid_offset1)[loop] = ((loop + 
grid_offset) / temp_res);
- }
- __bang_mul_scalar((int32_t *)nram_grid_offset1, (int32_t *)nram_grid_offset1,
- spatial_size * qid_stride, num_deal_grid);
- __bang_transpose((int32_t *)nram_grid_offset2, (int32_t *)nram_grid_offset1,
- num_per_time_real * num_heads * num_levels, num_points);
-
- __bang_mul_scalar((int32_t *)nram_grid_offset1,
- (int32_t *)nram_level_start_index, qid_stride, num_levels);
- __bang_cycle_add((int32_t *)nram_grid_offset2, (int32_t *)nram_grid_offset2,
- (int32_t *)nram_grid_offset1, num_deal_grid, num_levels);
- __bang_transpose((int32_t *)nram_grid_offset1, (int32_t *)nram_grid_offset2,
- num_points, num_per_time_real * num_heads * num_levels);
-
- __bang_add((int32_t *)nram_offset1, (int32_t *)nram_offset1,
- (int32_t *)nram_grid_offset1, num_deal_grid);
- __bang_add((int32_t *)nram_offset2, (int32_t *)nram_offset2,
- (int32_t *)nram_grid_offset1, num_deal_grid);
- __bang_add((int32_t *)nram_offset3, (int32_t *)nram_offset3,
- (int32_t *)nram_grid_offset1, num_deal_grid);
- __bang_add((int32_t *)nram_offset4, (int32_t *)nram_offset4,
- (int32_t *)nram_grid_offset1, num_deal_grid);
-
-#if __BANG_ARCH__ >= 592
- // make sure the offset is not greater than (batch * spatial_size *
- // num_heads * channels)
- __bang_lt_scalar((int32_t *)grad_temp1, (int32_t *)nram_offset1,
- batch * spatial_size * num_heads * deal_num_real,
- num_deal_grid);
- __bang_mul((int32_t *)nram_offset1, (int32_t *)nram_offset1,
- (int32_t *)grad_temp1, num_deal_grid);
- __bang_lt_scalar((int32_t *)grad_temp1, (int32_t *)nram_offset2,
- batch * spatial_size * num_heads * deal_num_real,
- num_deal_grid);
- __bang_mul((int32_t *)nram_offset2, (int32_t *)nram_offset2,
- (int32_t *)grad_temp1, num_deal_grid);
- __bang_lt_scalar((int32_t *)grad_temp1, (int32_t *)nram_offset3,
- batch * spatial_size * num_heads * deal_num_real,
- num_deal_grid);
- __bang_mul((int32_t *)nram_offset3, (int32_t *)nram_offset3,
- (int32_t *)grad_temp1, num_deal_grid);
- __bang_lt_scalar((int32_t *)grad_temp1, (int32_t *)nram_offset4,
- batch * spatial_size * num_heads * deal_num_real,
- num_deal_grid);
- __bang_mul((int32_t *)nram_offset4, (int32_t *)nram_offset4,
- (int32_t *)grad_temp1, num_deal_grid);
- __bang_mul(grad_temp3, nram_w1, mask2, num_deal_grid);
- __bang_cycle_mul(grad_temp1, grad_temp4, grad_temp3,
- num_deal_grid * deal_num_real, num_deal_grid);
- __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid);
-
- for (int32_t loop = 0; loop < num_deal_grid; ++loop) {
- __bang_atomic_reduce_add(
- (float *)(grad_value + ((int32_t *)nram_offset1)[loop]),
- (float *)(grad_temp3 + loop * deal_num_real), deal_num_real);
- }
-
- __bang_mul(grad_temp3, nram_w2, mask1, num_deal_grid);
- __bang_cycle_mul(grad_temp1, grad_temp4, grad_temp3,
- num_deal_grid * deal_num_real, num_deal_grid);
- __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid);
-
- for (int32_t loop = 0; loop < num_deal_grid; ++loop) {
- __bang_atomic_reduce_add(
- (float *)(grad_value + ((int32_t *)nram_offset2)[loop]),
- (float *)(grad_temp3 + loop * deal_num_real), deal_num_real);
- }
-
- __bang_mul(grad_temp3, nram_w3, mask4, num_deal_grid);
- __bang_cycle_mul(grad_temp1, grad_temp4, grad_temp3,
- num_deal_grid * deal_num_real, num_deal_grid);
-
- __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid);
-
- for (int32_t loop = 0; loop < num_deal_grid; ++loop) {
- __bang_atomic_reduce_add(
- (float *)(grad_value + ((int32_t *)nram_offset3)[loop]),
- (float *)(grad_temp3 + loop * deal_num_real),
deal_num_real); - } - __bang_mul(grad_temp3, nram_w4, mask3, num_deal_grid); - __bang_cycle_mul(grad_temp1, grad_temp4, grad_temp3, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_transpose(grad_temp3, grad_temp1, deal_num_real, num_deal_grid); - - for (int32_t loop = 0; loop < num_deal_grid; ++loop) { - __bang_atomic_reduce_add( - (float *)(grad_value + ((int32_t *)nram_offset4)[loop]), - (float *)(grad_temp3 + loop * deal_num_real), deal_num_real); - } -#else - __bang_cycle_mul(grad_temp1, grad_temp4, nram_w1, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_transpose(nram_grad_output_br, grad_temp1, deal_num_real, - num_deal_grid); - - __bang_cycle_mul(grad_temp1, grad_temp4, nram_w2, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_transpose(nram_grad_output_tl, grad_temp1, deal_num_real, - num_deal_grid); - - __bang_cycle_mul(grad_temp1, grad_temp4, nram_w3, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_transpose(nram_grad_output_tr, grad_temp1, deal_num_real, - num_deal_grid); - - __bang_cycle_mul(grad_temp1, grad_temp4, nram_w4, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_transpose(nram_grad_output_bl, grad_temp1, deal_num_real, - num_deal_grid); - for (int32_t loop = 0; loop < num_deal_grid; ++loop) { - if (mask2[loop]) { - __bang_atomic_reduce_add( - (float *)(grad_value + int32_t(((int32_t *)nram_offset1)[loop])), - (float *)(nram_grad_output_br + loop * deal_num_real), deal_num_real); - } - if (mask1[loop]) { - __bang_atomic_reduce_add( - (float *)(grad_value + ((int32_t *)nram_offset2)[loop]), - (float *)(nram_grad_output_tl + loop * deal_num_real), deal_num_real); - } - if (mask4[loop]) { - __bang_atomic_reduce_add( - (float *)(grad_value + ((int32_t *)nram_offset3)[loop]), - (float *)(nram_grad_output_tr + loop * deal_num_real), deal_num_real); - } - - if (mask3[loop]) { - __bang_atomic_reduce_add( - (float *)(grad_value + ((int32_t *)nram_offset4)[loop]), - (float *)(nram_grad_output_bl + loop * deal_num_real), deal_num_real); - } - } -#endif -#endif -} - -void __mlu_func__ computeGradAttnWeight( - float *grad_w_weight, float *grad_weight, float *nram_grad_output_tl, - float *nram_grad_output_tr, float *nram_grad_output_bl, - float *nram_grad_output_br, float *grad_temp2, - const float *grad_attn_weight, float *nram_hw, float *nram_hh, - float *nram_lw, float *nram_lh, float *grad_h_weight, float *nram_w1, - float *nram_w2, float *nram_w3, float *nram_w4, const int32_t &offset_nram, - const int32_t &num_deal_grid, const int32_t &deal_num_real, - const int32_t &num_per_time_real, const int32_t &num_heads, - const int32_t &num_levels, const int32_t &num_points, - const int32_t &grid_offset, float *nram_h_high_temp) { - __bang_write_zero(grad_w_weight, 2 * offset_nram); - // grad_output_nram_tl -#if __BANG_ARCH__ >= 372 - __bang_transpose(grad_weight, nram_grad_output_tl, num_deal_grid, - deal_num_real); - __bang_cycle_mul(nram_grad_output_tl, grad_weight, nram_hw, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_sub(grad_h_weight, grad_h_weight, nram_grad_output_tl, - num_deal_grid * deal_num_real); - __bang_cycle_mul(nram_grad_output_tl, grad_weight, nram_hh, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_sub(grad_w_weight, grad_w_weight, nram_grad_output_tl, - num_deal_grid * deal_num_real); - __bang_cycle_mul(nram_grad_output_tl, grad_weight, nram_w1, - num_deal_grid * deal_num_real, num_deal_grid); - // nram_grad_output_tr - __bang_transpose(grad_weight, nram_grad_output_tr, num_deal_grid, - 
deal_num_real);
- __bang_cycle_mul(nram_grad_output_tr, grad_weight, nram_lw,
- num_deal_grid * deal_num_real, num_deal_grid);
- __bang_sub(grad_h_weight, grad_h_weight, nram_grad_output_tr,
- num_deal_grid * deal_num_real);
- __bang_cycle_mul(nram_grad_output_tr, grad_weight, nram_hh,
- num_deal_grid * deal_num_real, num_deal_grid);
- __bang_add(grad_w_weight, grad_w_weight, nram_grad_output_tr,
- num_deal_grid * deal_num_real);
- __bang_cycle_mul(nram_grad_output_tr, grad_weight, nram_w2,
- num_deal_grid * deal_num_real, num_deal_grid);
- __bang_add(nram_grad_output_tl, nram_grad_output_tl, nram_grad_output_tr,
- num_deal_grid * deal_num_real);
-
- // nram_grad_output_bl
- __bang_transpose(grad_weight, nram_grad_output_bl, num_deal_grid,
- deal_num_real);
- __bang_cycle_mul(nram_grad_output_bl, grad_weight, nram_hw,
- num_deal_grid * deal_num_real, num_deal_grid);
- __bang_add(grad_h_weight, grad_h_weight, nram_grad_output_bl,
- num_deal_grid * deal_num_real);
- __bang_cycle_mul(nram_grad_output_bl, grad_weight, nram_lh,
- num_deal_grid * deal_num_real, num_deal_grid);
- __bang_sub(grad_w_weight, grad_w_weight, nram_grad_output_bl,
- num_deal_grid * deal_num_real);
- __bang_cycle_mul(nram_grad_output_bl, grad_weight, nram_w3,
- num_deal_grid * deal_num_real, num_deal_grid);
-
- __bang_add(nram_grad_output_tl, nram_grad_output_tl, nram_grad_output_bl,
- num_deal_grid * deal_num_real);
-
- // nram_grad_output_br
- __bang_transpose(grad_weight, nram_grad_output_br, num_deal_grid,
- deal_num_real);
- __bang_cycle_mul(nram_grad_output_br, grad_weight, nram_lw,
- num_deal_grid * deal_num_real, num_deal_grid);
- __bang_add(grad_h_weight, grad_h_weight, nram_grad_output_br,
- num_deal_grid * deal_num_real);
- __bang_cycle_mul(nram_grad_output_br, grad_weight, nram_lh,
- num_deal_grid * deal_num_real, num_deal_grid);
- __bang_add(grad_w_weight, grad_w_weight, nram_grad_output_br,
- num_deal_grid * deal_num_real);
- __bang_cycle_mul(nram_grad_output_br, grad_weight, nram_w4,
- num_deal_grid * deal_num_real, num_deal_grid);
- __bang_add(nram_grad_output_tl, nram_grad_output_tl, nram_grad_output_br,
- num_deal_grid * deal_num_real);
-
- __bang_transpose(nram_grad_output_br, nram_grad_output_tl, deal_num_real,
- num_deal_grid);
- __bang_transpose(nram_grad_output_tr, nram_grad_output_br,
- num_per_time_real * num_heads,
- num_points * num_levels * deal_num_real);
- __bang_transpose(grad_weight, grad_temp2, num_per_time_real * num_heads,
- deal_num_real);
- __bang_cycle_mul(nram_grad_output_tr, nram_grad_output_tr, grad_weight,
- num_deal_grid * deal_num_real,
- num_per_time_real * num_heads * deal_num_real);
- __bang_transpose(nram_grad_output_br, nram_grad_output_tr,
- num_points * num_levels * deal_num_real,
- num_per_time_real * num_heads);
- __bang_transpose((float *)nram_grad_output_tr, (float *)nram_grad_output_br,
- num_deal_grid, deal_num_real);
-
- __mluop_recursive_sum_pool(nram_grad_output_tr, num_deal_grid, deal_num_real,
- ALIGN_NUM);
-
- __bang_float2int32((int32_t *)nram_h_high_temp, nram_h_high_temp,
- num_deal_grid, 0);
- __nram__ int32_t table[64] = {0, (int32_t)0xffffffff};
- __bang_lut_s32((int32_t *)nram_h_high_temp, (int32_t *)nram_h_high_temp,
- (int32_t *)table, num_deal_grid, 64);
- __bang_band((char *)nram_grad_output_tr, (char *)nram_grad_output_tr,
- (char *)nram_h_high_temp, num_deal_grid * sizeof(float));
- __bang_atomic_reduce_add((float *)grad_attn_weight + grid_offset,
- (float *)nram_grad_output_tr, num_deal_grid);
-#endif
-}
-
-void __mlu_func__
computeGradSampingLoc( - const float *grad_sampling_loc, float *nram_grad_output_tl, - float *nram_grad_output_tr, float *grad_h_weight, float *grad_w_weight, - int32_t *nram_spatial_shapes, float *grad_temp1, float *grad_temp2, - float *nram_grad_weight, const int32_t &num_deal_grid, - const int32_t &deal_num_real, const int32_t &num_per_time_real, - const int32_t &num_heads, const int32_t &num_levels, - const int32_t &num_points, const int32_t &grid_offset, - float *nram_h_high_temp) { -#if __BANG_ARCH__ >= 372 - __bang_add_scalar((float *)nram_spatial_shapes, (float *)nram_spatial_shapes, - 1.0, 2 * num_levels); - __bang_transpose(nram_grad_output_tl, grad_h_weight, - num_per_time_real * num_heads * num_levels * deal_num_real, - num_points); // pcxhl - __bang_cycle_mul(nram_grad_output_tl, nram_grad_output_tl, - (float *)nram_spatial_shapes, num_deal_grid * deal_num_real, - num_levels); - __bang_transpose(grad_h_weight, nram_grad_output_tl, - num_points * deal_num_real, - num_per_time_real * num_heads * num_levels); - - __bang_write_zero(grad_temp1, num_deal_grid * deal_num_real); - __bang_cycle_add(grad_temp1, grad_temp1, nram_grad_weight, - num_deal_grid * deal_num_real, num_deal_grid); - __bang_transpose(nram_grad_output_tr, grad_temp1, - deal_num_real * num_per_time_real * num_heads, - num_levels * num_points); - __bang_transpose(grad_temp1, grad_temp2, num_per_time_real * num_heads, - deal_num_real); - __bang_cycle_mul(nram_grad_output_tr, nram_grad_output_tr, grad_temp1, - num_deal_grid * deal_num_real, - deal_num_real * num_per_time_real * num_heads); - __bang_transpose(grad_temp1, nram_grad_output_tr, - num_levels * num_points * deal_num_real, - num_per_time_real * num_heads); - - __bang_mul(grad_h_weight, grad_h_weight, grad_temp1, - num_deal_grid * deal_num_real); - __bang_transpose(nram_grad_output_tl, grad_h_weight, num_deal_grid, - deal_num_real); - __memcpy_async(grad_h_weight, nram_grad_output_tl, - num_deal_grid * deal_num_real * sizeof(float), NRAM2NRAM); - __mluop_recursive_sum_pool(grad_h_weight, num_deal_grid, deal_num_real, - ALIGN_NUM); - - __nram__ int32_t table[64] = {0, (int32_t)0xffffffff}; - __bang_lut_s32((int32_t *)nram_h_high_temp, (int32_t *)nram_h_high_temp, - (int32_t *)table, num_deal_grid, 64); - __bang_band((char *)grad_h_weight, (char *)grad_h_weight, - (char *)nram_h_high_temp, num_deal_grid * sizeof(float)); - - __bang_transpose(nram_grad_output_tl, grad_w_weight, - num_per_time_real * num_heads * num_levels * deal_num_real, - num_points); // pcxhl - __bang_cycle_mul(nram_grad_output_tl, nram_grad_output_tl, - (float *)(nram_spatial_shapes + num_levels), - num_deal_grid * deal_num_real, num_levels); - __bang_transpose(grad_w_weight, nram_grad_output_tl, - num_points * deal_num_real, - num_per_time_real * num_heads * num_levels); - - __bang_mul(grad_w_weight, grad_w_weight, grad_temp1, - num_deal_grid * deal_num_real); - __bang_transpose(nram_grad_output_tl, grad_w_weight, num_deal_grid, - deal_num_real); - __memcpy_async(grad_w_weight, nram_grad_output_tl, - num_deal_grid * deal_num_real * sizeof(float), NRAM2NRAM); - __mluop_recursive_sum_pool(grad_w_weight, num_deal_grid, deal_num_real, - ALIGN_NUM); - __bang_lut_s32((int32_t *)nram_h_high_temp, (int32_t *)nram_h_high_temp, - (int32_t *)table, num_deal_grid, 64); - __bang_band((char *)grad_w_weight, (char *)grad_w_weight, - (char *)nram_h_high_temp, num_deal_grid * sizeof(float)); - - __memcpy_async(grad_w_weight + num_deal_grid, grad_h_weight, - num_deal_grid * sizeof(float), NRAM2NRAM); - 
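- // Pack the per-sample w- and h-gradients into interleaved (w, h) pairs,
- // then atomically accumulate them into grad_sampling_loc in GDRAM.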
__bang_transpose(nram_grad_output_tl, grad_w_weight, 2, num_deal_grid); - __bang_atomic_reduce_add((float *)grad_sampling_loc + grid_offset * 2, - (float *)nram_grad_output_tl, 2 * num_deal_grid); -#endif -} - -__mlu_global__ void MLUUnion1KernelMsDeformAttnBackwardSmallChannelsKernel( - const float *data_value, const int32_t *spatial_shapes, - const int32_t *data_level_start_index, const float *data_sampling_loc, - const float *data_attn_weight, const float *grad_output, - const int32_t batch, const int32_t spatial_size, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_query, - const int32_t num_points, float *grad_value, float *grad_sampling_loc, - float *grad_attn_weight) { - const int32_t split_grid_num = 28; - const int32_t split_num_c = 8; - const int32_t C_align = PAD_UP(channels, ALIGN_NUM); - - const int32_t num_hlp = num_heads * num_levels * num_points; - int32_t num_per_time_theory = - (MAX_NRAM_SIZE - num_levels * sizeof(float) - - 3 * PAD_UP(num_levels, 32) * sizeof(int32_t)) / - sizeof(float) / (split_num_c * C_align + split_grid_num) / (num_hlp); - - int32_t deal_grid_num_theory = num_per_time_theory * num_hlp; - - const int32_t offset_nram = num_per_time_theory * C_align * num_hlp; - const int32_t offset_nram_calc = PAD_UP(deal_grid_num_theory, ALIGN_NUM); - float *nram_grad_output_tl = (float *)nram_buffer; - float *nram_grad_output_tr = (float *)nram_buffer + offset_nram; - float *nram_grad_output_bl = (float *)nram_buffer + 2 * offset_nram; - float *nram_grad_output_br = (float *)nram_buffer + 3 * offset_nram; - - float *grad_temp1 = (float *)nram_buffer + 4 * offset_nram; - float *grad_temp2 = (float *)nram_buffer + 5 * offset_nram; - float *grad_temp3 = (float *)nram_buffer + 6 * offset_nram; - float *grad_temp4 = (float *)nram_buffer + 7 * offset_nram; - - float *nram_loc_w = (float *)nram_buffer + split_num_c * offset_nram; - float *nram_loc_h = - (float *)nram_buffer + split_num_c * offset_nram + offset_nram_calc; - float *nram_h_low = - (float *)nram_buffer + split_num_c * offset_nram + 2 * offset_nram_calc; - float *nram_w_low = - (float *)nram_buffer + split_num_c * offset_nram + 3 * offset_nram_calc; - float *nram_h_high = - (float *)nram_buffer + split_num_c * offset_nram + 4 * offset_nram_calc; - float *nram_w_high = - (float *)nram_buffer + split_num_c * offset_nram + 5 * offset_nram_calc; - float *nram_h_low_temp = - (float *)nram_buffer + split_num_c * offset_nram + 6 * offset_nram_calc; - float *nram_h_high_temp = - (float *)nram_buffer + split_num_c * offset_nram + 7 * offset_nram_calc; - - float *nram_hw = - (float *)nram_buffer + split_num_c * offset_nram + 8 * offset_nram_calc; - float *nram_hh = - (float *)nram_buffer + split_num_c * offset_nram + 9 * offset_nram_calc; - float *nram_lw = - (float *)nram_buffer + split_num_c * offset_nram + 10 * offset_nram_calc; - float *nram_lh = - (float *)nram_buffer + split_num_c * offset_nram + 11 * offset_nram_calc; - - float *nram_h_low_ptr_offset = - (float *)nram_buffer + split_num_c * offset_nram + 12 * offset_nram_calc; - float *nram_h_high_ptr_offset = - (float *)nram_buffer + split_num_c * offset_nram + 13 * offset_nram_calc; - float *nram_w_low_ptr_offset = - (float *)nram_buffer + split_num_c * offset_nram + 14 * offset_nram_calc; - float *nram_w_high_ptr_offset = - (float *)nram_buffer + split_num_c * offset_nram + 15 * offset_nram_calc; - - float *nram_w1 = - (float *)nram_buffer + split_num_c * offset_nram + 16 * offset_nram_calc; - float *nram_w2 = - 
(float *)nram_buffer + split_num_c * offset_nram + 17 * offset_nram_calc; - float *nram_w3 = - (float *)nram_buffer + split_num_c * offset_nram + 18 * offset_nram_calc; - float *nram_w4 = - (float *)nram_buffer + split_num_c * offset_nram + 19 * offset_nram_calc; - - float *nram_grad_weight = - (float *)nram_buffer + split_num_c * offset_nram + 20 * offset_nram_calc; - float *nram_base_ptr = - (float *)nram_buffer + split_num_c * offset_nram + 21 * offset_nram_calc; - float *nram_offset_temp = - (float *)nram_buffer + split_num_c * offset_nram + 22 * offset_nram_calc; - - float *nram_offset1 = - (float *)nram_buffer + split_num_c * offset_nram + 23 * offset_nram_calc; - float *nram_offset2 = - (float *)nram_buffer + split_num_c * offset_nram + 24 * offset_nram_calc; - float *nram_offset3 = - (float *)nram_buffer + split_num_c * offset_nram + 25 * offset_nram_calc; - float *nram_offset4 = - (float *)nram_buffer + split_num_c * offset_nram + 26 * offset_nram_calc; - - float *nram_w_low_temp = - (float *)nram_buffer + split_num_c * offset_nram + 27 * offset_nram_calc; - int32_t *nram_spatial_shapes = - (int32_t *)((float *)nram_buffer + split_num_c * offset_nram + - 28 * offset_nram_calc); - int32_t *nram_level_start_index = - (int32_t *)(nram_spatial_shapes + 2 * PAD_UP(num_levels, 32)); - float *nram_h_stride = - (float *)(nram_level_start_index + 3 * PAD_UP(num_levels, 32)); - - const int32_t total_num = batch * num_query; - int32_t num_per_core = total_num / taskDim; - int32_t num_rem = total_num % taskDim; - num_per_core = num_per_core + int32_t(taskId < num_rem); - num_per_time_theory = - num_per_core > num_per_time_theory ? num_per_time_theory : num_per_core; - int32_t num_deal_grid = num_per_time_theory * num_hlp; - - if (num_per_core == 0) return; - int32_t start_per_core = num_rem > taskId ? 
(taskId * num_per_core) - : (num_rem + taskId * num_per_core); - - const int32_t qid_stride = num_heads * channels; - int32_t deal_num_real = channels; - - const int32_t repeat_times = num_per_core / num_per_time_theory; - const int32_t tail_num = num_per_core % num_per_time_theory; - - int32_t num_per_time_real = num_per_time_theory; - - for (int32_t loop = 0; loop < num_heads; ++loop) { - ((int32_t *)nram_base_ptr)[loop] = loop * channels; - } - const int32_t w_stride = num_heads * channels; - for (int32_t grid_loop = 0; grid_loop < repeat_times + 1; ++grid_loop) { - int32_t grid_offset = - (start_per_core + grid_loop * num_per_time_theory) * num_hlp; - if (grid_loop == repeat_times) { - if (tail_num == 0) { - continue; - } else { - grid_offset = - (start_per_core + repeat_times * num_per_time_theory) * num_hlp; - num_per_time_real = tail_num; - num_deal_grid = tail_num * num_hlp; - } - } - __memcpy_async(nram_spatial_shapes, spatial_shapes, - num_levels * 2 * sizeof(int32_t), GDRAM2NRAM); - - __memcpy_async(nram_loc_w, data_sampling_loc + grid_offset * 2, - num_deal_grid * 2 * sizeof(float), GDRAM2NRAM); - - __sync_io_move_compute(); - __memcpy_async(nram_grad_weight, data_attn_weight + grid_offset, - num_deal_grid * sizeof(float), GDRAM2NRAM); - __memcpy_async(nram_level_start_index, data_level_start_index, - num_levels * sizeof(int32_t), GDRAM2NRAM); - computeGridMaskAndOffset( - nram_grad_output_tl, nram_grad_output_tr, nram_loc_w, nram_loc_h, - nram_h_stride, nram_spatial_shapes, nram_w_low_temp, nram_h_high_temp, - nram_w_low, nram_h_low, nram_h_high, nram_w_high, nram_lh, nram_lw, - nram_hh, nram_hw, nram_h_low_ptr_offset, nram_h_high_ptr_offset, - nram_w_low_ptr_offset, nram_w_high_ptr_offset, nram_w1, nram_w2, - nram_w3, nram_w4, nram_offset_temp, nram_offset1, nram_offset2, - nram_offset3, nram_offset4, nram_base_ptr, nram_h_low_temp, - num_deal_grid, num_per_time_real, num_heads, num_levels, num_points, - w_stride, qid_stride, grad_temp1); - float *mask1 = nram_h_low_ptr_offset; - float *mask2 = nram_h_high_ptr_offset; - float *mask3 = nram_w_low_ptr_offset; - float *mask4 = nram_w_high_ptr_offset; - __memcpy_async( - grad_temp2, - grad_output + (start_per_core + grid_loop * num_per_time_theory) * - num_heads * deal_num_real, - num_per_time_real * num_heads * deal_num_real * sizeof(float), - GDRAM2NRAM); - loadValue(nram_grad_output_tl, nram_grad_output_tr, nram_grad_output_bl, - nram_grad_output_br, data_value, grad_temp1, grad_temp3, mask1, - mask2, mask3, mask4, nram_offset1, nram_offset2, nram_offset3, - nram_offset4, nram_grad_weight, nram_level_start_index, - offset_nram, num_heads, deal_num_real, num_deal_grid, num_query, - num_levels, num_points, grid_offset, spatial_size, qid_stride); - - // compute grad_weight - float *grad_weight = grad_temp1; - float *grad_h_weight = grad_temp4; - float *grad_w_weight = grad_temp3; - computeGradAttnWeight( - grad_w_weight, grad_weight, nram_grad_output_tl, nram_grad_output_tr, - nram_grad_output_bl, nram_grad_output_br, grad_temp2, grad_attn_weight, - nram_hw, nram_hh, nram_lw, nram_lh, grad_h_weight, nram_w1, nram_w2, - nram_w3, nram_w4, offset_nram, num_deal_grid, deal_num_real, - num_per_time_real, num_heads, num_levels, num_points, grid_offset, - nram_h_high_temp); - - // compute grad_sampling_loc - computeGradSampingLoc(grad_sampling_loc, nram_grad_output_tl, - nram_grad_output_tr, grad_h_weight, grad_w_weight, - nram_spatial_shapes, grad_temp1, grad_temp2, - nram_grad_weight, num_deal_grid, deal_num_real, - num_per_time_real, 
num_heads, num_levels, num_points,
- grid_offset, nram_h_high_temp);
-
- float *nram_grid_offset1 = nram_loc_h;
- float *nram_grid_offset2 = nram_loc_w;
- computeGradValue(
- grad_temp1, grad_temp2, grad_temp3, grad_temp4, mask1, mask2, mask3,
- mask4, nram_offset1, nram_offset2, nram_offset3, nram_offset4,
- nram_level_start_index, deal_num_real, grad_value, nram_w1, nram_w2,
- nram_w3, nram_w4, num_per_time_real, num_heads, num_levels, num_points,
- num_query, num_deal_grid, grid_offset, spatial_size, qid_stride,
- nram_grid_offset1, nram_grid_offset2, batch, nram_grad_output_tl,
- nram_grad_output_tr, nram_grad_output_bl, nram_grad_output_br,
- nram_grad_weight);
- }
-}
-
-mluOpStatus_t MLUOP_WIN_API KernelMsDeformAttnBackwardSmallChannels(
- cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
- const float *data_value, const int32_t *spatial_shapes,
- const int32_t *data_level_start_index, const float *data_sampling_loc,
- const float *data_attn_weight, const float *grad_output,
- const int32_t batch, const int32_t spatial_size, const int32_t num_heads,
- const int32_t channels, const int32_t num_levels, const int32_t num_query,
- const int32_t num_points, float *grad_value, float *grad_sampling_loc,
- float *grad_attn_weight) {
- KERNEL_CHECK(
- MLUUnion1KernelMsDeformAttnBackwardSmallChannelsKernel<<<k_dim, k_type, queue>>>(
- data_value, spatial_shapes, data_level_start_index, data_sampling_loc,
- data_attn_weight, grad_output, batch, spatial_size, num_heads,
- channels, num_levels, num_query, num_points, grad_value,
- grad_sampling_loc, grad_attn_weight));
- return MLUOP_STATUS_SUCCESS;
-}
diff --git a/kernels/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu b/kernels/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu
deleted file mode 100644
index d70448a51..000000000
--- a/kernels/ms_deform_attn_backward/ms_deform_attn_backward_union1.mlu
+++ /dev/null
@@ -1,296 +0,0 @@
-/*************************************************************************
- * Copyright (C) [2022] by Cambricon, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *************************************************************************/
-#include "ms_deform_attn_backward.h"
-
-#include "core/logging.h"
-#include "kernels/kernel.h"
-#include "kernels/utils/common.h"
-
-__nram__ char nram_buffer[MAX_NRAM_SIZE];
-
-#define likely(x) __builtin_expect((x), 1)
-#define ALIGN_NUM 64
-#define ALIGN_NUM_FOR_REDUCE 32
-#define LEN_FLOAT sizeof(float)
-
-template <typename T>
-void __mlu_func__ msDeformAttnCol2imBilinear(
- T *top_grad_temp, const int32_t &height, const int32_t &width, const T &w1,
- const T &w2, const T &w3, const T &w4, const int32_t &h_low,
- const int32_t &w_low, const int32_t &h_high, const int32_t &w_high,
- const int32_t &base_ptr, const int32_t &h_low_ptr_offset,
- const int32_t &w_low_ptr_offset, const int32_t &h_high_ptr_offset,
- const int32_t &w_high_ptr_offset, const T &hh, const T &hw, const T &lh,
- const T &lw, T *top_grad, const T &data_attn_weight, T *grad_h_weight,
- T *grad_w_weight, T *grad_value, T *grad_output_nram, T *grad_weight,
- T *grad_sampling_loc, T *grad_attn_weight, T *grad_output_nram_temp,
- const int32_t &deal_num, const int32_t &deal_num_real,
- const T *data_value_ptr) {
-#if __BANG_ARCH__ >= 372
- if (h_low >= 0 && w_low >= 0) {
- int32_t offset1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
- __memcpy(grad_output_nram, data_value_ptr + offset1,
- deal_num_real * sizeof(T), GDRAM2NRAM);
- __bang_mul_scalar(grad_weight, grad_output_nram, hw, deal_num_real);
- __bang_sub(grad_h_weight, grad_h_weight, grad_weight, deal_num_real);
- __bang_mul_scalar(grad_weight, grad_output_nram, hh, deal_num_real);
- __bang_sub(grad_w_weight, grad_w_weight, grad_weight, deal_num_real);
-
- __bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num_real);
- __bang_mul_scalar(top_grad_temp, top_grad_temp, w1, deal_num_real);
- // for calc grad_attn_weight
- __bang_mul_scalar(grad_output_nram, grad_output_nram, w1, deal_num_real);
- __bang_atomic_reduce_add((T *)(grad_value + offset1), (T *)top_grad_temp,
- deal_num_real);
- }
- if (h_low >= 0 && w_high <= width - 1) {
- int32_t offset2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
- __memcpy(grad_output_nram_temp, data_value_ptr + offset2,
- deal_num_real * sizeof(T), GDRAM2NRAM);
- __bang_mul_scalar(grad_weight, grad_output_nram_temp, lw, deal_num_real);
- __bang_sub(grad_h_weight, grad_h_weight, grad_weight, deal_num_real);
- __bang_mul_scalar(grad_weight, grad_output_nram_temp, hh, deal_num_real);
- __bang_add(grad_w_weight, grad_w_weight, grad_weight, deal_num_real);
-
- __bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num_real);
- __bang_mul_scalar(top_grad_temp, top_grad_temp, w2, deal_num_real);
-
- __bang_mul_scalar(grad_output_nram_temp, grad_output_nram_temp, w2,
- deal_num_real);
- __bang_add(grad_output_nram, grad_output_nram, grad_output_nram_temp,
- deal_num_real);
- __bang_atomic_reduce_add((T *)(grad_value + offset2), (T *)top_grad_temp,
- deal_num_real);
- }
- if (h_high <= height - 1 && w_low >= 0) {
- int32_t offset3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
- __memcpy(grad_output_nram_temp, data_value_ptr + offset3,
- deal_num_real * sizeof(T), GDRAM2NRAM);
- __bang_mul_scalar(grad_weight, grad_output_nram_temp, hw, deal_num_real);
- __bang_add(grad_h_weight, grad_h_weight, grad_weight, deal_num_real);
- __bang_mul_scalar(grad_weight, grad_output_nram_temp, lh, deal_num_real);
- __bang_sub(grad_w_weight, grad_w_weight, grad_weight, deal_num_real);
-
- __bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight,
deal_num_real); - __bang_mul_scalar(top_grad_temp, top_grad_temp, w3, deal_num_real); - // for calc grad_attn_weight - __bang_mul_scalar(grad_output_nram_temp, grad_output_nram_temp, w3, - deal_num_real); - __bang_add(grad_output_nram, grad_output_nram, grad_output_nram_temp, - deal_num_real); - __bang_atomic_reduce_add((T *)(grad_value + offset3), (T *)top_grad_temp, - deal_num_real); - } - if (h_high <= height - 1 && w_high <= width - 1) { - int32_t offset4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; - __memcpy(grad_output_nram_temp, data_value_ptr + offset4, - deal_num_real * sizeof(T), GDRAM2NRAM); - __bang_mul_scalar(grad_weight, grad_output_nram_temp, lw, deal_num_real); - __bang_add(grad_h_weight, grad_h_weight, grad_weight, deal_num_real); - __bang_mul_scalar(grad_weight, grad_output_nram_temp, lh, deal_num_real); - __bang_add(grad_w_weight, grad_w_weight, grad_weight, deal_num_real); - - __bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num_real); - __bang_mul_scalar(top_grad_temp, top_grad_temp, w4, deal_num_real); - // for calc grad_attn_weight - __bang_mul_scalar(grad_output_nram_temp, grad_output_nram_temp, w4, - deal_num_real); - __bang_add(grad_output_nram, grad_output_nram, grad_output_nram_temp, - deal_num_real); - - __bang_atomic_reduce_add((T *)(grad_value + offset4), (T *)top_grad_temp, - deal_num_real); - } - __bang_mul(grad_output_nram, grad_output_nram, top_grad, deal_num_real); - __mluop_recursive_sum_pool(grad_output_nram, 1, deal_num_real, - ALIGN_NUM_FOR_REDUCE); - __bang_atomic_reduce_add((T *)grad_attn_weight, (T *)grad_output_nram, 1); - __bang_mul_scalar(grad_w_weight, grad_w_weight, width, deal_num_real); - __bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num_real); - __bang_mul(grad_w_weight, grad_w_weight, top_grad_temp, deal_num_real); - __mluop_recursive_sum_pool(grad_w_weight, 1, deal_num_real, - ALIGN_NUM_FOR_REDUCE); - __bang_atomic_reduce_add((T *)(grad_sampling_loc), (T *)grad_w_weight, 1); - - __bang_mul_scalar(grad_h_weight, grad_h_weight, height, deal_num_real); - __bang_mul(grad_h_weight, grad_h_weight, top_grad_temp, deal_num_real); - __mluop_recursive_sum_pool(grad_h_weight, 1, deal_num_real, - ALIGN_NUM_FOR_REDUCE); - __bang_atomic_reduce_add((T *)(grad_sampling_loc + 1), (T *)grad_h_weight, 1); -#endif -} - -__mlu_global__ void MLUUnion1KernelMsDeformAttnBackwardDefault( - const float *data_value, const int32_t *spatial_shapes, - const int32_t *data_level_start_index, const float *data_sampling_loc, - const float *data_attn_weight, const float *grad_output, - const int32_t batch, const int32_t spatial_size, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_query, - const int32_t num_points, float *grad_value, float *grad_sampling_loc, - float *grad_attn_weight) { -#if __BANG_ARCH__ != 520 - if (__is_mpu()) { - return; - } - const int32_t split_num = 8; - const int32_t spatial_shapes_size = 64; - - const int32_t deal_num = PAD_DOWN( - (MAX_NRAM_SIZE - spatial_shapes_size) / split_num / LEN_FLOAT, ALIGN_NUM); - float *grad_output_nram = (float *)nram_buffer; - float *grad_output_nram_temp = (float *)nram_buffer + deal_num; - float *grad_weight = (float *)nram_buffer + 2 * deal_num; - float *grad_h_weight = (float *)nram_buffer + 3 * deal_num; - float *grad_w_weight = (float *)nram_buffer + 4 * deal_num; - float *top_grad = (float *)nram_buffer + 5 * deal_num; - float *top_grad_temp = (float *)nram_buffer + 6 * deal_num; - int32_t *spatial_shapes_nram = - 
(int32_t *)((float *)nram_buffer + 7 * deal_num); - float *sampling_loc_nram = - (float *)nram_buffer + 7 * deal_num + 2 * sizeof(int32_t); - const int32_t total_num = batch * num_query * num_heads * num_levels; - int32_t num_per_core = total_num / taskDim; - int32_t num_rem = total_num % taskDim; - num_per_core = num_per_core + int32_t(taskId < num_rem); - int32_t start_per_core = - num_rem > taskId - ? (taskId * num_per_core) - : ((num_per_core + 1) * num_rem + (taskId - num_rem) * num_per_core); - int32_t end_per_core = start_per_core + num_per_core; - const int32_t C_repeat = channels / deal_num; - const int32_t C_tail = channels % deal_num; - const int32_t qid_stride = num_heads * channels; - for (int32_t num_loop = start_per_core; num_loop < end_per_core; ++num_loop) { - const int32_t l_col = num_loop % num_levels; - const int32_t m_col = num_loop / num_levels % num_heads; - const int32_t q_col = num_loop / num_levels / num_heads % num_query; - const int32_t b_col = num_loop / num_query / num_heads / num_levels; - int32_t data_weight_ptr = num_loop * num_points; - int32_t data_loc_w_ptr = data_weight_ptr << 1; - const int32_t value_offset = b_col * spatial_size * qid_stride; - const int32_t level_start_id = data_level_start_index[l_col]; - const int32_t grad_attn_weight_out = num_loop * num_points; - int32_t spatial_h_ptr = l_col << 1; - int32_t grad_output_offset = - b_col * num_query * qid_stride + q_col * qid_stride + m_col * channels; - __memcpy(spatial_shapes_nram, spatial_shapes + spatial_h_ptr, - 2 * sizeof(int32_t), GDRAM2NRAM); - const int32_t spatial_h = spatial_shapes_nram[0]; - const int32_t spatial_w = spatial_shapes_nram[1]; - const int32_t h_stride = spatial_w * qid_stride; - const int32_t value_ptr_offset = value_offset + level_start_id * qid_stride; - const float *data_value_ptr = data_value + value_ptr_offset; - float *grad_value_ptr = grad_value + value_ptr_offset; - - const int32_t grad_sampling_loc_out = num_loop * num_points << 1; - for (int32_t p_col = 0; p_col < num_points; ++p_col) { - __memcpy(sampling_loc_nram, data_sampling_loc + data_loc_w_ptr, - (LEN_FLOAT << 1), GDRAM2NRAM); - const float loc_w = sampling_loc_nram[0]; - const float loc_h = sampling_loc_nram[1]; - const float weight = data_attn_weight[data_weight_ptr]; - const float h_im = loc_h * spatial_h - 0.5; - const float w_im = loc_w * spatial_w - 0.5; - if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { - const int32_t h_low = floorf(h_im); - const int32_t w_low = floorf(w_im); - const int32_t h_high = h_low + 1; - const int32_t w_high = w_low + 1; - - const float lh = h_im - h_low; - const float lw = w_im - w_low; - const float hh = 1.0 - lh; - const float hw = 1.0 - lw; - - const int32_t h_low_ptr_offset = h_low * h_stride; - const int32_t h_high_ptr_offset = h_low_ptr_offset + h_stride; - const int32_t w_low_ptr_offset = w_low * qid_stride; - const int32_t w_high_ptr_offset = w_low_ptr_offset + qid_stride; - - const float w1 = hh * hw; - const float w2 = hh * lw; - const float w3 = lh * hw; - const float w4 = lh * lw; - if (likely(C_tail != 0)) { - const int32_t base_ptr = m_col * channels + C_repeat * deal_num; - __bang_write_zero(grad_h_weight, PAD_UP(channels, ALIGN_NUM)); - __bang_write_zero(grad_w_weight, PAD_UP(channels, ALIGN_NUM)); - __bang_write_zero(grad_output_nram, PAD_UP(channels, ALIGN_NUM)); - - __memcpy(top_grad, - grad_output + grad_output_offset + C_repeat * deal_num, - C_tail * LEN_FLOAT, GDRAM2NRAM); - msDeformAttnCol2imBilinear( - top_grad_temp, 
spatial_h, spatial_w, w1, w2, w3, w4, h_low, w_low,
- h_high, w_high, base_ptr, h_low_ptr_offset, w_low_ptr_offset,
- h_high_ptr_offset, w_high_ptr_offset, hh, hw, lh, lw, top_grad,
- weight, grad_h_weight, grad_w_weight, grad_value_ptr,
- grad_output_nram, grad_weight,
- grad_sampling_loc + grad_sampling_loc_out + (p_col << 1),
- grad_attn_weight + grad_attn_weight_out + p_col,
- grad_output_nram_temp, deal_num, C_tail, data_value_ptr);
- }
- for (int32_t C_loop = 0; C_loop < C_repeat; ++C_loop) {
- const int32_t base_ptr = m_col * channels + C_loop * deal_num;
- __bang_write_zero(grad_h_weight, PAD_UP(channels, ALIGN_NUM));
- __bang_write_zero(grad_w_weight, PAD_UP(channels, ALIGN_NUM));
- __bang_write_zero(grad_output_nram, PAD_UP(channels, ALIGN_NUM));
- __memcpy(top_grad,
- grad_output + grad_output_offset + C_loop * deal_num,
- deal_num * LEN_FLOAT, GDRAM2NRAM);
- msDeformAttnCol2imBilinear(
- top_grad_temp, spatial_h, spatial_w, w1, w2, w3, w4, h_low, w_low,
- h_high, w_high, base_ptr, h_low_ptr_offset, w_low_ptr_offset,
- h_high_ptr_offset, w_high_ptr_offset, hh, hw, lh, lw, top_grad,
- weight, grad_h_weight, grad_w_weight, grad_value_ptr,
- grad_output_nram, grad_weight,
- grad_sampling_loc + grad_sampling_loc_out + (p_col << 1),
- grad_attn_weight + grad_attn_weight_out + p_col,
- grad_output_nram_temp, deal_num, deal_num, data_value_ptr);
- }
- }
- data_weight_ptr += 1;
- data_loc_w_ptr += 2;
- }
- }
-
-#endif
-}
-
-mluOpStatus_t MLUOP_WIN_API KernelMsDeformAttnBackwardDefault(
- cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
- const float *data_value, const int32_t *spatial_shapes,
- const int32_t *data_level_start_index, const float *data_sampling_loc,
- const float *data_attn_weight, const float *grad_output,
- const int32_t batch, const int32_t spatial_size, const int32_t num_heads,
- const int32_t channels, const int32_t num_levels, const int32_t num_query,
- const int32_t num_points, float *grad_value, float *grad_sampling_loc,
- float *grad_attn_weight) {
- KERNEL_CHECK(
- MLUUnion1KernelMsDeformAttnBackwardDefault<<<k_dim, k_type, queue>>>(
- data_value, spatial_shapes, data_level_start_index, data_sampling_loc,
- data_attn_weight, grad_output, batch, spatial_size, num_heads,
- channels, num_levels, num_query, num_points, grad_value,
- grad_sampling_loc, grad_attn_weight));
- return MLUOP_STATUS_SUCCESS;
-}
diff --git a/kernels/ms_deform_attn_forward/ms_deform_attn_forward.h b/kernels/ms_deform_attn_forward/ms_deform_attn_forward.h
deleted file mode 100644
index 942601345..000000000
--- a/kernels/ms_deform_attn_forward/ms_deform_attn_forward.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*************************************************************************
- * Copyright (C) [2022] by Cambricon, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *************************************************************************/
-#ifndef KERNELS_MS_DEFORM_ATTN_FORWARD_MS_DEFORM_ATTN_FORWARD_H_
-#define KERNELS_MS_DEFORM_ATTN_FORWARD_MS_DEFORM_ATTN_FORWARD_H_
-
-#include "kernels/kernel.h"
-#include "mlu_op.h"
-
-#define MIN(a, b) (((a) < (b)) ? (a) : (b))
-#define MS_DEFORM_ATTN_FORWARD_HEADVECTOR 1
-
-template <typename T>
-__mlu_global__ void MLUKernelMsDeformAttnForwardDefault(
- const char *data_value_gdram, const char *data_spatial_shapes_gdram,
- const char *data_level_start_index_gdram,
- const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram,
- const int batch_size, const int num_keys, const int num_heads,
- const int channels, const int num_levels, const int num_queries,
- const int num_points, char *data_col_gdram);
-
-template <typename T>
-__mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel(
- const char *data_value_gdram, const char *data_spatial_shapes_gdram,
- const char *data_level_start_index_gdram,
- const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram,
- const int batch_size, const int num_keys, const int num_heads,
- const int channels, const int num_levels, const int num_queries,
- const int num_points, char *data_col_gdram);
-
-template <typename T>
-__mlu_global__ void MLUKernelMsDeformAttnForwardFast(
- const char *data_value_gdram, const char *data_spatial_shapes_gdram,
- const char *data_level_start_index_gdram,
- const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram,
- const int32_t batch_size, const int32_t num_keys, const int32_t num_heads,
- const int32_t channels, const int32_t num_levels, const int32_t num_queries,
- const int32_t num_points, char *data_col_gdram);
-
-#endif // KERNELS_MS_DEFORM_ATTN_FORWARD_MS_DEFORM_ATTN_FORWARD_H_
diff --git a/kernels/ms_deform_attn_forward/ms_deform_attn_forward.mlu b/kernels/ms_deform_attn_forward/ms_deform_attn_forward.mlu
deleted file mode 100644
index 22346b76d..000000000
--- a/kernels/ms_deform_attn_forward/ms_deform_attn_forward.mlu
+++ /dev/null
@@ -1,340 +0,0 @@
-/*************************************************************************
- * Copyright (C) [2022] by Cambricon, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "kernels/ms_deform_attn_forward/ms_deform_attn_forward.h" - -#include "core/context.h" -#include "core/logging.h" -#include "core/gen_case.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/tool.h" -#include "core/type.h" -#include "kernels/debug.h" -#include "kernels/kernel.h" -#include "kernels/utils/cnnl_helper.h" - -typedef enum { - /*!< Index is invalid. */ - MS_DEFORM_ATTN_FORWARD_INVALID = 0, - /*!< MLUKernelMsDeformAttnForwardDefault */ - MS_DEFORM_ATTN_FORWARD_DEFAULT = 1, - /*!< MLUKernelMsDeformAttnForwardSmallChannel */ - MS_DEFORM_ATTN_FORWARD_SMALL_CHANNEL = 2, - /*!< MLUKernelMsDeformAttnForwardFast */ - MS_DEFORM_ATTN_FORWARD_FAST = 3, -} MsDeformAttnForwardPolicy; - -MsDeformAttnForwardPolicy msDeformAttnForwardPolicyFunc( - const mluOpHandle_t handle, cnrtDim3_t *k_dims, cnrtFunctionType_t *k_type, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points) { - // start U1 task - k_dims->x = mluop::runtime::getCoreNumOfEachUnionCapability(handle); - k_dims->y = - MIN((batch_size * num_queries * num_heads + k_dims->x - 1) / k_dims->x, - mluop::runtime::getClusterLimitCapability(handle)); - k_dims->z = 1; - - *k_type = CNRT_FUNC_TYPE_UNION1; - - int32_t nlp = num_levels * num_points; - int32_t nlpc = num_levels * num_points * channels; - - if (handle->arch == MLUOP_MLU370 && nlp <= 128 && nlpc <= 12288) { - return MS_DEFORM_ATTN_FORWARD_FAST; - } else if (handle->arch == MLUOP_MLU590 && nlp <= 128 && nlpc <= 8192) { - return MS_DEFORM_ATTN_FORWARD_FAST; - } else if (nlp * 3 * sizeof(int32_t) > handle->nram_size) { - return MS_DEFORM_ATTN_FORWARD_DEFAULT; - } else if (channels > handle->nram_size / 12 / sizeof(float) || - channels > 96 || channels < 16) { - return MS_DEFORM_ATTN_FORWARD_DEFAULT; - } else { - return MS_DEFORM_ATTN_FORWARD_SMALL_CHANNEL; - } -} - -static mluOpStatus_t paramcheck( - const mluOpTensorDescriptor_t data_value_desc, - const mluOpTensorDescriptor_t data_spatial_shapes_desc, - const mluOpTensorDescriptor_t data_level_start_index_desc, - const mluOpTensorDescriptor_t data_sampling_loc_desc, - const mluOpTensorDescriptor_t data_attn_weight_desc, - const mluOpTensorDescriptor_t data_col_desc) { - // check tensor dim - // params data_value: [bs, num_keys, num_heads, channels] - PARAM_CHECK_EQ("[mluOpMsDeformAttnForward]", data_value_desc->dim, 4); - // params data_spatial_shapes: [num_levels, 2] - PARAM_CHECK_EQ("[mluOpMsDeformAttnForward]", data_spatial_shapes_desc->dim, - 2); - PARAM_CHECK_EQ("[mluOpMsDeformAttnForward]", - data_spatial_shapes_desc->dims[1], 2); - // params data_level_start_index: [num_levels] - PARAM_CHECK_EQ("[mluOpMsDeformAttnForward]", data_level_start_index_desc->dim, - 1); - // params data_sampling_loc: - // [bs, num_queries, num_heads, num_levels, num_points, 2] - PARAM_CHECK_EQ("[mluOpMsDeformAttnForward]", data_sampling_loc_desc->dim, 6); - PARAM_CHECK_EQ("[mluOpMsDeformAttnForward]", data_sampling_loc_desc->dims[5], - 2); - // params data_attn_weight: - // [bs, num_queries, num_heads, num_levels, num_points] - 
PARAM_CHECK_EQ("[mluOpMsDeformAttnForward]", data_attn_weight_desc->dim, 5); - // params data_col: [bs, num_queries, num_heads, channels] - PARAM_CHECK_EQ("[mluOpMsDeformAttnForward]", data_col_desc->dim, 4); - // check tensor shape - PARAM_CHECK("[mluOpMsDeformAttnForward]", - (data_value_desc->dims[0] == data_col_desc->dims[0]) && - (data_sampling_loc_desc->dims[0] == data_col_desc->dims[0]) && - (data_attn_weight_desc->dims[0] == data_col_desc->dims[0])); - PARAM_CHECK("[mluOpMsDeformAttnForward]", - (data_value_desc->dims[2] == data_col_desc->dims[2]) && - (data_sampling_loc_desc->dims[2] == data_col_desc->dims[2]) && - (data_attn_weight_desc->dims[2] == data_col_desc->dims[2])); - PARAM_CHECK("[mluOpMsDeformAttnForward]", - data_value_desc->dims[3] == data_col_desc->dims[3]); - PARAM_CHECK("[mluOpMsDeformAttnForward]", - (data_spatial_shapes_desc->dims[0] == - data_level_start_index_desc->dims[0]) && - (data_spatial_shapes_desc->dims[0] == - data_sampling_loc_desc->dims[3]) && - (data_spatial_shapes_desc->dims[0] == - data_attn_weight_desc->dims[3])); - PARAM_CHECK("[mluOpMsDeformAttnForward]", - (data_sampling_loc_desc->dims[1] == data_col_desc->dims[1]) && - (data_attn_weight_desc->dims[1] == data_col_desc->dims[1])); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_sampling_loc_desc->dims[4] == - data_attn_weight_desc->dims[4]); - // check tensor datatype - PARAM_CHECK("[mluOpMsDeformAttnForward]", - data_value_desc->dtype == MLUOP_DTYPE_FLOAT); - PARAM_CHECK("[mluOpMsDeformAttnForward]", - data_spatial_shapes_desc->dtype == MLUOP_DTYPE_INT32); - PARAM_CHECK("[mluOpMsDeformAttnForward]", - data_level_start_index_desc->dtype == MLUOP_DTYPE_INT32); - // data_value, data_sampling_loc, data_attn_weight, - // data_col datatype must be the same - PARAM_CHECK("[mluOpMsDeformAttnForward]", - (data_value_desc->dtype == data_col_desc->dtype) && - (data_sampling_loc_desc->dtype == data_col_desc->dtype) && - (data_attn_weight_desc->dtype == data_col_desc->dtype)); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpMsDeformAttnForward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t data_value_desc, - const void *data_value, - const mluOpTensorDescriptor_t data_spatial_shapes_desc, - const void *data_spatial_shapes, - const mluOpTensorDescriptor_t data_level_start_index_desc, - const void *data_level_start_index, - const mluOpTensorDescriptor_t data_sampling_loc_desc, - const void *data_sampling_loc, - const mluOpTensorDescriptor_t data_attn_weight_desc, - const void *data_attn_weight, const int32_t im2col_step, - const mluOpTensorDescriptor_t data_col_desc, void *data_col) { - // handle and desc ptr check null - PARAM_CHECK("[mluOpMsDeformAttnForward]", handle != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_value_desc != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_spatial_shapes_desc != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", - data_level_start_index_desc != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_sampling_loc_desc != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_attn_weight_desc != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_col_desc != NULL); - // check params - mluOpStatus_t paramcheck_status = paramcheck( - data_value_desc, data_spatial_shapes_desc, data_level_start_index_desc, - data_sampling_loc_desc, data_attn_weight_desc, data_col_desc); - if (paramcheck_status != MLUOP_STATUS_SUCCESS) { - return paramcheck_status; - } - size_t data_value_element_num = 
mluOpGetTensorElementNum(data_value_desc); - size_t data_sampling_loc_element_num = - mluOpGetTensorElementNum(data_sampling_loc_desc); - size_t data_col_element_num = mluOpGetTensorElementNum(data_col_desc); - // check large tensor - TENSOR_NUM_CHECK("[mluOpMsDeformAttnForward]", data_value_element_num, - LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK("[mluOpMsDeformAttnForward]", data_sampling_loc_element_num, - LARGE_TENSOR_NUM, ""); - TENSOR_NUM_CHECK("[mluOpMsDeformAttnForward]", data_col_element_num, - LARGE_TENSOR_NUM, ""); - const int32_t batch_size = data_value_desc->dims[0]; - const int32_t num_keys = data_value_desc->dims[1]; - const int32_t num_heads = data_value_desc->dims[2]; - const int32_t channels = data_value_desc->dims[3]; - const int32_t num_levels = data_spatial_shapes_desc->dims[0]; - const int32_t num_queries = data_sampling_loc_desc->dims[1]; - const int32_t num_points = data_sampling_loc_desc->dims[4]; - // check element num zero - if (batch_size == 0 || num_heads == 0 || channels == 0 || num_queries == 0) { - LOG(ERROR) << "[mluOpMsDeformAttnForward] Check failed: element num zero."; - return MLUOP_STATUS_BAD_PARAM; - } - if (num_levels == 0 || num_points == 0) { - VLOG(5) << "cnnlFill_v3 start."; - const float fill_value = 0.0f; - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(data_col_desc, - cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, data_col)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - VLOG(5) << "cnnlFill_v3 end."; - VLOG(5) << "mluOpMsDeformAttnForward skip zero element."; - return MLUOP_STATUS_SUCCESS; - } - // check im2col_step param - const int32_t im2col_step_ = MIN(batch_size, im2col_step); - PARAM_CHECK("[mluOpMsDeformAttnForward]", im2col_step_ > 0); - PARAM_CHECK("[mluOpMsDeformAttnForward]", batch_size % im2col_step_ == 0); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_value != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_spatial_shapes != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_level_start_index != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_sampling_loc != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_attn_weight != NULL); - PARAM_CHECK("[mluOpMsDeformAttnForward]", data_col != NULL); - // generate mluOpMsDeformAttnForward prototxt start! 
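- // Worked example of the policy choice below: on MLU370 with
- // num_levels = 4 and num_points = 8 (nlp = 32), channels = 256 gives
- // nlpc = 8192 <= 12288, so MS_DEFORM_ATTN_FORWARD_FAST is selected;
- // channels = 512 gives nlpc = 16384 and 512 > 96, so the policy falls
- // back to MS_DEFORM_ATTN_FORWARD_DEFAULT.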
- if (MLUOP_GEN_CASE_ON_NEW) {
- GEN_CASE_START("ms_deform_attn_forward");
- // set handle dump mlu output
- GEN_CASE_HANDLE(handle);
- GEN_CASE_DATA(true, "data_value", data_value, data_value_desc, 10, -10);
- GEN_CASE_DATA(true, "data_spatial_shapes", data_spatial_shapes,
- data_spatial_shapes_desc, 10, -10);
- GEN_CASE_DATA(true, "data_level_start_index", data_level_start_index,
- data_level_start_index_desc, 10, -10);
- GEN_CASE_DATA(true, "data_sampling_loc", data_sampling_loc,
- data_sampling_loc_desc, 10, -10);
- GEN_CASE_DATA(true, "data_attn_weight", data_attn_weight,
- data_attn_weight_desc, 10, -10);
- GEN_CASE_DATA(false, "data_col", data_col, data_col_desc, 0, 0);
- GEN_CASE_OP_PARAM_SINGLE(0, "ms_deform_attn_forward", "im2col_step",
- im2col_step);
- GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0);
- }
- cnrtDim3_t k_dims;
- cnrtFunctionType_t k_type;
- MsDeformAttnForwardPolicy policy = msDeformAttnForwardPolicyFunc(
- handle, &k_dims, &k_type, batch_size, num_keys, num_heads, channels,
- num_levels, num_queries, num_points);
- switch (policy) {
- default: {
- VLOG(5) << "[mluOpMsDeformAttnForward] Policy not supported";
- return MLUOP_STATUS_BAD_PARAM;
- }; break;
- case MS_DEFORM_ATTN_FORWARD_DEFAULT: {
- switch (k_type) {
- default: {
- VLOG(5) << "Not Implemented";
- break;
- }
- case CNRT_FUNC_TYPE_BLOCK: {
- VLOG(5)
- << "Launch Kernel MLUKernelMsDeformAttnForwardDefault<<<Block>>>";
- KERNEL_CHECK(
- (MLUKernelMsDeformAttnForwardDefault<float>
- <<<k_dims, k_type, handle->queue>>>(
- (char *)data_value, (char *)data_spatial_shapes,
- (char *)data_level_start_index, (char *)data_sampling_loc,
- (char *)data_attn_weight, batch_size, num_keys, num_heads,
- channels, num_levels, num_queries, num_points,
- (char *)data_col)));
- break;
- }
- case CNRT_FUNC_TYPE_UNION1: {
- VLOG(5) << "Launch Kernel MLUKernelMsDeformAttnForwardDefault<<<Union1>>>";
- KERNEL_CHECK(
- (MLUKernelMsDeformAttnForwardDefault<float>
- <<<k_dims, k_type, handle->queue>>>(
- (char *)data_value, (char *)data_spatial_shapes,
- (char *)data_level_start_index, (char *)data_sampling_loc,
- (char *)data_attn_weight, batch_size, num_keys, num_heads,
- channels, num_levels, num_queries, num_points,
- (char *)data_col)));
- break;
- }
- }
- break;
- }
- case MS_DEFORM_ATTN_FORWARD_SMALL_CHANNEL: {
- switch (k_type) {
- default: {
- VLOG(5) << "Not Implemented";
- break;
- }
- case CNRT_FUNC_TYPE_BLOCK: {
- VLOG(5) << "Launch Kernel "
- "MLUKernelMsDeformAttnForwardSmallChannel<<<Block>>>";
- KERNEL_CHECK(
- (MLUKernelMsDeformAttnForwardSmallChannel<float>
- <<<k_dims, k_type, handle->queue>>>(
- (char *)data_value, (char *)data_spatial_shapes,
- (char *)data_level_start_index, (char *)data_sampling_loc,
- (char *)data_attn_weight, batch_size, num_keys, num_heads,
- channels, num_levels, num_queries, num_points,
- (char *)data_col)));
- break;
- }
- case CNRT_FUNC_TYPE_UNION1: {
- VLOG(5) << "Launch Kernel "
- "MLUKernelMsDeformAttnForwardSmallChannel<<<Union1>>>";
- KERNEL_CHECK(
- (MLUKernelMsDeformAttnForwardSmallChannel<float>
- <<<k_dims, k_type, handle->queue>>>(
- (char *)data_value, (char *)data_spatial_shapes,
- (char *)data_level_start_index, (char *)data_sampling_loc,
- (char *)data_attn_weight, batch_size, num_keys, num_heads,
- channels, num_levels, num_queries, num_points,
- (char *)data_col)));
- break;
- }
- }
- break;
- }
- case MS_DEFORM_ATTN_FORWARD_FAST: {
- VLOG(5) << "Launch Kernel MLUKernelMsDeformAttnForwardFast<<<Union1>>>";
- KERNEL_CHECK((MLUKernelMsDeformAttnForwardFast<float>
- <<<k_dims, k_type, handle->queue>>>(
- (char *)data_value, (char *)data_spatial_shapes,
- (char *)data_level_start_index,
- (char *)data_sampling_loc, (char *)data_attn_weight,
batch_size, num_keys, num_heads, channels, num_levels, - num_queries, num_points, (char *)data_col))); - break; - } - } - GEN_CASE_END(); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/ms_deform_attn_forward/ms_deform_attn_utils.h b/kernels/ms_deform_attn_forward/ms_deform_attn_utils.h deleted file mode 100644 index 4e9360927..000000000 --- a/kernels/ms_deform_attn_forward/ms_deform_attn_utils.h +++ /dev/null @@ -1,398 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#ifndef KERNELS_MS_DEFORM_ATTN_FORWARD_MS_DEFORM_ATTN_UTILS_H_ -#define KERNELS_MS_DEFORM_ATTN_FORWARD_MS_DEFORM_ATTN_UTILS_H_ - -#include -#include - -#include "kernels/kernel.h" -#include "kernels/utils/common.h" - -#define BIT_COLLECT_PAD (8) -#define BACKWARD_MAX_NQ_NL_NP (1024) - -#if (__BANG_ARCH__ >= 372) - -__mlu_func__ void broadcastSpatialHW( - float* spatial_offset_bd_nram, // (num_levels, num_points) - float* spatial_h_bd_nram, // (num_levels, num_points) - float* spatial_w_bd_nram, // (num_levels, num_points) - int32_t* spatial_shapes_nram, // (num_levels, 2) - int32_t* spatial_offset_nram, // (num_levels) - const int32_t num_levels, const int32_t num_points) { - __bang_int322float((float*)spatial_shapes_nram, spatial_shapes_nram, - num_levels * 2, 0); - __memcpy(spatial_h_bd_nram, spatial_shapes_nram, sizeof(float), NRAM2NRAM, - sizeof(float), num_points - 1, num_points * sizeof(float), - num_levels - 1, 0, num_points - 1, 2 * sizeof(float), - num_levels - 1); - __memcpy(spatial_w_bd_nram, (float*)spatial_shapes_nram + 1, sizeof(float), - NRAM2NRAM, sizeof(float), num_points - 1, num_points * sizeof(float), - num_levels - 1, 0, num_points - 1, 2 * sizeof(float), - num_levels - 1); - __bang_int322float((float*)spatial_offset_nram, spatial_offset_nram, - num_levels, 0); - __memcpy(spatial_offset_bd_nram, spatial_offset_nram, sizeof(float), - NRAM2NRAM, sizeof(float), num_points - 1, num_points * sizeof(float), - num_levels - 1, 0, num_points - 1, sizeof(float), num_levels - 1); -} - -template -__mlu_func__ void prepareLoopV2( - int32_t* seq_nram, T* zeros_nram, int32_t* spatial_offset_nram, - int32_t* spatial_hw_nram, int8_t* mask_x_nram, int8_t* mask_y_nram, - T* spatial_offset_bd_nram, T* spatial_h_bd_nram, T* spatial_w_bd_nram, - T* value_sram, const void* 
data_level_start_index_gdram,
-    const void* data_spatial_shapes_gdram, const int32_t num_keys,
-    const int32_t num_levels, const int32_t num_points,
-    const int32_t max_deal_n, const int32_t mask_size, const int32_t channels) {
-  if (seq_nram != nullptr) {
-    for (int i = 0; i < 8; i++) {
-      seq_nram[i] = i;
-    }
-    __bang_add_scalar(seq_nram + 8, seq_nram, 8, 8);        // [0, 7] + 8
-    __bang_add_scalar(seq_nram + 16, seq_nram, 16, 16);     // [0, 15] + 16
-    __bang_add_scalar(seq_nram + 32, seq_nram, 32, 32);     // [0, 31] + 32
-    __bang_add_scalar(seq_nram + 64, seq_nram, 64, 64);
-    __bang_add_scalar(seq_nram + 128, seq_nram, 128, 128);
-    __bang_add_scalar(seq_nram + 256, seq_nram, 256, 256);
-    __bang_add_scalar(seq_nram + 512, seq_nram, 512, 512);  // [0, 511] + 512
-  }
-  __bang_write_value(zeros_nram, channels, (T)0);
-  __bang_write_value(mask_x_nram, mask_size, (char)0x55);
-  __bang_write_value(mask_y_nram, mask_size, (char)0xAA);
-  __memcpy_async(spatial_offset_nram, data_level_start_index_gdram,
-                 num_levels * sizeof(int32_t), GDRAM2NRAM);
-  __memcpy_async(spatial_hw_nram, data_spatial_shapes_gdram,
-                 num_levels * 2 * sizeof(int32_t), GDRAM2NRAM);
-  __sync_io_move_compute();
-  broadcastSpatialHW(spatial_offset_bd_nram, spatial_h_bd_nram,
-                     spatial_w_bd_nram, spatial_hw_nram, spatial_offset_nram,
-                     num_levels, num_points);
-}
-
-/*
-  Split batch*head across taskDimY, then split num_queries across coreDim.
-  This plan keeps data_value resident on SRAM.
-*/
-__mlu_func__ void splitTaskV1(
-    int32_t& cluster_begin_batch_head, int32_t& cluster_act_batch_head,
-    int32_t& cluster_end_batch_head, int32_t& core_begin_query,
-    int32_t& core_act_query, int32_t& core_loop_num, int32_t& core_step_query,
-    const int32_t max_deal_n, const int32_t batch_size, const int32_t num_keys,
-    const int32_t num_heads, const int32_t channels, const int32_t num_levels,
-    const int32_t num_queries, const int32_t num_points) {
-  // split batch*head into taskDimY
-  int32_t batch_head = batch_size * num_heads;
-  int32_t cluster_avg_batch_head = (batch_head + taskDimY - 1) / taskDimY;
-  cluster_begin_batch_head = taskIdY * cluster_avg_batch_head;
-  cluster_act_batch_head =
-      std::min(cluster_avg_batch_head, batch_head - cluster_begin_batch_head);
-  cluster_end_batch_head = cluster_begin_batch_head + cluster_act_batch_head;
-  // split query into coreDim
-  int32_t core_avg_query = (num_queries + coreDim - 1) / coreDim;
-  core_begin_query = coreId * core_avg_query;
-  core_act_query = std::min(num_queries - core_begin_query, core_avg_query);
-  core_loop_num = (core_act_query + max_deal_n - 1) / max_deal_n;
-  core_step_query = core_loop_num > 0
-                        ? (core_act_query + core_loop_num - 1) / core_loop_num
-                        : 0;
-}
-
-/*
-  Split num_queries across taskDim.
-  Each core iterates over batch * head.
-*/
-__mlu_func__ void splitTaskV2(
-    int32_t& cluster_begin_batch_head, int32_t& cluster_act_batch_head,
-    int32_t& cluster_end_batch_head, int32_t& core_begin_query,
-    int32_t& core_act_query, int32_t& core_loop_num, int32_t& core_step_query,
-    const int32_t max_deal_n, const int32_t batch_size, const int32_t num_keys,
-    const int32_t num_heads, const int32_t channels, const int32_t num_levels,
-    const int32_t num_queries, const int32_t num_points) {
-  // do not split batch*head
-  int32_t batch_head = batch_size * num_heads;
-  cluster_begin_batch_head = 0;
-  cluster_act_batch_head = batch_head;
-  cluster_end_batch_head = batch_head;
-  // split query into taskDim
-  int32_t core_avg_query = (num_queries + taskDim - 1) / taskDim;
-  core_begin_query = taskId * core_avg_query;
-  core_act_query = std::min(num_queries - core_begin_query, core_avg_query);
-  core_loop_num = (core_act_query + max_deal_n - 1) / max_deal_n;
-  core_step_query = core_loop_num > 0
-                        ? (core_act_query + core_loop_num - 1) / core_loop_num
-                        : 0;
-}
-
-template <typename T>
-__mlu_func__ void computePolationWeightOffsetCond(
-    int32_t* data_offset_nram, T* weight_polation_nram,
-    T* cond_point_polation_nram, T* cond_point_valid_nram, T* loc_nram,
-    int8_t* mask_x_nram, int8_t* mask_y_nram, T* spatial_offset_bd_nram,
-    T* spatial_w_bd_nram, T* spatial_h_bd_nram, T* delata_xy_nram, T* buf_nram,
-    const bool cached_delta_xy, const int32_t deal_n, const int32_t num_levels,
-    const int32_t num_points, const int32_t num_heads, const int32_t channels) {
-  int32_t total_points = deal_n * num_levels * num_points;
-  int32_t block_points = num_levels * num_points;
-  T* buf_x_nram = buf_nram;
-  T* buf_y_nram = buf_nram + total_points;
-  T* buf_cond_nram = buf_nram + 2 * total_points;
-  T* buf_x_floor = buf_nram + 2 * total_points;
-  T* buf_x_ceil = buf_nram + 3 * total_points;
-  T* buf_y_floor = buf_nram + 4 * total_points;
-  T* buf_y_ceil = buf_nram + 5 * total_points;
-  //================================================================================================
-  int32_t total_coord_pad = PAD_UP(total_points * 2, BIT_COLLECT_PAD);
-  __bang_collect_bitindex(buf_x_nram, loc_nram, mask_x_nram, total_coord_pad);
-  __bang_collect_bitindex(buf_y_nram, loc_nram, mask_y_nram, total_coord_pad);
-  // x = loc_x * spatial_w - 0.5; y = loc_y * spatial_h - 0.5;
-  __bang_fusion(FUSION_FMS, buf_x_nram, buf_x_nram, spatial_w_bd_nram, (T)0.5,
-                total_points, block_points);
-  __bang_fusion(FUSION_FMS, buf_y_nram, buf_y_nram, spatial_h_bd_nram, (T)0.5,
-                total_points, block_points);
-  //================================================================================================
-  // get point condition.
use buf0, buf1, buf2 - // (x > -1 && y > -1 && y < spatial_h && x < spatial_w) - __bang_gt_scalar(cond_point_valid_nram, buf_x_nram, (T)-1.0, total_points); - __bang_gt_scalar(buf_cond_nram, buf_y_nram, (T)-1.0, total_points); - __bang_and(cond_point_valid_nram, cond_point_valid_nram, buf_cond_nram, - total_points); - __bang_cycle_lt(buf_cond_nram, buf_x_nram, spatial_w_bd_nram, total_points, - block_points); - __bang_and(cond_point_valid_nram, cond_point_valid_nram, buf_cond_nram, - total_points); - __bang_cycle_lt(buf_cond_nram, buf_y_nram, spatial_h_bd_nram, total_points, - block_points); - __bang_and(cond_point_valid_nram, cond_point_valid_nram, buf_cond_nram, - total_points); - //================================================================================================ - __bang_floor(buf_x_floor, buf_x_nram, total_points); - __bang_add_scalar(buf_x_ceil, buf_x_floor, 1.0, total_points); - __bang_floor(buf_y_floor, buf_y_nram, total_points); - __bang_add_scalar(buf_y_ceil, buf_y_floor, 1.0, total_points); - T* cond_point_polation_nram_tl = cond_point_polation_nram; - T* cond_point_polation_nram_bl = cond_point_polation_nram + total_points; - T* cond_point_polation_nram_tr = cond_point_polation_nram + 2 * total_points; - T* cond_point_polation_nram_br = cond_point_polation_nram + 3 * total_points; - T* cond_point_polation_nram_cond1 = weight_polation_nram; - T* cond_point_polation_nram_cond2 = weight_polation_nram + total_points; - T* cond_point_polation_nram_cond3 = weight_polation_nram + 2 * total_points; - T* cond_point_polation_nram_cond4 = weight_polation_nram + 3 * total_points; - __bang_ge_scalar(cond_point_polation_nram_cond1, buf_x_floor, (T)0, - total_points); - __bang_cycle_lt(cond_point_polation_nram_cond2, buf_x_ceil, spatial_w_bd_nram, - total_points, block_points); - __bang_ge_scalar(cond_point_polation_nram_cond3, buf_y_floor, (T)0, - total_points); - __bang_cycle_lt(cond_point_polation_nram_cond4, buf_y_ceil, spatial_h_bd_nram, - total_points, block_points); - __bang_and(cond_point_polation_nram_tl, cond_point_polation_nram_cond1, - cond_point_polation_nram_cond4, total_points); - __bang_and(cond_point_polation_nram_bl, cond_point_polation_nram_cond1, - cond_point_polation_nram_cond3, total_points); - __bang_and(cond_point_polation_nram_tr, cond_point_polation_nram_cond2, - cond_point_polation_nram_cond4, total_points); - __bang_and(cond_point_polation_nram_br, cond_point_polation_nram_cond2, - cond_point_polation_nram_cond3, total_points); - //================================================================================================ - // get polation weight. 
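[editor's note] The masked __bang_and sequence above encodes, per sampling point, which of the four bilinear corners lies inside the feature map, and the code just below derives the matching interpolation weights. An equivalent scalar version for a single point may help; this is a sketch of the same math, not the shipped vector code:

    #include <cmath>

    // One sampling point; corner order matches tl/bl/tr/br above.
    void bilinearCondRef(float loc_x, float loc_y, int W, int H,
                         bool cond[4], float w[4], bool* point_valid) {
      float x = loc_x * W - 0.5f, y = loc_y * H - 0.5f;  // the FUSION_FMS step
      *point_valid = x > -1.f && y > -1.f && x < W && y < H;
      float x0 = std::floor(x), y0 = std::floor(y);      // floor corner
      float x1 = x0 + 1.f, y1 = y0 + 1.f;                // ceil corner
      float dx = x - x0, dy = y - y0;
      bool cx0 = x0 >= 0.f, cx1 = x1 < W, cy0 = y0 >= 0.f, cy1 = y1 < H;
      cond[0] = cx0 && cy1; w[0] = (1.f - dx) * dy;          // tl: (x_floor, y_ceil)
      cond[1] = cx0 && cy0; w[1] = (1.f - dx) * (1.f - dy);  // bl: (x_floor, y_floor)
      cond[2] = cx1 && cy1; w[2] = dx * dy;                  // tr: (x_ceil, y_ceil)
      cond[3] = cx1 && cy0; w[3] = dx * (1.f - dy);          // br: (x_ceil, y_floor)
    }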
- T* buf_dx = (T*)data_offset_nram; - T* buf_dy = buf_dx + total_points; - T* buf_dx_1 = buf_dy + total_points; - T* buf_dy_1 = buf_dx_1 + total_points; - T* weight_polation_nram_1 = weight_polation_nram; - T* weight_polation_nram_2 = weight_polation_nram + 1 * total_points; - T* weight_polation_nram_3 = weight_polation_nram + 2 * total_points; - T* weight_polation_nram_4 = weight_polation_nram + 3 * total_points; - // T* weight_polation_nram_buf = buf_nram + 4 * total_points; - __bang_sub(buf_dx, buf_x_floor, buf_x_nram, total_points); // -dx - __bang_sub(buf_dy, buf_y_floor, buf_y_nram, total_points); // -dy - __bang_fusion(FUSION_FSS, buf_dx_1, buf_x_nram, buf_x_floor, - (T)1.0, // dx - 1 - total_points, total_points); - __bang_fusion(FUSION_FSS, buf_dy_1, buf_y_nram, buf_y_floor, - (T)1.0, // dy - 1 - total_points, total_points); - __bang_mul(weight_polation_nram_1, buf_dx_1, buf_dy, - total_points); // (-dy)(dx-1) - __bang_mul(weight_polation_nram_2, buf_dx_1, buf_dy_1, - total_points); // (dx-1)*(dy-1) - __bang_mul(weight_polation_nram_3, buf_dx, buf_dy, - total_points); // (-dx)*(-dy) - __bang_mul(weight_polation_nram_4, buf_dx, buf_dy_1, - total_points); // (-dx)*(dy-1) - if (cached_delta_xy) { - __bang_sub(delata_xy_nram, buf_x_nram, buf_x_floor, total_points); // dx - __bang_add_scalar(delata_xy_nram + total_points, buf_dx, 1, - total_points); // 1-dx - __bang_sub(delata_xy_nram + 2 * total_points, buf_y_nram, buf_y_floor, - total_points); // dy - __bang_add_scalar(delata_xy_nram + 3 * total_points, buf_dy, 1, - total_points); // 1-dy - } - //================================================================================================ - // correct the x,y in [0, w-1] and [0, h-1] - T* spatial_w1_bd_nram = buf_nram; - T* spatial_h1_bd_nram = buf_nram + block_points; - __bang_sub_scalar(spatial_w1_bd_nram, spatial_w_bd_nram, (T)1, block_points); - __bang_sub_scalar(spatial_h1_bd_nram, spatial_h_bd_nram, (T)1, block_points); - __bang_maxeq_scalar(buf_x_floor, buf_x_floor, (T)0, total_points); - __bang_maxeq_scalar(buf_x_ceil, buf_x_ceil, (T)0, total_points); - __bang_cycle_minequal(buf_x_floor, buf_x_floor, spatial_w1_bd_nram, - total_points, block_points); - __bang_cycle_minequal(buf_x_ceil, buf_x_ceil, spatial_w1_bd_nram, - total_points, block_points); - __bang_maxeq_scalar(buf_y_floor, buf_y_floor, (T)0, total_points); - __bang_maxeq_scalar(buf_y_ceil, buf_y_ceil, (T)0, total_points); - __bang_cycle_minequal(buf_y_floor, buf_y_floor, spatial_h1_bd_nram, - total_points, block_points); - __bang_cycle_minequal(buf_y_ceil, buf_y_ceil, spatial_h1_bd_nram, - total_points, block_points); - //================================================================================================ - // offset = y*w + x - T* buf_hw_offset = buf_nram; - T* data_offset_nram_tl = (T*)data_offset_nram; - T* data_offset_nram_bl = data_offset_nram_tl + total_points; - T* data_offset_nram_tr = data_offset_nram_bl + total_points; - T* data_offset_nram_br = data_offset_nram_tr + total_points; - // y_ceil*w + offset + x_floor - __bang_fusion(FUSION_FMA, buf_hw_offset, buf_y_ceil, spatial_w_bd_nram, - spatial_offset_bd_nram, total_points, block_points); - __bang_add(data_offset_nram_tl, buf_hw_offset, buf_x_floor, total_points); - // y_ceil*w + offset + x_ceil - __bang_add(data_offset_nram_tr, buf_hw_offset, buf_x_ceil, total_points); - // y_floor*w + offset + x_foor - __bang_fusion(FUSION_FMA, buf_hw_offset, buf_y_floor, spatial_w_bd_nram, - spatial_offset_bd_nram, total_points, block_points); - 
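[editor's note] The gather offsets assembled here are plain row-major flattening plus the per-level base offset, scaled to bytes just below by num_heads * channels * sizeof(T). A scalar sketch of one corner's offset; the helper name is hypothetical and float data is assumed:

    // offset = (level_start + y*w + x) elements, then scaled to a byte stride.
    inline int cornerByteOffset(int level_start, int y, int x, int W,
                                int num_heads, int channels) {
      int elem = level_start + y * W + x;
      return elem * num_heads * channels * (int)sizeof(float);
    }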
__bang_add(data_offset_nram_bl, buf_hw_offset, buf_x_floor, total_points); - // y_floor*w + offset + x_ceil - __bang_add(data_offset_nram_br, buf_hw_offset, buf_x_ceil, total_points); - __bang_float2int32(data_offset_nram, (T*)data_offset_nram, total_points * 4, - 0); - int32_t stride = num_heads * channels * sizeof(T); - __bang_mul_scalar(data_offset_nram, data_offset_nram, stride, - total_points * 4); - //================================================================================================ - // merge conditions and clear weight, cast conditions to bits - T* cond_point_polation_nram_tmp = buf_nram; - __bang_cycle_and(cond_point_polation_nram, cond_point_polation_nram, - cond_point_valid_nram, 4 * total_points, total_points); - __bang_float2int32((int32_t*)cond_point_polation_nram_tmp, - cond_point_polation_nram, total_points * 4, 0); - __bang_mul_scalar((int32_t*)cond_point_polation_nram_tmp, - (int32_t*)cond_point_polation_nram_tmp, (int32_t)0xffffffff, - total_points * 4); - __bang_band((char*)weight_polation_nram, (char*)weight_polation_nram, - (char*)cond_point_polation_nram_tmp, - total_points * 4 * sizeof(float)); -} - -/* - compute condition, polation_weight, offset and store to SRAM. - cache_delta_xy and cache_point_valid is true in backward, false in forward. -*/ -template -__mlu_func__ void stageOneLoop( - T* sampling_loc_gdram, T* weight_attn_gdram, int32_t* data_offset_nram, - void* delata_xy_nram, T* weight_polation_nram, T* cond_point_polation_nram, - T* cond_point_valid_nram, T* loc_nram, T* buf_nram, T* buf_nram_end, - int8_t* mask_x_nram, int8_t* mask_y_nram, T* spatial_offset_bd_nram, - T* spatial_w_bd_nram, T* spatial_h_bd_nram, int32_t* spatial_offset_nram, - int32_t* spatial_hw_nram, int32_t* data_offset_sram, void* delta_xy_sram, - T* weight_polation_sram, T* weight_attn_sram, T* cond_point_polation_sram, - const bool cache_delta_xy, const bool cache_point_valid, - const int32_t total_deal_n, const int32_t max_deal_n, - const int32_t num_heads, const int32_t channels, const int32_t num_levels, - const int32_t num_points, const int32_t input_stride_2, - const int32_t input_stride_3) { - int32_t loop_num = (total_deal_n + max_deal_n - 1) / max_deal_n; - int32_t num_levels_points = num_levels * num_points; - int32_t sram_offset = 0; - int32_t sram_dst_stride = total_deal_n * num_levels_points * sizeof(T); - for (int i = 0; i < loop_num; i++) { - int32_t deal_n = std::min(total_deal_n - i * max_deal_n, max_deal_n); - int32_t deal_point_num = deal_n * num_levels_points; - int32_t copy_size = deal_point_num * sizeof(T); - __memcpy(loc_nram, sampling_loc_gdram + i * max_deal_n * input_stride_3 * 2, - input_stride_2 * 2 * sizeof(T), GDRAM2NRAM, - input_stride_2 * 2 * sizeof(T), input_stride_3 * 2 * sizeof(T), - deal_n - 1); - computePolationWeightOffsetCond( - data_offset_nram, weight_polation_nram, cond_point_polation_nram, - cond_point_valid_nram, loc_nram, mask_x_nram, mask_y_nram, - spatial_offset_bd_nram, spatial_w_bd_nram, spatial_h_bd_nram, - (T*)delata_xy_nram, buf_nram, cache_delta_xy, deal_n, num_levels, - num_points, num_heads, channels); - __memcpy(data_offset_sram + sram_offset, data_offset_nram, copy_size, - NRAM2SRAM, sram_dst_stride, copy_size, 3); - __memcpy(weight_polation_sram + sram_offset, weight_polation_nram, - copy_size, NRAM2SRAM, sram_dst_stride, copy_size, 3); - __memcpy(cond_point_polation_sram + sram_offset, cond_point_polation_nram, - copy_size, NRAM2SRAM, sram_dst_stride, copy_size, 3); - if (cache_point_valid) { - 
__memcpy(cond_point_polation_sram + 4 * total_deal_n * num_levels_points + - sram_offset, - cond_point_valid_nram, copy_size, NRAM2SRAM); - } - if (cache_delta_xy) { - __memcpy((T*)delta_xy_sram + sram_offset, delata_xy_nram, copy_size, - NRAM2SRAM, sram_dst_stride, copy_size, 3); - } - __memcpy(buf_nram, weight_attn_gdram + i * max_deal_n * input_stride_3, - input_stride_2 * sizeof(T), GDRAM2NRAM, input_stride_2 * sizeof(T), - input_stride_3 * sizeof(T), deal_n - 1); - __bang_float2int32((int32_t*)cond_point_valid_nram, cond_point_valid_nram, - deal_point_num, 0); - __bang_mul_scalar((int32_t*)cond_point_valid_nram, - (int32_t*)cond_point_valid_nram, (int32_t)0xffffffff, - deal_point_num); - __bang_band((char*)buf_nram, (char*)buf_nram, (char*)cond_point_valid_nram, - deal_n * num_levels * num_points * sizeof(T)); - __memcpy(weight_attn_sram + sram_offset, buf_nram, copy_size, NRAM2SRAM); - sram_offset += deal_point_num; - } - __sync_io_move_compute(); -} -#endif - -#if (__BANG_ARCH__ == 592) -__mlu_func__ void gatherAsync(void* dst, void* src, unsigned int* offset, - void* mask, int transfer_size, - mluMemcpyDirection_t dir, int dst_stride, - int transfer_num) { - __gather_async(dst, src, offset, mask, transfer_size, dir, dst_stride, - transfer_num); -} - -__mlu_func__ void gatherSync(void* dst, void* src, unsigned int* offset, - void* mask, int transfer_size, - mluMemcpyDirection_t dir, int dst_stride, - int transfer_num) { - __gather(dst, src, offset, mask, transfer_size, dir, dst_stride, - transfer_num); -} -#endif - -#endif diff --git a/kernels/ms_deform_attn_forward/msda_forward_fast_union1.mlu b/kernels/ms_deform_attn_forward/msda_forward_fast_union1.mlu deleted file mode 100644 index 8397bb276..000000000 --- a/kernels/ms_deform_attn_forward/msda_forward_fast_union1.mlu +++ /dev/null @@ -1,1280 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "ms_deform_attn_utils.h" - -#pragma bang walign(64) - -#if (__BANG_ARCH__ >= 372) - -#define MAX_MEMCPY_SEGNUM (65536) -#define NRAM_REMAIN_SIZE (48 * 1024) -#define SRAM_REMAIN_SIZE (32 * 1024) -#define NRAM_AVALIABLE_SIZE (__MLU_NRAM_SIZE__ * 1024 - NRAM_REMAIN_SIZE) -#define WRAM_AVALIABLE_SIZE (__MLU_WRAM_SIZE__ * 1024) -#define SRAM_AVALIABLE_SIZE (__MLU_SRAM_SIZE__ * 1024 - SRAM_REMAIN_SIZE) -#define SRAM_FOR_VALUE_SIZE (SRAM_AVALIABLE_SIZE - 128) - -#ifndef LT_NUM -#define LT_NUM 64 -#endif - -#ifndef WRAM_LT_STRIDE -#define WRAM_LT_STRIDE (__MLU_WRAM_SIZE__ * 1024 / LT_NUM) -#endif - -#ifndef WRAM_ALIGN_SIZE -#define WRAM_ALIGN_SIZE (64) -#endif - -__nram__ char nram_buffer[NRAM_AVALIABLE_SIZE]; -__mlu_shared__ char sram_buffer[SRAM_AVALIABLE_SIZE]; -__wram__ char wram_buffer[WRAM_AVALIABLE_SIZE]; - -template -__mlu_func__ void tileWeight2WramAsync(T* dst, - T* src, // (co, ci) - int32_t co, int32_t ci, int32_t pad_co, - int32_t pad_ci) { - int32_t co_num = co / LT_NUM; - int32_t co_remain = co % LT_NUM; - if (co_num > 0) { - __memcpy_async(dst, src, ci * sizeof(T), NRAM2WRAM, WRAM_LT_STRIDE, - LT_NUM - 1, pad_ci * sizeof(T), co_num - 1, ci * sizeof(T), - LT_NUM - 1, LT_NUM * ci * sizeof(T), co_num - 1); - } - if (co_remain > 0) { - __memcpy_async(dst + co_num * pad_ci, src + co_num * LT_NUM * ci, - ci * sizeof(T), NRAM2WRAM, WRAM_LT_STRIDE, ci * sizeof(T), - co_remain - 1); - } -} - -template -__mlu_func__ void tileWeight2WramSync(T* dst, - T* src, // (co, ci) - int32_t co, int32_t ci, int32_t pad_co, - int32_t pad_ci) { - int32_t co_num = co / LT_NUM; - int32_t co_remain = co % LT_NUM; - if (co_num > 0) { - __memcpy(dst, src, ci * sizeof(T), NRAM2WRAM, WRAM_LT_STRIDE, LT_NUM - 1, - pad_ci * sizeof(T), co_num - 1, ci * sizeof(T), LT_NUM - 1, - LT_NUM * ci * sizeof(T), co_num - 1); - } - if (co_remain > 0) { - __memcpy(dst + co_num * pad_ci, src + co_num * LT_NUM * ci, ci * sizeof(T), - NRAM2WRAM, WRAM_LT_STRIDE, ci * sizeof(T), co_remain - 1); - } -} - -template -__mlu_func__ void isValueContainInfNan(T* input_sram, T* output_sram, - T* nram_buf, bool& value_contain_infnan, - int32_t nram_buf_size, - int32_t data_num) { - int32_t core_avg_num = (data_num + coreDim - 1) / coreDim; - int32_t core_begin_num = core_avg_num * coreId; - int32_t core_act_num = __mluop_min(data_num - core_begin_num, core_avg_num); - int32_t core_step_num = - PAD_DOWN(nram_buf_size - NFU_ALIGN_SIZE, NFU_ALIGN_SIZE) / sizeof(T); - int32_t c = NFU_ALIGN_SIZE / sizeof(T); - int32_t loop_num = (core_act_num + core_step_num - 1) / core_step_num; - int32_t remain_num = (int)(loop_num > 0) * (core_act_num % core_step_num); - T* input_sram_base = input_sram + core_begin_num; - T* nram_out = nram_buf; - T* nram_input = nram_buf + NFU_ALIGN_SIZE / sizeof(T); - T sum = 0; - - if (remain_num > 0) { - int32_t n = (remain_num + c - 1) / c; - __bang_write_value(nram_input + (n - 1) * c, c, (T)0); - } - - for (int32_t i = 0; i < loop_num; i++) { - int32_t deal_num = - __mluop_min(core_step_num, core_act_num - i * core_step_num); - int32_t n = (deal_num + c - 1) / c; - __memcpy(nram_input, input_sram_base + i * core_step_num, - deal_num * sizeof(T), SRAM2NRAM); - __bang_sumpool(nram_out, nram_input, c, n, 1, n, 1, 1, 1); - __bang_sumpool(nram_input, nram_out, 1, c, 1, c, 1, 1, 1); - T tmp = nram_input[0]; - if (isnan(tmp) || isinf(tmp)) { - sum = 1; - break; - } else { - sum = 0; - } - } - - output_sram[coreId] = sum; - 
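[editor's note] isValueContainInfNan() above never compares element-wise: each core sum-reduces its tile with __bang_sumpool and tests only the scalar result, since a NaN anywhere propagates into the sum and an Inf yields Inf or NaN. The check is conservative (a finite sum that overflows also trips it), which merely selects the slower masked path. The idea in plain C++, as a sketch rather than the kernel code:

    #include <cmath>

    bool containsInfNan(const float* data, int n) {
      float sum = 0.0f;
      for (int i = 0; i < n; ++i) sum += data[i];  // NaN/Inf propagate into sum
      return std::isnan(sum) || std::isinf(sum);   // one scalar test per tile
    }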
__sync_all_ipu_within_cluster(); - __memcpy(nram_input, output_sram, coreDim * sizeof(T), SRAM2NRAM); - value_contain_infnan = - (nram_input[0] + nram_input[1] + nram_input[2] + nram_input[3]) > 0; -} - -template -__mlu_func__ void getConditionCoordWeight( - int32_t* data_offset_nram, T* weight_polation_nram, - T* cond_point_polation_nram, T* cond_point_valid_nram, T* loc_nram, - T* weight_attn_nram, int8_t* mask_x_nram, int8_t* mask_y_nram, - T* spatial_offset_bd_nram, T* spatial_w_bd_nram, T* spatial_h_bd_nram, - T* buf_nram, bool& w_contain_inf, const bool value_contain_infnan, - const int32_t deal_n, const int32_t num_levels, const int32_t num_points, - const int32_t num_heads, const int32_t channels) { - int32_t total_points = deal_n * num_levels * num_points; - int32_t block_points = num_levels * num_points; - T* buf_x_nram = buf_nram; - T* buf_y_nram = buf_nram + total_points; - T* buf_cond_nram = buf_nram + 2 * total_points; - T* buf_x_floor = buf_nram + 2 * total_points; - T* buf_x_ceil = buf_nram + 3 * total_points; - T* buf_y_floor = buf_nram + 4 * total_points; - T* buf_y_ceil = buf_nram + 5 * total_points; - //================================================================================================ - // if weight_attn_nram contain inf - int32_t inf_p = 0x7f7fffff; - int32_t inf_n = 0xff7fffff; - T inf_p_f = *((T*)&inf_p); - T inf_n_f = *((T*)&inf_n); - __bang_lt_scalar(buf_nram, weight_attn_nram, inf_n_f, total_points); - __bang_gt_scalar(buf_nram + total_points, weight_attn_nram, inf_p_f, - total_points); - __bang_sumpool(buf_nram + 2 * total_points, buf_nram, 1, 2 * total_points, 1, - 2 * total_points, 1, 1, 1); - w_contain_inf = buf_nram[2 * total_points] > 0; - //================================================================================================ - int32_t total_coord_pad = PAD_UP(total_points * 2, BIT_COLLECT_PAD); - __bang_collect_bitindex(buf_x_nram, loc_nram, mask_x_nram, total_coord_pad); - __bang_collect_bitindex(buf_y_nram, loc_nram, mask_y_nram, total_coord_pad); - // x = loc_x * spatial_w - 0.5; y = loc_y * spatial_h - 0.5; - __bang_fusion(FUSION_FMS, buf_x_nram, buf_x_nram, spatial_w_bd_nram, (T)0.5, - total_points, block_points); - __bang_fusion(FUSION_FMS, buf_y_nram, buf_y_nram, spatial_h_bd_nram, (T)0.5, - total_points, block_points); - //================================================================================================ - // get point condition. 
use buf0, buf1, buf2 - // (x > -1 && y > -1 && y < spatial_h && x < spatial_w) - __bang_gt_scalar(cond_point_valid_nram, buf_x_nram, (T)-1.0, total_points); - __bang_gt_scalar(buf_cond_nram, buf_y_nram, (T)-1.0, total_points); - __bang_and(cond_point_valid_nram, cond_point_valid_nram, buf_cond_nram, - total_points); - __bang_cycle_lt(buf_cond_nram, buf_x_nram, spatial_w_bd_nram, total_points, - block_points); - __bang_and(cond_point_valid_nram, cond_point_valid_nram, buf_cond_nram, - total_points); - __bang_cycle_lt(buf_cond_nram, buf_y_nram, spatial_h_bd_nram, total_points, - block_points); - __bang_and(cond_point_valid_nram, cond_point_valid_nram, buf_cond_nram, - total_points); - //================================================================================================ - __bang_floor(buf_x_floor, buf_x_nram, total_points); - __bang_add_scalar(buf_x_ceil, buf_x_floor, 1.0, total_points); - __bang_floor(buf_y_floor, buf_y_nram, total_points); - __bang_add_scalar(buf_y_ceil, buf_y_floor, 1.0, total_points); - T* cond_point_polation_nram_tl = cond_point_polation_nram; - T* cond_point_polation_nram_bl = cond_point_polation_nram + total_points; - T* cond_point_polation_nram_tr = cond_point_polation_nram + 2 * total_points; - T* cond_point_polation_nram_br = cond_point_polation_nram + 3 * total_points; - T* cond_point_polation_nram_cond1 = weight_polation_nram; - T* cond_point_polation_nram_cond2 = weight_polation_nram + total_points; - T* cond_point_polation_nram_cond3 = weight_polation_nram + 2 * total_points; - T* cond_point_polation_nram_cond4 = weight_polation_nram + 3 * total_points; - __bang_ge_scalar(cond_point_polation_nram_cond1, buf_x_floor, (T)0, - total_points); - __bang_cycle_lt(cond_point_polation_nram_cond2, buf_x_ceil, spatial_w_bd_nram, - total_points, block_points); - __bang_ge_scalar(cond_point_polation_nram_cond3, buf_y_floor, (T)0, - total_points); - __bang_cycle_lt(cond_point_polation_nram_cond4, buf_y_ceil, spatial_h_bd_nram, - total_points, block_points); - __bang_and(cond_point_polation_nram_tl, cond_point_polation_nram_cond1, - cond_point_polation_nram_cond4, total_points); - __bang_and(cond_point_polation_nram_bl, cond_point_polation_nram_cond1, - cond_point_polation_nram_cond3, total_points); - __bang_and(cond_point_polation_nram_tr, cond_point_polation_nram_cond2, - cond_point_polation_nram_cond4, total_points); - __bang_and(cond_point_polation_nram_br, cond_point_polation_nram_cond2, - cond_point_polation_nram_cond3, total_points); - //================================================================================================ - // get polation weight. 
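[editor's note] The weight code below leans on sign-folded identities so each corner weight is a single multiply of terms already at hand (-dx = x_floor - x, -dy = y_floor - y), exactly as the w1..w4 comments state. A quick numeric self-check of those identities (a sketch, for any dx, dy in [0, 1)):

    #include <cassert>
    #include <cmath>

    void checkWeightIdentities(float dx, float dy) {
      float ndx = -dx, ndy = -dy;  // what the kernel actually holds
      assert(std::fabs((1 - dx) * dy - (dx - 1) * ndy) < 1e-6f);            // w1
      assert(std::fabs((1 - dx) * (1 - dy) - (dx - 1) * (dy - 1)) < 1e-6f); // w2
      assert(std::fabs(dx * dy - ndx * ndy) < 1e-6f);                       // w3
      assert(std::fabs(dx * (1 - dy) - ndx * (dy - 1)) < 1e-6f);            // w4
    }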
- T* buf_dx = (T*)data_offset_nram; - T* buf_dy = buf_dx + total_points; - T* buf_dx_1 = buf_dy + total_points; - T* buf_dy_1 = buf_dx_1 + total_points; - // -dx = x_floor-x - // -dy = y_floor-y - // w1 = (1-dx)*dy = (dx-1)*(-dy) - // w2 = (1-dx)*(1-dy) = (dx-1)*(dy-1) - // w3 = dx*dy = (-dx)*(-dy) - // w4 = dx*(1-dy) = (-dx)*(dy-1) - T* weight_polation_nram_1 = weight_polation_nram; - T* weight_polation_nram_2 = weight_polation_nram + 1 * total_points; - T* weight_polation_nram_3 = weight_polation_nram + 2 * total_points; - T* weight_polation_nram_4 = weight_polation_nram + 3 * total_points; - // T* weight_polation_nram_buf = buf_nram + 4 * total_points; - __bang_sub(buf_dx, buf_x_floor, buf_x_nram, total_points); - __bang_sub(buf_dy, buf_y_floor, buf_y_nram, total_points); - __bang_fusion(FUSION_FSS, buf_dx_1, buf_x_nram, buf_x_floor, (T)1.0, - total_points, total_points); - __bang_fusion(FUSION_FSS, buf_dy_1, buf_y_nram, buf_y_floor, (T)1.0, - total_points, total_points); - __bang_mul(weight_polation_nram_1, buf_dx_1, buf_dy, total_points); - __bang_mul(weight_polation_nram_2, buf_dx_1, buf_dy_1, total_points); - __bang_mul(weight_polation_nram_3, buf_dx, buf_dy, total_points); - __bang_mul(weight_polation_nram_4, buf_dx, buf_dy_1, total_points); - //================================================================================================ - // correct the x,y in [0, w-1] and [0, h-1] - T* spatial_w1_bd_nram = buf_nram; - T* spatial_h1_bd_nram = buf_nram + block_points; - __bang_sub_scalar(spatial_w1_bd_nram, spatial_w_bd_nram, (T)1, block_points); - __bang_sub_scalar(spatial_h1_bd_nram, spatial_h_bd_nram, (T)1, block_points); - __bang_maxeq_scalar(buf_x_floor, buf_x_floor, (T)0, total_points); - __bang_maxeq_scalar(buf_x_ceil, buf_x_ceil, (T)0, total_points); - __bang_cycle_minequal(buf_x_floor, buf_x_floor, spatial_w1_bd_nram, - total_points, block_points); - __bang_cycle_minequal(buf_x_ceil, buf_x_ceil, spatial_w1_bd_nram, - total_points, block_points); - __bang_maxeq_scalar(buf_y_floor, buf_y_floor, (T)0, total_points); - __bang_maxeq_scalar(buf_y_ceil, buf_y_ceil, (T)0, total_points); - __bang_cycle_minequal(buf_y_floor, buf_y_floor, spatial_h1_bd_nram, - total_points, block_points); - __bang_cycle_minequal(buf_y_ceil, buf_y_ceil, spatial_h1_bd_nram, - total_points, block_points); - //================================================================================================ - // offset = y*w + x - T* buf_hw_offset = buf_nram; - T* data_offset_nram_tl = (T*)data_offset_nram; - T* data_offset_nram_bl = data_offset_nram_tl + total_points; - T* data_offset_nram_tr = data_offset_nram_bl + total_points; - T* data_offset_nram_br = data_offset_nram_tr + total_points; - // y_ceil*w + offset + x_floor - __bang_fusion(FUSION_FMA, buf_hw_offset, buf_y_ceil, spatial_w_bd_nram, - spatial_offset_bd_nram, total_points, block_points); - __bang_add(data_offset_nram_tl, buf_hw_offset, buf_x_floor, total_points); - // y_ceil*w + offset + x_ceil - __bang_add(data_offset_nram_tr, buf_hw_offset, buf_x_ceil, total_points); - // y_floor*w + offset + x_foor - __bang_fusion(FUSION_FMA, buf_hw_offset, buf_y_floor, spatial_w_bd_nram, - spatial_offset_bd_nram, total_points, block_points); - __bang_add(data_offset_nram_bl, buf_hw_offset, buf_x_floor, total_points); - // y_floor*w + offset + x_ceil - __bang_add(data_offset_nram_br, buf_hw_offset, buf_x_ceil, total_points); - //================================================================================================ - // merge and select 
conditions and weight - T* weight_polation_nram_tmp = (T*)buf_nram; - __bang_cycle_and(cond_point_polation_nram, cond_point_polation_nram, - cond_point_valid_nram, 4 * total_points, total_points); - if (!w_contain_inf) { - __bang_cycle_mul(weight_polation_nram, weight_polation_nram, - weight_attn_nram, 4 * total_points, total_points); - } - __bang_mul_scalar(buf_nram, weight_attn_nram, (T)1, total_points); - __bang_collect((float*)weight_attn_nram, (float*)buf_nram, - cond_point_valid_nram, total_points); - __bang_float2int32((int32_t*)cond_point_polation_nram, - cond_point_polation_nram, total_points * 4, 0); - __bang_mul_scalar((int32_t*)cond_point_polation_nram, - (int32_t*)cond_point_polation_nram, (int32_t)0xffffffff, - total_points * 4); - __bang_band((char*)weight_polation_nram_tmp, (char*)weight_polation_nram, - (char*)cond_point_polation_nram, - total_points * 4 * sizeof(float)); - __bang_collect((float*)weight_polation_nram, (float*)weight_polation_nram_tmp, - cond_point_valid_nram, total_points); - __bang_collect((float*)weight_polation_nram + total_points, - (float*)weight_polation_nram_tmp + total_points, - cond_point_valid_nram, total_points); - __bang_collect((float*)weight_polation_nram + 2 * total_points, - (float*)weight_polation_nram_tmp + 2 * total_points, - cond_point_valid_nram, total_points); - __bang_collect((float*)weight_polation_nram + 3 * total_points, - (float*)weight_polation_nram_tmp + 3 * total_points, - cond_point_valid_nram, total_points); - //================================================================================================ - // select cond_point_polation_nram if value_contain_infnan - if (value_contain_infnan) { - int32_t* cond_point_polation_nram_tmp = (int32_t*)buf_nram; - __bang_mul_scalar((int32_t*)cond_point_polation_nram_tmp, - (int32_t*)cond_point_polation_nram, (int32_t)1, - total_points * 4); - __bang_collect((float*)cond_point_polation_nram, - (float*)cond_point_polation_nram_tmp, cond_point_valid_nram, - total_points); - __bang_collect((float*)cond_point_polation_nram + total_points, - (float*)cond_point_polation_nram_tmp + total_points, - cond_point_valid_nram, total_points); - __bang_collect((float*)cond_point_polation_nram + 2 * total_points, - (float*)cond_point_polation_nram_tmp + 2 * total_points, - cond_point_valid_nram, total_points); - __bang_collect((float*)cond_point_polation_nram + 3 * total_points, - (float*)cond_point_polation_nram_tmp + 3 * total_points, - cond_point_valid_nram, total_points); - } - //================================================================================================ - // compute and select offset and stride - int32_t* data_offset_nram_tl_tmp = (int32_t*)buf_nram; - int32_t* data_offset_nram_bl_tmp = data_offset_nram_tl_tmp + total_points; - int32_t* data_offset_nram_tr_tmp = data_offset_nram_bl_tmp + total_points; - __bang_float2int32(data_offset_nram_tl_tmp, data_offset_nram_tl, - total_points * 4, 0); - int32_t stride = - SRAM_STAY ? 
channels * sizeof(T) : num_heads * channels * sizeof(T); - __bang_mul_scalar(data_offset_nram_tl_tmp, data_offset_nram_tl_tmp, stride, - total_points * 4); - __bang_sub((int32_t*)data_offset_nram_bl_tmp, - (int32_t*)data_offset_nram_bl_tmp, - (int32_t*)data_offset_nram_tl_tmp, total_points); - __bang_sub((int32_t*)data_offset_nram_tr_tmp, - (int32_t*)data_offset_nram_tr_tmp, - (int32_t*)data_offset_nram_tl_tmp, total_points); - __bang_collect((float*)data_offset_nram_tl, (float*)data_offset_nram_tl_tmp, - cond_point_valid_nram, total_points); - __bang_collect((float*)data_offset_nram_bl, (float*)data_offset_nram_bl_tmp, - cond_point_valid_nram, total_points); - __bang_collect((float*)data_offset_nram_tr, (float*)data_offset_nram_tr_tmp, - cond_point_valid_nram, total_points); -} - -/* - shape of each tensor: - output_nram: (channels) - input_nram: (4, valid_num, channels) - input_trans: (channels, 4, valid_num) - input_pooled: (channels, valid_num) - cond_selected_base: (4, deal_n, num_levels, num_points) - weight_selected_base: (4, deal_n, num_levels, num_points) - weight_attn_nram: (valid_num) - weight_compute: (4, valid_num) - cond_compute: (4, valid_num) - input_wram: (channels, 4 * valid_num) - - valid_num <= num_levels * num_points - sample_stride_3 = deal_n * num_levels * num_points - - Note: - If w_contain_inf is true, cannot merge attn_w and polation_w, so use sumpool - twice. If w_contain_inf is false, merge attn_w and polation_w and use matmul - instead. If value_contain_infnan is true, fill data_value of invalid - neighbors with 0. -*/ -template -__mlu_func__ void reduceLevelByConv( - T* output_nram, T* input_nram, T* input_trans, T* input_pooled, - int32_t* cond_selected_base, T* weight_selected_base, T* weight_attn_nram, - T* weight_compute, int32_t* cond_compute, T* input_wram, - const int32_t valid_num, const int32_t channels, - const int32_t sample_stride_3, const bool w_contain_inf, - const bool value_contain_infnan) { - if (valid_num > 0) { - int32_t ci = 4 * valid_num; - int32_t pad_ci = PAD_UP(ci, WRAM_ALIGN_SIZE / sizeof(T)); - int32_t co = channels; - int32_t pad_co = PAD_UP(co, LT_NUM); - if (value_contain_infnan) { - __memcpy_async(cond_compute, cond_selected_base, - valid_num * sizeof(int32_t), NRAM2NRAM, - valid_num * sizeof(T), sample_stride_3 * sizeof(T), 3); - } - __memcpy_async(weight_compute, weight_selected_base, valid_num * sizeof(T), - NRAM2NRAM, valid_num * sizeof(T), - sample_stride_3 * sizeof(T), 3); - __bang_transpose(input_trans, input_nram, ci, co); - __sync_move(); - - if (value_contain_infnan) { - __bang_cycle_band((char*)input_trans, (char*)input_trans, - (char*)cond_compute, co * ci * sizeof(T), - ci * sizeof(T)); - } - - if (w_contain_inf) { - __bang_cycle_mul(input_trans, input_trans, weight_compute, co * ci, ci); - __bang_sumpool(input_pooled, input_trans, valid_num, channels, 4, 1, 4, 1, - 1); - __bang_cycle_mul(input_pooled, input_pooled, weight_attn_nram, - channels * valid_num, valid_num); - __bang_sumpool(output_nram, input_pooled, 1, channels, valid_num, 1, - valid_num, 1, 1); - } else { - tileWeight2WramSync(input_wram, input_trans, co, ci, pad_co, pad_ci); - __bang_conv(output_nram, weight_compute, input_wram, ci, 1, 1, 1, 1, 1, 1, - co); - } - - } else { - __bang_write_value(output_nram, channels, (T)0); - } -} - -template -__mlu_func__ int32_t getReduceLevelByConvWramSize(const int32_t num_levels, - const int32_t num_points, - const int32_t channels) { - int32_t ci = 4 * num_levels * num_points; - int32_t pad_ci = PAD_UP(ci, 
WRAM_ALIGN_SIZE / sizeof(T));
-  int32_t co = channels;
-  int32_t pad_co = PAD_UP(co, LT_NUM);
-  return pad_co * pad_ci * sizeof(T);
-}
-
-__mlu_func__ void loadNram2Gpr(int32_t& v1, int32_t& v2, int32_t& v3,
-                               int32_t* p1, int32_t* p2, int32_t* p3) {
-  v1 = __load_nram(p1);
-  v2 = __load_nram(p2);
-  v3 = __load_nram(p3);
-}
-
-/*
-  Load the 4 bilinear neighbors with one 3D memcpy; only the offset of N1 plus
-  stride_2_1 and stride_3_1 are needed.
-      |<- stride_3_1 ->|
-      N1              N3
-      ^
-      |
-  stride_2_1
-      |
-      v
-      N2              N4
-
-  As a latency-hiding trick, the loop is folded by two: the next iteration's
-  offsets are loaded while the current copies are still in flight.
-*/
-template <typename T, mluMemcpyDirection_t DIR>
-__mlu_func__ void loadDataValueXram2NramAsync(
-    T* buf_value_nram_1, int32_t* offset_1, int32_t* stride_2_1,
-    int32_t* stride_3_1, T* value_src, const int32_t num_levels_points,
-    const int32_t channel_size, const int32_t value_stride_3_size) {
-  int32_t offset_1_a, stride_2_1_a, stride_3_1_a;
-  int32_t offset_1_b, stride_2_1_b, stride_3_1_b;
-  loadNram2Gpr(offset_1_a, stride_2_1_a, stride_3_1_a, offset_1, stride_2_1,
-               stride_3_1);
-  loadNram2Gpr(offset_1_b, stride_2_1_b, stride_3_1_b, offset_1 + 1,
-               stride_2_1 + 1, stride_3_1 + 1);
-  int32_t value_offset = 0;
-  int32_t next = 0;
-  int32_t loop_num = num_levels_points / 2;
-  int32_t remain = num_levels_points % 2;
-  int32_t data_value_stride = num_levels_points * channel_size;
-  for (int32_t j = 0; j < loop_num * 2; j += 2) {
-    value_offset = j * channel_size;
-    next = j + 2;
-    __memcpy_async((int8_t*)buf_value_nram_1 + value_offset,
-                   (int8_t*)value_src + offset_1_a, channel_size, DIR,
-                   2 * data_value_stride, 1, data_value_stride, 1, stride_3_1_a,
-                   1, stride_2_1_a, 1);
-
-    loadNram2Gpr(offset_1_a, stride_2_1_a, stride_3_1_a, offset_1 + next,
-                 stride_2_1 + next, stride_3_1 + next);
-
-    __memcpy_async((int8_t*)buf_value_nram_1 + value_offset + channel_size,
-                   (int8_t*)value_src + offset_1_b, channel_size, DIR,
-                   2 * data_value_stride, 1, data_value_stride, 1, stride_3_1_b,
-                   1, stride_2_1_b, 1);
-
-    loadNram2Gpr(offset_1_b, stride_2_1_b, stride_3_1_b, offset_1 + next + 1,
-                 stride_2_1 + next + 1, stride_3_1 + next + 1);
-  }
-
-  if (remain > 0) {
-    value_offset = loop_num * 2 * channel_size;
-    __memcpy_async((int8_t*)buf_value_nram_1 + value_offset,
-                   (int8_t*)value_src + offset_1_a, channel_size, DIR,
-                   2 * data_value_stride, 1, data_value_stride, 1, stride_3_1_a,
-                   1, stride_2_1_a, 1);
-  }
-}
-
-/*
-  Use matmul to count valid samples.
- sample_valid_count: (deal_n) - cond_point_valid_nram: (deal_n, num_levels, num_points) - nram_ones: (num_levels, num_points) -*/ -template -__mlu_func__ void countValidSamples(int32_t* sample_valid_count, - T* cond_point_valid_nram, T* nram_ones, - T* wram_buffer, int32_t num_levels, - int32_t num_points, int32_t deal_n) { - int32_t ci = num_levels * num_points; - int32_t pad_ci = PAD_UP(ci, WRAM_ALIGN_SIZE / sizeof(T)); - int32_t co = deal_n; - int32_t pad_co = PAD_UP(co, LT_NUM); - tileWeight2WramSync(wram_buffer, cond_point_valid_nram, co, ci, pad_co, - pad_ci); - __bang_conv((T*)sample_valid_count, nram_ones, wram_buffer, ci, 1, 1, 1, 1, 1, - 1, co); - __bang_float2int32(sample_valid_count, (T*)sample_valid_count, deal_n, 0); -} - -template -__mlu_func__ void loadNeighborPolationAttn( - T* value_output_nram, T* value_sram, T* value_gdram, - int32_t* data_offset_nram, T* weight_polation_nram, - T* cond_point_polation_nram, T* cond_point_valid_nram, T* weight_attn_nram, - T* buf_nram, T* compute_buf_nram, T* nram_ones, const int32_t deal_n, - const int32_t num_levels, const int32_t num_points, const int32_t num_keys, - const int32_t channels, const bool w_contain_inf, - const bool value_contain_infnan) { - int32_t channel_size = channels * sizeof(T); - int32_t sample_stride_3 = deal_n * num_levels * num_points; - int32_t value_stride_3 = num_levels * num_points * channels; - int32_t value_stride_3_size = value_stride_3 * sizeof(T); - T* buf_value_nram = buf_nram; // (4, num_levels, num_points, channels) - T* buf_value_nram_trans = - buf_nram + 4 * value_stride_3; // (4, num_levels, num_points, channels) - T* buf_value_nram_pool = - buf_nram + 8 * value_stride_3; // (1, num_levels, num_points, channels) - int32_t* sample_valid_count = - (int32_t*)(buf_nram + 9 * value_stride_3); // (deal_n) - T* weight_compute_nram = compute_buf_nram; // (4, num_levels, num_points) - int32_t* cond_compute_nram = - (int32_t*)(weight_compute_nram + 4 * num_levels * num_points); - - countValidSamples(sample_valid_count, cond_point_valid_nram, nram_ones, - (T*)wram_buffer, num_levels, num_points, deal_n); - __sync_compute(); - - int32_t* offset = data_offset_nram; - int32_t* stride_2_1 = offset + sample_stride_3; - int32_t* stride_3_1 = stride_2_1 + sample_stride_3; - T* output_nram = value_output_nram; - int32_t step_offset = 0; - T* value_src = SRAM_STAY ? 
value_sram : value_gdram; - for (int32_t i = 0; i < deal_n; i++) { - int32_t valid_num = sample_valid_count[i]; - if (SRAM_STAY) { - loadDataValueXram2NramAsync( - buf_value_nram, offset, stride_2_1, stride_3_1, value_src, valid_num, - channel_size, value_stride_3_size); - __sync_move(); - } else { - loadDataValueXram2NramAsync( - buf_value_nram, offset, stride_2_1, stride_3_1, value_src, valid_num, - channel_size, value_stride_3_size); - __sync_io(); - } - reduceLevelByConv( - output_nram, buf_value_nram, buf_value_nram_trans, buf_value_nram_pool, - (int32_t*)cond_point_polation_nram + step_offset, - weight_polation_nram + step_offset, weight_attn_nram + step_offset, - weight_compute_nram, cond_compute_nram, (T*)wram_buffer, valid_num, - channels, sample_stride_3, w_contain_inf, value_contain_infnan); - step_offset += valid_num; - offset = data_offset_nram + step_offset; - stride_2_1 = offset + sample_stride_3; - stride_3_1 = stride_2_1 + sample_stride_3; - output_nram += channels; - } -} - -template -__mlu_func__ void prepareLoop( - T* ones_nram, int32_t* spatial_offset_nram, int32_t* spatial_hw_nram, - int8_t* mask_x_nram, int8_t* mask_y_nram, T* spatial_offset_bd_nram, - T* spatial_h_bd_nram, T* spatial_w_bd_nram, T* value_sram, - const char* data_level_start_index_gdram, - const char* data_spatial_shapes_gdram, const int32_t num_keys, - const int32_t num_levels, const int32_t num_points, - const int32_t max_deal_n, const int32_t mask_size, const int32_t channels) { - int32_t pad_num_points_levels = - PAD_UP(num_levels * num_points, WRAM_ALIGN_SIZE / sizeof(T)); - __bang_write_value(ones_nram, pad_num_points_levels, (T)0); - __bang_write_value(ones_nram, num_levels * num_points, (T)1); - __bang_write_value(mask_x_nram, mask_size, (char)0x55); - __bang_write_value(mask_y_nram, mask_size, (char)0xAA); - __memcpy_async(spatial_offset_nram, data_level_start_index_gdram, - num_levels * sizeof(int32_t), GDRAM2NRAM); - __memcpy_async(spatial_hw_nram, data_spatial_shapes_gdram, - num_levels * 2 * sizeof(int32_t), GDRAM2NRAM); - __sync_io_move_compute(); - broadcastSpatialHW(spatial_offset_bd_nram, spatial_h_bd_nram, - spatial_w_bd_nram, spatial_hw_nram, spatial_offset_nram, - num_levels, num_points); -} - -template -__mlu_func__ void loadDataValueGdram2Sram(T* value_sram, T* data_value_gdram, - const int32_t batch_idx, - const int32_t head_idx, - const int32_t num_keys, - const int32_t num_heads, - const int32_t channels) { - int32_t loop_num = (num_keys + MAX_MEMCPY_SEGNUM - 1) / MAX_MEMCPY_SEGNUM; - int32_t num_heads_channels = num_heads * channels; - for (int32_t i = 0; i < loop_num; i++) { - int32_t load_num = - __mluop_min(MAX_MEMCPY_SEGNUM, num_keys - i * MAX_MEMCPY_SEGNUM); - size_t src_offset = ((size_t)batch_idx * num_keys + i * MAX_MEMCPY_SEGNUM) * - num_heads_channels + - head_idx * channels; - int32_t dst_offset = i * MAX_MEMCPY_SEGNUM * channels; - __memcpy(value_sram + dst_offset, (T*)data_value_gdram + src_offset, - channels * sizeof(T), GDRAM2SRAM, channels * sizeof(T), - num_heads_channels * sizeof(T), load_num - 1); - } -} - -/* - The shape of each tensor: - ones_nram: (num_levels, num_points) - buf_compute_nram: (8, num_levels, num_points) - spatial_offset_nram: (num_levels) - spatial_hw_nram: (num_levels, 2) - spatial_offset_bd_nram: (num_levels, num_points) - spatial_w_bd_nram: (num_levels, num_points) - spatial_h_bd_nram: (num_levels, num_points) - mask_x_nram: (deal_n, num_levels, num_points, 2) / 8 - mask_y_nram: (deal_n, num_levels, num_points, 2) / 8 - 
value_output_nram: (deal_n, channels) - data_offset_nram: (4, deal_n, num_levels, num_points) - weight_polation_nram: (4, deal_n, num_levels, num_points) - cond_point_polation_nram: (4, deal_n, num_levels, num_points) - cond_point_valid_nram: (deal_n, num_levels, num_points) - loc_nram: (deal_n, num_levels, num_points, 2) - weight_attn_nram: (deal_n, num_levels, num_points) - buf_nram: (6, deal_n, num_levels, num_points) - - Note: buf_nram is reused in polation computing. -*/ -template -__mlu_func__ void memPolicyCommon( - T*& buf_compute_nram, T*& ones_nram, T*& value_output_nram, - int32_t*& data_offset_nram, T*& weight_polation_nram, - T*& cond_point_polation_nram, T*& cond_point_valid_nram, T*& loc_nram, - T*& weight_attn_nram, T*& buf_nram, T*& buf_nram_end, int8_t*& mask_x_nram, - int8_t*& mask_y_nram, T*& spatial_offset_bd_nram, T*& spatial_w_bd_nram, - T*& spatial_h_bd_nram, int32_t*& spatial_offset_nram, - int32_t*& spatial_hw_nram, T*& value_sram, int32_t& max_deal_n, - int32_t& mask_size, const int32_t batch_size, const int32_t num_keys, - const int32_t num_heads, const int32_t channels, const int32_t num_levels, - const int32_t num_queries, const int32_t num_points) { - int32_t num_points_levels = num_levels * num_points; - int32_t pad_num_points_levels = - PAD_UP(num_points_levels, WRAM_ALIGN_SIZE / sizeof(T)); - int32_t pad_num_points_levels_8 = - PAD_UP(8 * num_points_levels, WRAM_ALIGN_SIZE / sizeof(T)); - int32_t spatial_info_size = - PAD_UP(3 * num_levels * sizeof(int32_t), NFU_ALIGN_SIZE); - int32_t fix_space_size = - spatial_info_size + 2 * BIT_COLLECT_PAD * sizeof(T) + - (4 * pad_num_points_levels + pad_num_points_levels_8) * sizeof(T); - int32_t left_space_size = NRAM_AVALIABLE_SIZE - fix_space_size; - int32_t common_buffer_size_each = 6 * num_points_levels * sizeof(T); - int32_t inter_result_size_each = - 17 * num_points_levels * sizeof(T) + channels * sizeof(T); - - max_deal_n = - left_space_size / (common_buffer_size_each + inter_result_size_each); - int32_t compute_buffer_size = - (9 * num_points_levels * channels + max_deal_n) * sizeof(T); - int32_t common_buffer_size = max_deal_n * common_buffer_size_each; - // make sure buf_nram is large enough for compute - if (compute_buffer_size > common_buffer_size) { - int32_t tmp_deal_n = - (left_space_size - compute_buffer_size) / inter_result_size_each; - max_deal_n = __mluop_min(max_deal_n, tmp_deal_n); - } - - int32_t reduce_need_wram_size = - getReduceLevelByConvWramSize(num_levels, num_points, channels); - int32_t count_valid_max = - PAD_DOWN(WRAM_AVALIABLE_SIZE / sizeof(T) / pad_num_points_levels, LT_NUM); - int32_t wram_deal_n = - (int)(reduce_need_wram_size <= WRAM_AVALIABLE_SIZE) * count_valid_max; - max_deal_n = __mluop_min(max_deal_n, wram_deal_n); - - int32_t total_points = max_deal_n * num_points_levels; - int32_t total_coord_pad = PAD_UP(total_points * 2, BIT_COLLECT_PAD); - mask_size = total_coord_pad / BIT_COLLECT_PAD; - ones_nram = (T*)nram_buffer; - buf_compute_nram = ones_nram + pad_num_points_levels; - spatial_offset_nram = (int32_t*)(buf_compute_nram + pad_num_points_levels_8); - spatial_hw_nram = spatial_offset_nram + num_levels; - spatial_offset_bd_nram = (T*)(spatial_hw_nram + num_levels * 2); - spatial_w_bd_nram = spatial_offset_bd_nram + num_points_levels; - spatial_h_bd_nram = spatial_w_bd_nram + num_points_levels; - mask_x_nram = (int8_t*)(spatial_h_bd_nram + num_points_levels); - mask_y_nram = mask_x_nram + mask_size; - value_output_nram = (T*)(mask_y_nram + mask_size); - data_offset_nram = 
(int32_t*)(value_output_nram + max_deal_n * channels); - weight_polation_nram = (T*)(data_offset_nram + 4 * total_points); - cond_point_polation_nram = weight_polation_nram + 4 * total_points; - cond_point_valid_nram = cond_point_polation_nram + 4 * total_points; - loc_nram = cond_point_valid_nram + total_points; - weight_attn_nram = loc_nram + total_coord_pad; - buf_nram = weight_attn_nram + total_points; - buf_nram_end = buf_nram + 6 * max_deal_n * num_points_levels; - value_sram = (T*)sram_buffer; -} - -template -__mlu_func__ void MLUKernelMsDeformAttnForwardFastImpl( - const char* data_value_gdram, const char* data_spatial_shapes_gdram, - const char* data_level_start_index_gdram, - const char* data_sampling_loc_gdram, const char* data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char* data_col_gdram) { - int32_t input_stride_4 = num_queries * num_heads * num_levels * num_points; - int32_t input_stride_3 = num_heads * num_levels * num_points; - int32_t input_stride_2 = num_levels * num_points; - int32_t output_stride_3 = num_queries * num_heads * channels; - int32_t output_stride_2 = num_heads * channels; - int32_t data_value_stride_3 = num_keys * num_heads * channels; - constexpr bool sram_stay = (POLICY == 0); - - T* value_output_nram = nullptr; // (deal_n, channels) - int32_t* data_offset_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* weight_polation_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* cond_point_polation_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* cond_point_valid_nram = nullptr; // (deal_n, num_levels, num_points) - T* loc_nram = nullptr; // (deal_n, num_levels, num_points, 2) - T* weight_attn_nram = nullptr; // (deal_n, num_levels, num_points) - T* buf_nram = nullptr; // (6, deal_n, num_levels, num_points) - T* buf_nram_end = nullptr; - int8_t* mask_x_nram = nullptr; // (deal_n, num_levels, num_points, 2) / 8 - int8_t* mask_y_nram = nullptr; // (deal_n, num_levels, num_points, 2) / 8 - T* spatial_offset_bd_nram = nullptr; // (num_levels, num_points) - T* spatial_w_bd_nram = nullptr; // (num_levels, num_points) - T* spatial_h_bd_nram = nullptr; // (num_levels, num_points) - int32_t* spatial_offset_nram = nullptr; // (num_levels) - int32_t* spatial_hw_nram = nullptr; // (num_levels, 2) - T* buf_compute_nram = nullptr; // (8, num_levels, num_points) - T* ones_nram = nullptr; // (1, num_levels, num_points) - T* value_sram = nullptr; // (num_keys, channels) - int32_t max_deal_n = 0; - int32_t mask_size = 0; - memPolicyCommon(buf_compute_nram, ones_nram, value_output_nram, - data_offset_nram, weight_polation_nram, - cond_point_polation_nram, cond_point_valid_nram, loc_nram, - weight_attn_nram, buf_nram, buf_nram_end, mask_x_nram, - mask_y_nram, spatial_offset_bd_nram, spatial_w_bd_nram, - spatial_h_bd_nram, spatial_offset_nram, spatial_hw_nram, - value_sram, max_deal_n, mask_size, batch_size, num_keys, - num_heads, channels, num_levels, num_queries, num_points); - if (max_deal_n <= 0) { - return; - } - - // split batch*head into taskDimY - int32_t batch_head = batch_size * num_heads; - int32_t cluster_avg_batch_head = (batch_head + taskDimY - 1) / taskDimY; - int32_t cluster_begin_batch_head = taskIdY * cluster_avg_batch_head; - int32_t cluster_act_batch_head = __mluop_min( - cluster_avg_batch_head, batch_head - cluster_begin_batch_head); - int32_t cluster_end_batch_head = - 
cluster_begin_batch_head + cluster_act_batch_head; - // split query into coreDim - int32_t core_avg_query = (num_queries + coreDim - 1) / coreDim; - int32_t core_begin_query = coreId * core_avg_query; - int32_t core_act_query = - __mluop_min(num_queries - core_begin_query, core_avg_query); - int32_t core_loop_num = (core_act_query + max_deal_n - 1) / max_deal_n; - int32_t core_step_query = - core_loop_num > 0 ? (core_act_query + core_loop_num - 1) / core_loop_num - : 0; - int32_t core_remain_query = - core_act_query - (core_loop_num - 1) * core_step_query; - int32_t first_deal_query = - (int)(core_loop_num > 0) * - (core_loop_num > 1 ? core_step_query : core_remain_query); - - prepareLoop(ones_nram, spatial_offset_nram, spatial_hw_nram, mask_x_nram, - mask_y_nram, spatial_offset_bd_nram, spatial_h_bd_nram, - spatial_w_bd_nram, value_sram, data_level_start_index_gdram, - data_spatial_shapes_gdram, num_keys, num_levels, num_points, - max_deal_n, mask_size, channels); - - for (int32_t bh_idx = cluster_begin_batch_head; - bh_idx < cluster_end_batch_head; bh_idx++) { - int32_t b = bh_idx / num_heads; - int32_t head_idx = bh_idx % num_heads; - bool w_contain_inf = false; - bool value_contain_infnan = true; - - size_t output_base_offset = - (size_t)b * output_stride_3 + head_idx * channels; - int32_t attn_weight_base_offset = - b * input_stride_4 + head_idx * input_stride_2; - - if (sram_stay && __is_mpu()) { - loadDataValueGdram2Sram(value_sram, (T*)data_value_gdram, b, head_idx, - num_keys, num_heads, channels); - } - __sync_cluster(); - - if (__is_ipu()) { - if (sram_stay) { - int32_t buf_size = - (int)((char*)buf_nram_end - (char*)value_output_nram); - isValueContainInfNan(value_sram, value_sram + num_keys * channels, - value_output_nram, value_contain_infnan, buf_size, - num_keys * channels); - } - // compute weight, offset and condition - int32_t attn_weight_offset = - attn_weight_base_offset + core_begin_query * input_stride_3; - int32_t loc_offset = attn_weight_offset * 2; - if (first_deal_query > 0) { - __memcpy(loc_nram, (T*)data_sampling_loc_gdram + loc_offset, - input_stride_2 * 2 * sizeof(T), GDRAM2NRAM, - input_stride_2 * 2 * sizeof(T), input_stride_3 * 2 * sizeof(T), - first_deal_query - 1); - __memcpy( - weight_attn_nram, (T*)data_attn_weight_gdram + attn_weight_offset, - input_stride_2 * sizeof(T), GDRAM2NRAM, input_stride_2 * sizeof(T), - input_stride_3 * sizeof(T), first_deal_query - 1); - getConditionCoordWeight( - data_offset_nram, weight_polation_nram, cond_point_polation_nram, - cond_point_valid_nram, loc_nram, weight_attn_nram, mask_x_nram, - mask_y_nram, spatial_offset_bd_nram, spatial_w_bd_nram, - spatial_h_bd_nram, buf_nram, w_contain_inf, value_contain_infnan, - first_deal_query, num_levels, num_points, num_heads, channels); - } - } - - for (int32_t i = 0; __is_ipu() && i < core_loop_num; i++) { - int32_t deal_n = - i < core_loop_num - 1 ? core_step_query : core_remain_query; - int32_t load_n = - i < core_loop_num - 2 ? 
core_step_query : core_remain_query; - // load value and polation - loadNeighborPolationAttn( - value_output_nram, value_sram, - (T*)data_value_gdram + b * data_value_stride_3 + head_idx * channels, - data_offset_nram, weight_polation_nram, cond_point_polation_nram, - cond_point_valid_nram, weight_attn_nram, buf_nram, buf_compute_nram, - ones_nram, deal_n, num_levels, num_points, num_keys, channels, - w_contain_inf, value_contain_infnan); - __sync_io_move_compute(); - // load next weight and loc - if (i < core_loop_num - 1) { - int32_t core_query_offset = (i + 1) * core_step_query; - int32_t attn_weight_offset = - attn_weight_base_offset + - (core_begin_query + core_query_offset) * input_stride_3; - int32_t loc_offset = attn_weight_offset * 2; - __memcpy_async(loc_nram, (T*)data_sampling_loc_gdram + loc_offset, - input_stride_2 * 2 * sizeof(T), GDRAM2NRAM, - input_stride_2 * 2 * sizeof(T), - input_stride_3 * 2 * sizeof(T), load_n - 1); - __memcpy_async( - weight_attn_nram, (T*)data_attn_weight_gdram + attn_weight_offset, - input_stride_2 * sizeof(T), GDRAM2NRAM, input_stride_2 * sizeof(T), - input_stride_3 * sizeof(T), load_n - 1); - __sync_io_move_compute(); - } - // store result - size_t output_offset = - ((size_t)core_begin_query + i * core_step_query) * output_stride_2; - __memcpy_async((T*)data_col_gdram + output_base_offset + output_offset, - value_output_nram, channels * sizeof(T), NRAM2GDRAM, - output_stride_2 * sizeof(T), channels * sizeof(T), - deal_n - 1); - - // compute cond/weight/offset - if (i < core_loop_num - 1) { - getConditionCoordWeight( - data_offset_nram, weight_polation_nram, cond_point_polation_nram, - cond_point_valid_nram, loc_nram, weight_attn_nram, mask_x_nram, - mask_y_nram, spatial_offset_bd_nram, spatial_w_bd_nram, - spatial_h_bd_nram, buf_nram, w_contain_inf, value_contain_infnan, - load_n, num_levels, num_points, num_heads, channels); - } - __sync_io_move_compute(); - } - __sync_cluster(); - } -} -#endif - -#if (__BANG_ARCH__ == 592) - -/* - The shape of each tensor on nram: - spatial_offset_nram: (num_levels) - spatial_hw_nram: (num_levels, 2) - spatial_offset_bd_nram: (num_levels, num_points) - spatial_w_bd_nram: (num_levels, num_points) - spatial_h_bd_nram: (num_levels, num_points) - mask_x_nram: (deal_n, num_levels, num_points, 2) / 8 - mask_y_nram: (deal_n, num_levels, num_points, 2) / 8 - data_offset_nram: (4, deal_n, num_levels, num_points) - weight_polation_nram: (4, deal_n, num_levels, num_points) - cond_point_polation_nram: (4, deal_n, num_levels, num_points) - cond_point_valid_nram: (deal_n, num_levels, num_points) - loc_nram: (deal_n, num_levels, num_points, 2) - buf_nram: (6, deal_n, num_levels, num_points) - - The shape of each tensor on sram: - data_offset_sram: (4, cached_n, num_levels, num_points) - weight_polation_sram: (4, cached_n, num_levels, num_points) - weight_attn_sram: (cached_n, num_levels, num_points) - cond_point_polation_sram: (4, cached_n, num_levels, num_points) -*/ -template <typename T> -__mlu_func__ void memPolicy590( - T*& zeros_nram, int32_t*& data_offset_nram, T*& weight_polation_nram, - T*& cond_point_polation_nram, T*& cond_point_valid_nram, T*& loc_nram, - T*& buf_nram, T*& buf_nram_end, int8_t*& mask_x_nram, int8_t*& mask_y_nram, - T*& spatial_offset_bd_nram, T*& spatial_w_bd_nram, T*& spatial_h_bd_nram, - int32_t*& spatial_offset_nram, int32_t*& spatial_hw_nram, T*& value_ping, - T*& value_pong, T*& compute_buffer, T*& weight_polation_nram_stg2, - T*& weight_attn_nram_stg2, int32_t*& offset_nram_stg2, T*& output_nram, - T*&
cond_nram_stg2, int32_t*& data_offset_sram, T*& weight_polation_sram, - T*& weight_attn_sram, T*& cond_point_polation_sram, char* nram_buffer, - char* sram_buffer, int32_t& max_cached_n, int32_t& stage_1_max_deal_n, - int32_t& stage_2_max_deal_n, int32_t& mask_size, - const int32_t nram_available_size, const int32_t sram_available_size, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points) { - int32_t num_points_levels = num_levels * num_points; - int32_t spatial_info_size = - PAD_UP(3 * num_levels * sizeof(int32_t), WRAM_ALIGN_SIZE); - int32_t spatial_info_bd_size = - PAD_UP(3 * num_points_levels * sizeof(T), WRAM_ALIGN_SIZE); - int32_t zeros_size = PAD_UP(channels * sizeof(T), WRAM_ALIGN_SIZE); - int32_t fix_space_size = spatial_info_size + 2 * BIT_COLLECT_PAD * sizeof(T) + - spatial_info_bd_size + zeros_size; - int32_t left_space_size = nram_available_size - fix_space_size; - stage_1_max_deal_n = left_space_size / (20 * num_points_levels * sizeof(T)); - int32_t total_points = stage_1_max_deal_n * num_points_levels; - int32_t total_coord_pad = PAD_UP(total_points * 2, BIT_COLLECT_PAD); - mask_size = PAD_UP(total_coord_pad / BIT_COLLECT_PAD, WRAM_ALIGN_SIZE); - stage_2_max_deal_n = - (left_space_size - 2 * mask_size) / - ((12 * num_points_levels * channels + 17 * num_points_levels) * - sizeof(T)); - // fix nram space - zeros_nram = (T*)(nram_buffer); - spatial_offset_nram = (int32_t*)(zeros_nram + zeros_size / sizeof(T)); - spatial_hw_nram = spatial_offset_nram + num_levels; - spatial_offset_bd_nram = - (T*)((int8_t*)spatial_offset_nram + spatial_info_size); - spatial_w_bd_nram = spatial_offset_bd_nram + num_points_levels; - spatial_h_bd_nram = spatial_w_bd_nram + num_points_levels; - mask_x_nram = (int8_t*)spatial_offset_bd_nram + spatial_info_bd_size; - mask_y_nram = mask_x_nram + mask_size; - // stage1 nram space - // 4 + 4 + 4 + 1 + 6 - data_offset_nram = (int32_t*)(mask_y_nram + mask_size); - weight_polation_nram = (T*)(data_offset_nram + 4 * total_points); - cond_point_polation_nram = weight_polation_nram + 4 * total_points; - cond_point_valid_nram = cond_point_polation_nram + 4 * total_points; - buf_nram = cond_point_valid_nram + total_points; - loc_nram = buf_nram + 4 * total_points; - buf_nram_end = buf_nram + 6 * total_points + total_coord_pad; - // stage2 nram space - int32_t total_points_stg2 = stage_2_max_deal_n * num_points_levels; - cond_nram_stg2 = (T*)(mask_y_nram + mask_size); - value_ping = cond_nram_stg2 + 4 * total_points_stg2 + BIT_COLLECT_PAD; - value_pong = value_ping + 4 * total_points_stg2 * channels; - compute_buffer = value_pong + 4 * total_points_stg2 * channels; - weight_polation_nram_stg2 = compute_buffer + 4 * total_points_stg2 * channels; - weight_attn_nram_stg2 = weight_polation_nram_stg2 + 4 * total_points_stg2; - offset_nram_stg2 = (int32_t*)(weight_attn_nram_stg2 + total_points_stg2); - // sram space: 4 + 4 + 1 + 4 - int32_t polation_info_size = 13 * num_points_levels * sizeof(T); - int32_t avg_sram_size = sram_available_size / coreDim; - max_cached_n = avg_sram_size / polation_info_size; - int max_cached_points = max_cached_n * num_points_levels; - T* sram_buf_base = (T*)(sram_buffer + avg_sram_size * coreId); - data_offset_sram = (int32_t*)sram_buf_base; - weight_polation_sram = (T*)(data_offset_sram + 4 * max_cached_points); - weight_attn_sram = (T*)(weight_polation_sram + 4 * max_cached_points); -
cond_point_polation_sram = (T*)(weight_attn_sram + max_cached_points); -} - -template <typename T> -__mlu_func__ void forwardStageTwoLoop( - T* value_ping_nram, T* value_pong_nram, T* compute_buffer_nram, - T* zeros_nram, T* weight_polation_nram_stg2, T* weight_attn_nram_stg2, - int32_t* offset_nram_stg2, T* output_nram, T* cond_nram_stg2, - int32_t* data_offset_sram, T* weight_polation_sram, T* weight_attn_sram, - T* cond_point_polation_sram, T* data_value_gdram, T* weight_attn_gdram, - T* output_gdram, const int32_t total_deal_n, const int32_t max_deal_n, - const int32_t input_stride_2, const int32_t input_stride_3, - const int32_t output_stride_2, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, - const int32_t num_points) { - int32_t loop_num = (total_deal_n + max_deal_n - 1) / max_deal_n; - int32_t num_levels_points = num_levels * num_points; - int32_t sram_src_stride = total_deal_n * num_levels_points * sizeof(T); - T* value_nram[2] = {value_ping_nram, value_pong_nram}; - int32_t* offset_zero_nram_stg2 = - offset_nram_stg2 + 4 * max_deal_n * num_levels_points; - for (int32_t i = 0; i < loop_num + 1; i++) { - int32_t compute_idx = i - 1; - int32_t compute_offset = compute_idx * max_deal_n; - int32_t load_n = std::min(total_deal_n - i * max_deal_n, max_deal_n); - int32_t compute_n = - std::min(total_deal_n - compute_idx * max_deal_n, max_deal_n); - int32_t load_point_num = 4 * load_n * num_levels_points; - int32_t nq_nlp_4 = compute_n * num_levels_points * 4; - int32_t nq_nlp = compute_n * num_levels_points; - - int32_t total_point_pad_8 = PAD_UP(load_point_num, BIT_COLLECT_PAD); - int32_t gather_mask_size = total_point_pad_8 / BIT_COLLECT_PAD; - T* v_compute = value_nram[compute_idx % 2]; - T* v_load = value_nram[i % 2]; - int8_t* cond_nram_stg2_reverse = (int8_t*)cond_nram_stg2 + gather_mask_size; - - if (i > 0) { - int32_t copy_size_1 = compute_n * num_levels_points * sizeof(T); - int32_t sram_src_offset = compute_idx * max_deal_n * num_levels_points; - __memcpy_async(weight_polation_nram_stg2, - weight_polation_sram + sram_src_offset, copy_size_1, - SRAM2NRAM, copy_size_1, sram_src_stride, 3); - __memcpy_async(weight_attn_nram_stg2, weight_attn_sram + sram_src_offset, - copy_size_1, SRAM2NRAM); - } - - if (i < loop_num) { - int32_t copy_size_1 = load_n * num_levels_points * sizeof(T); - int32_t copy_size_2 = load_n * num_levels_points * sizeof(int32_t); - int32_t sram_src_offset = i * max_deal_n * num_levels_points; - __memcpy_async(offset_nram_stg2, data_offset_sram + sram_src_offset, - copy_size_2, SRAM2NRAM, copy_size_2, sram_src_stride, 3); - __memcpy_async(cond_nram_stg2, cond_point_polation_sram + sram_src_offset, - copy_size_1, SRAM2NRAM, copy_size_1, sram_src_stride, 3); - __bang_write_value(compute_buffer_nram, load_point_num, (T)0); - __bang_write_value(offset_zero_nram_stg2, load_point_num, (int32_t)0); - __sync_move(); - __bang_gt_bitindex(cond_nram_stg2, cond_nram_stg2, compute_buffer_nram, - total_point_pad_8); - __bang_bnot((char*)cond_nram_stg2_reverse, (char*)cond_nram_stg2, - gather_mask_size); - } - - __sync_io_move_compute(); - - if (i < loop_num) { - gatherAsync(v_load, zeros_nram, (unsigned int*)offset_zero_nram_stg2, - cond_nram_stg2_reverse, channels * sizeof(T), NRAM2NRAM, - channels * sizeof(T), load_point_num); - gatherAsync(v_load, data_value_gdram, (unsigned int*)offset_nram_stg2, - cond_nram_stg2, channels * sizeof(T), GDRAM2NRAM, - channels * sizeof(T), load_point_num); - } - - if (i > 0) { - __bang_transpose(compute_buffer_nram, 
v_compute, nq_nlp_4, channels); - __bang_cycle_mul(compute_buffer_nram, compute_buffer_nram, - weight_polation_nram_stg2, channels * nq_nlp_4, - nq_nlp_4); - __bang_sumpool(v_compute, compute_buffer_nram, nq_nlp, channels, 4, 1, 4, - 1, 1); - __bang_cycle_mul(v_compute, v_compute, weight_attn_nram_stg2, - channels * nq_nlp, nq_nlp); - __bang_transpose(compute_buffer_nram, v_compute, channels, nq_nlp); - __bang_sumpool(v_compute, compute_buffer_nram, channels, compute_n, - num_levels_points, 1, num_levels_points, 1, 1); - __memcpy(output_gdram + compute_offset * output_stride_2, v_compute, - channels * sizeof(T), NRAM2GDRAM, output_stride_2 * sizeof(T), - channels * sizeof(T), compute_n - 1); - } - __sync_io_move_compute(); - } -} - -// only for 590 -template <typename T> -__mlu_func__ void MLUKernelMsDeformAttnForwardFastImpl( - const char* data_value_gdram, const char* data_spatial_shapes_gdram, - const char* data_level_start_index_gdram, - const char* data_sampling_loc_gdram, const char* data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char* data_col_gdram) { - int32_t input_stride_4 = num_queries * num_heads * num_levels * num_points; - int32_t input_stride_3 = num_heads * num_levels * num_points; - int32_t input_stride_2 = num_levels * num_points; - int32_t output_stride_3 = num_queries * num_heads * channels; - int32_t output_stride_2 = num_heads * channels; - int32_t data_value_stride_3 = num_keys * num_heads * channels; - - T* zeros_nram = nullptr; // (channels) - int32_t* data_offset_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* weight_polation_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* cond_point_polation_nram = nullptr; // (4, deal_n, num_levels, num_points) - T* cond_point_valid_nram = nullptr; // (deal_n, num_levels, num_points) - T* loc_nram = nullptr; // (deal_n, num_levels, num_points, 2) - T* buf_nram = nullptr; // (6, deal_n, num_levels, num_points) - T* buf_nram_end = nullptr; - int8_t* mask_x_nram = nullptr; // (deal_n, num_levels, num_points, 2) / 8 - int8_t* mask_y_nram = nullptr; // (deal_n, num_levels, num_points, 2) / 8 - T* spatial_offset_bd_nram = nullptr; // (num_levels, num_points) - T* spatial_w_bd_nram = nullptr; // (num_levels, num_points) - T* spatial_h_bd_nram = nullptr; // (num_levels, num_points) - int32_t* spatial_offset_nram = nullptr; // (num_levels) - int32_t* spatial_hw_nram = nullptr; // (num_levels, 2) - T* value_ping_nram = nullptr; // (deal_n, num_levels, num_points, channels) - T* value_pong_nram = nullptr; // (deal_n, num_levels, num_points, channels) - T* compute_buffer_nram = - nullptr; // (deal_n, num_levels, num_points, channels) - T* weight_polation_nram_stg2 = - nullptr; // (4, deal_n, num_levels, num_points) - T* weight_attn_nram_stg2 = nullptr; // (1, deal_n, num_levels, num_points) - int32_t* offset_nram_stg2 = nullptr; // (4, deal_n, num_levels, num_points) - T* output_nram = nullptr; // (deal_n, channels) - T* cond_nram_stg2 = nullptr; // (4, deal_n, num_levels, num_points) - T* value_sram = nullptr; // (num_keys, channels) - int32_t* data_offset_sram = nullptr; - T* weight_polation_sram = nullptr; - T* weight_attn_sram = nullptr; - T* cond_point_polation_sram = nullptr; - int32_t stage_1_max_deal_n = 0; - int32_t stage_2_max_deal_n = 0; - int32_t max_cached_n = 0; - int32_t mask_size = 0; - memPolicy590( - zeros_nram, data_offset_nram, weight_polation_nram, -
cond_point_polation_nram, cond_point_valid_nram, loc_nram, buf_nram, - buf_nram_end, mask_x_nram, mask_y_nram, spatial_offset_bd_nram, - spatial_w_bd_nram, spatial_h_bd_nram, spatial_offset_nram, - spatial_hw_nram, value_ping_nram, value_pong_nram, compute_buffer_nram, - weight_polation_nram_stg2, weight_attn_nram_stg2, offset_nram_stg2, - output_nram, cond_nram_stg2, data_offset_sram, weight_polation_sram, - weight_attn_sram, cond_point_polation_sram, nram_buffer, sram_buffer, - max_cached_n, stage_1_max_deal_n, stage_2_max_deal_n, mask_size, - NRAM_AVALIABLE_SIZE, SRAM_AVALIABLE_SIZE, batch_size, num_keys, num_heads, - channels, num_levels, num_queries, num_points); - if (stage_1_max_deal_n <= 0 || stage_2_max_deal_n <= 0) { - return; - } - - int32_t cluster_begin_batch_head = 0; - int32_t cluster_act_batch_head = 0; - int32_t cluster_end_batch_head = 0; - int32_t core_begin_query = 0; - int32_t core_act_query = 0; - int32_t core_loop_num = 0; - int32_t core_step_query = 0; - splitTaskV2(cluster_begin_batch_head, cluster_act_batch_head, - cluster_end_batch_head, core_begin_query, core_act_query, - core_loop_num, core_step_query, max_cached_n, batch_size, - num_keys, num_heads, channels, num_levels, num_queries, - num_points); - - prepareLoopV2((int32_t*)nullptr, zeros_nram, spatial_offset_nram, - spatial_hw_nram, mask_x_nram, mask_y_nram, - spatial_offset_bd_nram, spatial_h_bd_nram, spatial_w_bd_nram, - value_sram, data_level_start_index_gdram, - data_spatial_shapes_gdram, num_keys, num_levels, num_points, - stage_1_max_deal_n, mask_size, channels); - - for (int32_t bh_idx = cluster_begin_batch_head; - bh_idx < cluster_end_batch_head; bh_idx++) { - int32_t b = bh_idx / num_heads; - int32_t head_idx = bh_idx % num_heads; - size_t output_base_offset = - (size_t)b * output_stride_3 + head_idx * channels; - size_t attn_weight_base_offset = - (size_t)b * input_stride_4 + head_idx * input_stride_2; - size_t data_value_base_offset = - (size_t)b * data_value_stride_3 + head_idx * channels; - - for (int32_t i = 0; __is_ipu() && i < core_loop_num; i++) { - int32_t deal_n = - std::min(core_act_query - core_step_query * i, core_step_query); - int32_t core_query_offset = i * core_step_query; - size_t attn_weight_offset = - attn_weight_base_offset + - (core_begin_query + core_query_offset) * input_stride_3; - size_t loc_offset = attn_weight_offset * 2; - size_t output_offset = - output_base_offset + - (core_begin_query + i * core_step_query) * output_stride_2; - - // compute offset/cond/wp - stageOneLoop((T*)data_sampling_loc_gdram + loc_offset, - (T*)data_attn_weight_gdram + attn_weight_offset, - data_offset_nram, nullptr, weight_polation_nram, - cond_point_polation_nram, cond_point_valid_nram, loc_nram, - buf_nram, buf_nram_end, mask_x_nram, mask_y_nram, - spatial_offset_bd_nram, spatial_w_bd_nram, spatial_h_bd_nram, - spatial_offset_nram, spatial_hw_nram, data_offset_sram, - nullptr, weight_polation_sram, weight_attn_sram, - cond_point_polation_sram, false, false, deal_n, - stage_1_max_deal_n, num_heads, channels, num_levels, - num_points, input_stride_2, input_stride_3); - - // compute and store output - forwardStageTwoLoop( - value_ping_nram, value_pong_nram, compute_buffer_nram, zeros_nram, - weight_polation_nram_stg2, weight_attn_nram_stg2, offset_nram_stg2, - output_nram, cond_nram_stg2, data_offset_sram, weight_polation_sram, - weight_attn_sram, cond_point_polation_sram, - (T*)data_value_gdram + data_value_base_offset, - (T*)data_attn_weight_gdram + attn_weight_offset, - (T*)data_col_gdram + 
output_offset, deal_n, stage_2_max_deal_n, - input_stride_2, input_stride_3, output_stride_2, num_heads, channels, - num_levels, num_points); - } - } -} - -#endif - -template <typename T> -__mlu_global__ void MLUKernelMsDeformAttnForwardFast( - const char* data_value_gdram, const char* data_spatial_shapes_gdram, - const char* data_level_start_index_gdram, - const char* data_sampling_loc_gdram, const char* data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char* data_col_gdram) { -#if (__BANG_ARCH__ == 372) - size_t single_value_size = num_keys * channels * sizeof(T); - if (single_value_size <= SRAM_FOR_VALUE_SIZE) { - MLUKernelMsDeformAttnForwardFastImpl<T, 0>( - data_value_gdram, data_spatial_shapes_gdram, - data_level_start_index_gdram, data_sampling_loc_gdram, - data_attn_weight_gdram, batch_size, num_keys, num_heads, channels, - num_levels, num_queries, num_points, data_col_gdram); - } else { - MLUKernelMsDeformAttnForwardFastImpl<T, 1>( - data_value_gdram, data_spatial_shapes_gdram, - data_level_start_index_gdram, data_sampling_loc_gdram, - data_attn_weight_gdram, batch_size, num_keys, num_heads, channels, - num_levels, num_queries, num_points, data_col_gdram); - } -#endif - -#if (__BANG_ARCH__ == 592) - MLUKernelMsDeformAttnForwardFastImpl<T>( - data_value_gdram, data_spatial_shapes_gdram, data_level_start_index_gdram, - data_sampling_loc_gdram, data_attn_weight_gdram, batch_size, num_keys, - num_heads, channels, num_levels, num_queries, num_points, data_col_gdram); -#endif -} - -template __mlu_global__ void MLUKernelMsDeformAttnForwardFast<float>( - const char* data_value_gdram, const char* data_spatial_shapes_gdram, - const char* data_level_start_index_gdram, - const char* data_sampling_loc_gdram, const char* data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char* data_col_gdram); diff --git a/kernels/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu b/kernels/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu deleted file mode 100644 index 398fe9679..000000000 --- a/kernels/ms_deform_attn_forward/msda_forward_small_channel_union1.mlu +++ /dev/null @@ -1,557 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include - -#include "kernels/ms_deform_attn_forward/ms_deform_attn_forward.h" - -#define ELE_COUNT 32 /* cycle element count */ - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -__mlu_func__ void genMask0101(float *mask_ram, int32_t size) { -#if __BANG_ARCH__ >= 372 - int32_t align_num = NFU_ALIGN_SIZE / sizeof(float); - for (int32_t i = 0; i < align_num; ++i) { - mask_ram[i] = i % 2; - } - __sync(); - // NOTE: when channel is 1, mask_ram may be overwritten, since we - // align size to CEIL_ALIGN(size, align_num) - __memcpy(mask_ram + align_num, mask_ram, NFU_ALIGN_SIZE, NRAM2NRAM, - NFU_ALIGN_SIZE, 0, (size / align_num + (size % align_num > 0)) - 2); - __sync(); -#endif -} - -template <typename T> -__mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char *data_col_gdram) { -#if __BANG_ARCH__ >= 372 - if (__is_mpu()) { - return; - } - size_t block_num_per_core = 0, batch_start = 0, deal_g = 0, offset_g = 0; - size_t block_num_rem = 0; - const size_t grid_total = num_queries * num_heads * num_levels * num_points; - if (batch_size >= taskDim) { - block_num_rem = batch_size % taskDim; - block_num_per_core = taskId < block_num_rem ? batch_size / taskDim + 1 - : batch_size / taskDim; - batch_start = taskId < block_num_rem - ? taskId * block_num_per_core - : taskId * block_num_per_core + block_num_rem; - deal_g = grid_total; - offset_g = 0; - } else { - size_t skip_n = taskDim / batch_size; - batch_start = taskId / skip_n; - block_num_per_core = batch_start >= batch_size ? 0 : 1; - deal_g = PAD_UP(grid_total / skip_n, num_levels * num_points); - size_t id = taskId % skip_n; - offset_g = id * deal_g; - deal_g = offset_g > grid_total ? 0 - : ((id + 1) * deal_g > grid_total - ? 
deal_g = grid_total - offset_g - : deal_g); - } - if (deal_g == 0) { - return; - } - const int32_t float_align = NFU_ALIGN_SIZE / sizeof(float); - int32_t deal_num = 1; - int32_t cut_channel_iter = 2; - const size_t spatial_size = - PAD_UP(num_levels * 2 * sizeof(int32_t), NFU_ALIGN_SIZE); - const size_t level_start_index_size = - PAD_UP(num_levels * sizeof(int32_t), NFU_ALIGN_SIZE); - int32_t channel = channels; - int32_t mult; - while (true) { - deal_num = (MAX_NRAM_SIZE - spatial_size - level_start_index_size) / - (8 * channel + 8) / sizeof(T); - deal_num = PAD_DOWN(deal_num, float_align); - deal_num = PAD_DOWN(deal_num, num_levels * num_points); - if (deal_num > 0) { - break; - } else { - channel = channels / cut_channel_iter; - cut_channel_iter += 2; - } - } - mult = channel; - const int32_t c_rep = channels / channel; - const int32_t c_rem = channels % channel; - const int32_t g_rep = deal_g / deal_num; - const int32_t g_rem = deal_g % deal_num; - // nram buffer alloc - char *data_spatial_shapes_nram = nram_buffer; - char *data_level_start_index_nram = data_spatial_shapes_nram + spatial_size; - char *input_tl = data_level_start_index_nram + level_start_index_size; - char *input_tr = input_tl + deal_num * mult * sizeof(T); - char *input_bl = input_tr + deal_num * mult * sizeof(T); - char *input_br = input_bl + deal_num * mult * sizeof(T); - char *weight_tl = input_tl + 4 * deal_num * mult * sizeof(T); - char *weight_tr = weight_tl + deal_num * mult * sizeof(T); - char *weight_bl = weight_tr + deal_num * mult * sizeof(T); - char *weight_br = weight_bl + deal_num * mult * sizeof(T); - char *mask_tl = weight_br + deal_num * mult * sizeof(T); - char *mask_tr = mask_tl + deal_num * sizeof(T); - char *mask_bl = mask_tr + deal_num * sizeof(T); - char *mask_br = mask_bl + deal_num * sizeof(T); - char *point_ram = mask_br + deal_num * sizeof(T); - char *index_tl = point_ram + deal_num * sizeof(T); - char *index_bl = index_tl + deal_num * sizeof(T); - char *valid_mask = index_bl + deal_num * sizeof(T); - // nram space reuse - char *grid_ram = weight_tl; - char *mask_ram = weight_bl; - char *coord_x = input_bl; - char *coord_y = coord_x + deal_num * sizeof(T); - char *coord_x_low = input_tl; - char *coord_y_low = coord_x_low + deal_num * sizeof(T); - char *coord_x_low_int = weight_tl; - char *coord_y_low_int = weight_tr; - char *spatial_x = mask_tl; - char *spatial_y = mask_tr; - char *spatial_x_float = weight_bl; - char *spatial_y_float = weight_br; - char *spatial_x_temp = mask_bl; - char *spatial_y_temp = mask_br; -#if MS_DEFORM_ATTN_FORWARD_HEADVECTOR - char *base_ptr_offset = weight_tl; -#endif - char *auxiliary_a = point_ram; - char *auxiliary_b = weight_bl; - __memcpy_async(data_spatial_shapes_nram, data_spatial_shapes_gdram, - num_levels * 2 * sizeof(int32_t), GDRAM2NRAM); - __memcpy_async(data_level_start_index_nram, data_level_start_index_gdram, - num_levels * sizeof(int32_t), GDRAM2NRAM); - __sync(); - for (int32_t batch_idx = batch_start; - batch_idx < batch_start + block_num_per_core; ++batch_idx) { - for (int32_t grid_iter = 0; grid_iter <= g_rep; ++grid_iter) { - int32_t io_data_num = deal_num; - const int32_t grid_off_base = - batch_idx * grid_total + offset_g + grid_iter * deal_num; - if (grid_iter == g_rep) { - if (g_rem == 0) { - continue; - } else { - io_data_num = g_rem; - } - } - char *data_col_gdram_start = - data_col_gdram + (batch_idx * num_queries * num_heads * channels + - (offset_g + grid_iter * deal_num) / - (num_levels * num_points) * channels) * - sizeof(float); - 
// load data_sampling_loc - __memcpy_async( - grid_ram, data_sampling_loc_gdram + grid_off_base * 2 * sizeof(float), - io_data_num * 2 * sizeof(float), GDRAM2NRAM); - genMask0101((float *)mask_ram, deal_num * 2); - __sync(); - // generate x and y coordinate vector - // generate spatial_x and spatial_y spatial vector - __bang_collect((float *)coord_y, (float *)grid_ram, (float *)mask_ram, - deal_num * 2); // y - __bang_collect((float *)spatial_x_temp, (float *)data_spatial_shapes_nram, - (float *)mask_ram, - num_levels * 2); // spatial_x - __bang_not((float *)mask_ram, (float *)mask_ram, deal_num * 2); - __bang_collect((float *)coord_x, (float *)grid_ram, (float *)mask_ram, - deal_num * 2); // x - __bang_collect((float *)spatial_y_temp, (float *)data_spatial_shapes_nram, - (float *)mask_ram, - num_levels * 2); // spatial_y - for (int32_t i = 0; i < num_levels; i++) { - __bang_write_value((int32_t *)spatial_x + i * num_points, num_points, - ((int32_t *)spatial_x_temp)[i]); - __bang_write_value((int32_t *)spatial_y + i * num_points, num_points, - ((int32_t *)spatial_y_temp)[i]); - } - __bang_int322float_rd((float *)spatial_x_float, (int32_t *)spatial_x, - num_levels * num_points, 0); - __bang_int322float_rd((float *)spatial_y_float, (int32_t *)spatial_y, - num_levels * num_points, 0); - /* - map x from [0, 1] to [0, spatial_x]; - map y from [0, 1] to [0, spatial_y] - */ - __bang_cycle_mul((float *)coord_x, (float *)coord_x, - (float *)spatial_x_float, deal_num, - num_levels * num_points); - __bang_sub_scalar((float *)coord_x, (float *)coord_x, (float)0.5, - deal_num); - __bang_cycle_mul((float *)coord_y, (float *)coord_y, - (float *)spatial_y_float, deal_num, - num_levels * num_points); - __bang_sub_scalar((float *)coord_y, (float *)coord_y, (float)0.5, - deal_num); - // generate valid mask, which means the location is nan/inf or not - // condition coordx > -1 / coordy > -1 - __bang_gt_scalar((float *)auxiliary_a, (float *)coord_x, -1.0, deal_num); - __bang_move((char *)valid_mask, (char *)auxiliary_a, - deal_num * sizeof(float)); - __bang_gt_scalar((float *)auxiliary_a, (float *)coord_y, -1.0, deal_num); - __bang_add((float *)valid_mask, (float *)valid_mask, (float *)auxiliary_a, - deal_num); - - // condition coordx < spatial_x / coordy < spatial_y - __bang_cycle_le((float *)mask_bl, (float *)coord_x, - (float *)spatial_x_float, deal_num, - num_levels * num_points); - __bang_cycle_le((float *)mask_br, (float *)coord_y, - (float *)spatial_y_float, deal_num, - num_levels * num_points); - - __bang_add((float *)mask_bl, (float *)mask_bl, (float *)mask_br, - deal_num); - __bang_add((float *)valid_mask, (float *)valid_mask, (float *)mask_bl, - deal_num); - // all condition satisfied, value should be 4. 
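// Editor's note: a scalar sketch of the vectorized validity check above (the
// helper below is illustrative only, not part of the original kernel). Each of
// the four comparisons contributes 1 to valid_mask, so a sampling location
// survives only when the accumulated sum equals 4; nan/inf coordinates fail
// every comparison and therefore yield 0.
inline bool isValidLoc(float x, float y, float spatial_w, float spatial_h) {
  int sum = (x > -1.0f) + (y > -1.0f) + (x <= spatial_w) + (y <= spatial_h);
  return sum == 4;  // mirrors the __bang_eq_scalar(valid_mask, 4) step below
}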
- __bang_eq_scalar((float *)valid_mask, (float *)valid_mask, 4, deal_num); - - // get floor value of coord - __bang_floor((float *)coord_x_low, (float *)coord_x, deal_num); - __bang_floor((float *)coord_y_low, (float *)coord_y, deal_num); - // calc index_tl - const int32_t w_stride = num_heads * channels; - __bang_float2int32_rd((int32_t *)coord_x_low_int, (float *)coord_x_low, - deal_num, 0); - __bang_float2int32_rd((int32_t *)coord_y_low_int, (float *)coord_y_low, - deal_num, 0); - __bang_cycle_mul((int32_t *)index_tl, (int32_t *)coord_y_low_int, - (int32_t *)spatial_x, deal_num, num_levels * num_points); - __bang_add((int32_t *)index_tl, (int32_t *)index_tl, - (int32_t *)coord_x_low_int, deal_num); - __bang_mul_scalar((int32_t *)index_tl, (int32_t *)index_tl, w_stride, - deal_num); -#if MS_DEFORM_ATTN_FORWARD_HEADVECTOR - const int32_t deal_lp_num = deal_num / (num_levels * num_points); - const int32_t h_rep = deal_lp_num / num_heads; - const int32_t h_rem = deal_lp_num % num_heads; - const int32_t head_start = - ((offset_g + grid_iter * deal_num) / (num_levels * num_points)) % - num_heads; - for (int32_t iter = 0; iter < num_heads; ++iter) { - ((int32_t *)base_ptr_offset)[iter] = - ((head_start + iter) % num_heads) * channels; - } - if (h_rep > 0) { - __memcpy((int32_t *)base_ptr_offset + num_heads, - (int32_t *)base_ptr_offset, num_heads * sizeof(int32_t), - NRAM2NRAM, num_heads * sizeof(int32_t), 0, h_rep - 1); - } - if (h_rep > 0 && h_rem > 0) { - __memcpy((int32_t *)base_ptr_offset + h_rep * num_heads, - (int32_t *)base_ptr_offset, h_rem * sizeof(int32_t), - NRAM2NRAM); - } - __bang_transpose((int32_t *)auxiliary_a, (int32_t *)index_tl, deal_lp_num, - num_levels * num_points); - __bang_cycle_add((int32_t *)auxiliary_a, (int32_t *)auxiliary_a, - (int32_t *)base_ptr_offset, deal_num, deal_lp_num); - __bang_transpose((int32_t *)index_tl, (int32_t *)auxiliary_a, - num_levels * num_points, deal_lp_num); -#endif - // calc index_bl - __bang_mul_scalar((int32_t *)auxiliary_a, (int32_t *)spatial_x, w_stride, - deal_num); - __bang_cycle_add((int32_t *)index_bl, (int32_t *)index_tl, - (int32_t *)auxiliary_a, deal_num, - num_levels * num_points); - // calc mask_tl, mask_tr, mask_bl, mask_br - __bang_sub_scalar((float *)spatial_x_float, (float *)spatial_x_float, - (float)1.0, deal_num); - __bang_sub_scalar((float *)spatial_y_float, (float *)spatial_y_float, - (float)1.0, deal_num); - // mask_tl : - // 0 <= coord_x_low < spatial_x && 0 <= coord_y_low < spatial_y - __bang_ge_scalar((float *)mask_bl, (float *)coord_x_low, (float)0, - deal_num); - __bang_cycle_le((float *)mask_br, (float *)coord_x_low, - (float *)spatial_x_float, deal_num, - num_levels * num_points); - __bang_and((float *)mask_bl, (float *)mask_bl, (float *)mask_br, - deal_num); - __bang_ge_scalar((float *)mask_tr, (float *)coord_y_low, (float)0, - deal_num); - __bang_cycle_le((float *)mask_br, (float *)coord_y_low, - (float *)spatial_y_float, deal_num, - num_levels * num_points); - __bang_and((float *)mask_tr, (float *)mask_tr, (float *)mask_br, - deal_num); - __bang_and((float *)mask_tl, (float *)mask_tr, (float *)mask_bl, - deal_num); - // mask_tr : - // 0 <= coord_x_high < spatial_x && 0 <= coord_y_low < spatial_y - __bang_ge_scalar((float *)mask_br, (float *)coord_x_low, (float)(-1.0), - deal_num); - __bang_cycle_lt((float *)auxiliary_a, (float *)coord_x_low, - (float *)spatial_x_float, deal_num, - num_levels * num_points); - __bang_and((float *)mask_br, (float *)mask_br, (float *)auxiliary_a, - deal_num); - __bang_and((float 
*)mask_tr, (float *)mask_tr, (float *)mask_br, - deal_num); - // mask_bl : - // 0 <= coord_x_low < spatial_x && 0 <= coord_y_high < spatial_y - __bang_ge_scalar((float *)auxiliary_a, (float *)coord_y_low, - (float)(-1.0), deal_num); - __bang_cycle_lt((float *)auxiliary_b, (float *)coord_y_low, - (float *)spatial_y_float, deal_num, - num_levels * num_points); - __bang_and((float *)auxiliary_a, (float *)auxiliary_a, - (float *)auxiliary_b, deal_num); - __bang_and((float *)mask_bl, (float *)mask_bl, (float *)auxiliary_a, - deal_num); - // mask_br : - // 0 <= coord_x_high < spatial_x && 0 <= coord_y_high < spatial_y - __bang_and((float *)mask_br, (float *)mask_br, (float *)auxiliary_a, - deal_num); - // if loc has nan/inf, fill invalid value with 0. - // Note that although nan joins the computation, the comparison returns - // a normal value. - __bang_cycle_and((float *)mask_tl, (float *)mask_tl, (float *)valid_mask, - 4 * deal_num, deal_num); - - // switch valid_mask to bit-type mask. 1 to 0xffffffff, 0 to 0x00000000 - // first we cast float32 to int32. then multiply -1, - // whose hex is 0xffffffff - __bang_float2int32_rd((int32_t *)valid_mask, (float *)valid_mask, - deal_num, 0); - __bang_mul_scalar((int32_t *)valid_mask, (int32_t *)valid_mask, -1, - deal_num); - - // calc inner point num - __bang_mul_scalar((float *)weight_tl, (float *)mask_tl, (float)7.0, - deal_num); - __bang_mul_scalar((float *)weight_tr, (float *)mask_tr, (float)5.0, - deal_num); - __bang_add((float *)weight_tl, (float *)weight_tl, (float *)weight_tr, - deal_num); - __bang_mul_scalar((float *)weight_tr, (float *)mask_bl, (float)3.0, - deal_num); - __bang_add((float *)point_ram, (float *)weight_tr, (float *)mask_br, - deal_num); - __bang_add((float *)point_ram, (float *)point_ram, (float *)weight_tl, - deal_num); - // calc interpolation weight - __bang_sub((float *)weight_bl, (float *)coord_x_low, (float *)coord_x, - deal_num); - __bang_sub((float *)weight_br, (float *)coord_y_low, (float *)coord_y, - deal_num); - __bang_add_scalar((float *)weight_bl, (float *)weight_bl, (float)1.0, - deal_num); - __bang_add_scalar((float *)weight_br, (float *)weight_br, (float)1.0, - deal_num); - __bang_sub((float *)weight_tl, (float *)coord_x, (float *)coord_x_low, - deal_num); - __bang_sub((float *)weight_tr, (float *)coord_y, (float *)coord_y_low, - deal_num); - __bang_mul((float *)input_tl, (float *)weight_bl, (float *)weight_br, - deal_num); - __bang_mul((float *)input_tl + deal_num, (float *)weight_br, - (float *)weight_tl, deal_num); - __bang_mul((float *)input_tl + 2 * deal_num, (float *)weight_bl, - (float *)weight_tr, deal_num); - __bang_mul((float *)input_tl + 3 * deal_num, (float *)weight_tl, - (float *)weight_tr, deal_num); - // if loc has nan/inf, fill all invalid positions with 0. - // Note that this operation works at the bit level. 
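// Editor's note: the "inner point num" built above encodes which bilinear
// corners are in-bounds as tl*7 + tr*5 + bl*3 + br*1 (the helper name below is
// illustrative, not part of the original kernel). Over the geometrically
// possible corner subsets this code is unique, and it is exactly what the
// switch (inner_point_num) further down dispatches on: 16 = all four corners,
// 12 = top pair, 10 = left pair, 6 = right pair, 4 = bottom pair, and
// 7 / 5 / 3 / 1 = a single corner.
inline int32_t innerPointCode(bool tl, bool tr, bool bl, bool br) {
  return 7 * tl + 5 * tr + 3 * bl + 1 * br;
}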
- __bang_cycle_band((char *)input_tl, (char *)input_tl, (char *)valid_mask, - 4 * deal_num * sizeof(float), deal_num * sizeof(float)); - __sync(); - // extend weight - const int32_t w_rep = channel / ELE_COUNT * ELE_COUNT; - const int32_t w_rem = channel % ELE_COUNT; - if (w_rem != 0) { - const int32_t data_sz = 1 * sizeof(float); - const int32_t dst_str = channel * sizeof(float); - for (int32_t iter = w_rep; iter < channel; ++iter) { - __memcpy_async((float *)weight_tl + iter, (float *)input_tl, data_sz, - NRAM2NRAM, dst_str, data_sz, 4 * deal_num - 1); - } - } - if (w_rep != 0) { - for (int32_t i = 0; i < 4 * deal_num; i++) { - __bang_write_value((float *)weight_tl + i * channel, w_rep, - ((float *)input_tl)[i]); - } - } - __sync(); - const char *data_value_gdram_start = - data_value_gdram + - batch_idx * num_keys * num_heads * channels * sizeof(float); - const int32_t c_str = deal_num * channel * sizeof(float); - const int32_t cs_str = num_heads * channels * sizeof(float); - for (int32_t c_iter = 0; c_iter <= c_rep; ++c_iter) { - int32_t c_real_num = channel; - if (c_iter == c_rep) { - if (c_rem == 0) { - continue; - } else { - c_real_num = c_rem; - } - } - __bang_write_zero((float *)input_tl, 4 * deal_num * channel); - __sync(); - // load data_value - for (int32_t p_idx = 0; p_idx < io_data_num; ++p_idx) { - const int32_t inner_point_num = (int32_t)((float *)point_ram)[p_idx]; - const int32_t tl_offset = ((int32_t *)index_tl)[p_idx]; - const int32_t bl_offset = ((int32_t *)index_bl)[p_idx]; - const int32_t level_start_id = - ((int32_t *)data_level_start_index_nram)[(p_idx / num_points) % - num_levels]; -#if MS_DEFORM_ATTN_FORWARD_HEADVECTOR - const char *data_value_ptr = - data_value_gdram_start + - (level_start_id * num_heads * channels + c_iter * channel) * - sizeof(float); -#else - const int32_t head_idx = ((p_idx + offset_g + grid_iter * deal_num) / - (num_levels * num_points)) % - num_heads; - const char *data_value_ptr = - data_value_gdram_start + - (level_start_id * num_heads * channels + head_idx * channels + - c_iter * channel) * - sizeof(float); -#endif - switch (inner_point_num) { - case 16: // 4 points are cached. - __memcpy_async((float *)input_tl + p_idx * channel, - (float *)data_value_ptr + tl_offset, - c_real_num * sizeof(float), GDRAM2NRAM, c_str, - cs_str, 1); - __memcpy_async((float *)input_bl + p_idx * channel, - (float *)data_value_ptr + bl_offset, - c_real_num * sizeof(float), GDRAM2NRAM, c_str, - cs_str, 1); - break; - case 12: // 2 points are cached. (top_left, top_right) - __memcpy_async((float *)input_tl + p_idx * channel, - (float *)data_value_ptr + tl_offset, - c_real_num * sizeof(float), GDRAM2NRAM, c_str, - cs_str, 1); - break; - case 4: // 2 points are cached. (bottom_left, bottom_right) - __memcpy_async((float *)input_bl + p_idx * channel, - (float *)data_value_ptr + bl_offset, - c_real_num * sizeof(float), GDRAM2NRAM, c_str, - cs_str, 1); - break; - case 10: // 2 points are cached. (top_left, bottom_left) - __memcpy_async((float *)input_tl + p_idx * channel, - (float *)data_value_ptr + tl_offset, - c_real_num * sizeof(float), GDRAM2NRAM); - __memcpy_async((float *)input_bl + p_idx * channel, - (float *)data_value_ptr + bl_offset, - c_real_num * sizeof(float), GDRAM2NRAM); - break; - case 6: // 2 points are cached. 
(top_right, bottom_right) - __memcpy_async( - (float *)input_tr + p_idx * channel, - (float *)data_value_ptr + tl_offset + num_heads * channels, - c_real_num * sizeof(float), GDRAM2NRAM); - __memcpy_async( - (float *)input_br + p_idx * channel, - (float *)data_value_ptr + bl_offset + num_heads * channels, - c_real_num * sizeof(float), GDRAM2NRAM); - break; - case 7: // 1 point is cached. (top_left) - __memcpy_async((float *)input_tl + p_idx * channel, - (float *)data_value_ptr + tl_offset, - c_real_num * sizeof(float), GDRAM2NRAM); - break; - case 5: // 1 point is cached. (top_right) - __memcpy_async( - (float *)input_tr + p_idx * channel, - (float *)data_value_ptr + tl_offset + num_heads * channels, - c_real_num * sizeof(float), GDRAM2NRAM); - break; - case 3: // 1 point is cached. (bottom_left) - __memcpy_async((float *)input_bl + p_idx * channel, - (float *)data_value_ptr + bl_offset, - c_real_num * sizeof(float), GDRAM2NRAM); - break; - case 1: // 1 point is cached. (bottom_right) - __memcpy_async( - (float *)input_br + p_idx * channel, - (float *)data_value_ptr + bl_offset + num_heads * channels, - c_real_num * sizeof(float), GDRAM2NRAM); - break; - default: - continue; - } - } - __sync(); - // interpolation - __bang_mul((float *)input_tl, (float *)input_tl, (float *)weight_tl, - 4 * deal_num * channel); - __bang_add((float *)input_tl, (float *)input_tl, (float *)input_bl, - 2 * deal_num * channel); - __bang_add((float *)input_tl, (float *)input_tl, (float *)input_tr, - deal_num * channel); - // load attention weight - void *attn_weight = mask_tl; - __memcpy((float *)attn_weight, - (float *)data_attn_weight_gdram + grid_off_base, - io_data_num * sizeof(float), GDRAM2NRAM); - // calc data_col, muladd attention weight - __bang_transpose((float *)input_tr, (float *)input_tl, deal_num, - channel); - __bang_cycle_mul((float *)input_tr, (float *)input_tr, - (float *)attn_weight, deal_num * channel, deal_num); - __bang_transpose((float *)input_tl, (float *)input_tr, channel, - deal_num); - __bang_sumpool((float *)input_bl, (float *)input_tl, channel, 1, - io_data_num, 1, num_levels * num_points, - num_levels * num_points, 1); - // store - __memcpy((float *)data_col_gdram_start + c_iter * channel, - (float *)input_bl, c_real_num * sizeof(float), NRAM2GDRAM, - channels * sizeof(float), channel * sizeof(float), - (io_data_num / (num_levels * num_points)) - 1); - } - } - } - __sync(); - return; -#endif -} - -template __mlu_global__ void MLUKernelMsDeformAttnForwardSmallChannel<float>( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char *data_col_gdram); diff --git a/kernels/ms_deform_attn_forward/msda_forward_union1_default.mlu b/kernels/ms_deform_attn_forward/msda_forward_union1_default.mlu deleted file mode 100644 index 18ec006f5..000000000 --- a/kernels/ms_deform_attn_forward/msda_forward_union1_default.mlu +++ /dev/null @@ -1,484 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include - -#include "kernels/ms_deform_attn_forward/ms_deform_attn_forward.h" - -#define TWELVE_SPLIT 12 -#define ELE_COUNT 32 /* cycle element count */ - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -template <typename T> -__mlu_func__ void loadNeighborPointsData( - const T *data_value_gdram, T *data_value_p1_nram, T *data_value_p2_nram, - T *data_value_p3_nram, T *data_value_p4_nram, const size_t &deal_num, - const int32_t &width, const int32_t &height, const int32_t &num_heads, - const int32_t &channels, const T &x, const T &y, const int32_t &head_idx) { - const int32_t w_low = floorf(x); - const int32_t h_low = floorf(y); - const int32_t w_high = w_low + 1; - const int32_t h_high = h_low + 1; - const int32_t w_stride = num_heads * channels; - const int32_t h_stride = width * w_stride; - const int32_t h_low_ptr_offset = h_low * h_stride; - const int32_t h_high_ptr_offset = h_low_ptr_offset + h_stride; - const int32_t w_low_ptr_offset = w_low * w_stride; - const int32_t w_high_ptr_offset = w_low_ptr_offset + w_stride; - const int32_t base_ptr_offset = head_idx * channels; - // top-left point - if (h_low >= 0 && w_low >= 0) { - const int32_t v1_offset = - h_low_ptr_offset + w_low_ptr_offset + base_ptr_offset; - __memcpy_async(data_value_p1_nram, data_value_gdram + v1_offset, - deal_num * sizeof(T), GDRAM2NRAM); - } - // top-right point - if (h_low >= 0 && w_high <= width - 1) { - const int32_t v2_offset = - h_low_ptr_offset + w_high_ptr_offset + base_ptr_offset; - __memcpy_async(data_value_p2_nram, data_value_gdram + v2_offset, - deal_num * sizeof(T), GDRAM2NRAM); - } - // bottom-left point - if (h_high <= height - 1 && w_low >= 0) { - const int32_t v3_offset = - h_high_ptr_offset + w_low_ptr_offset + base_ptr_offset; - __memcpy_async(data_value_p3_nram, data_value_gdram + v3_offset, - deal_num * sizeof(T), GDRAM2NRAM); - } - // bottom-right point - if (h_high <= height - 1 && w_high <= width - 1) { - const int32_t v4_offset = - h_high_ptr_offset + w_high_ptr_offset + base_ptr_offset; - __memcpy_async(data_value_p4_nram, data_value_gdram + v4_offset, - deal_num * sizeof(T), GDRAM2NRAM); - } -} - -template <typename T> -__mlu_func__ void computeMsDeformAttn( - T *data_value_p1_nram, T *data_value_p2_nram, T *data_value_p3_nram, - T *data_value_p4_nram, T *sample_point_value, T *auxiliary_b, - T *data_col_nram, const T &weight, const size_t &deal_num, - const
int32_t &width, const int32_t &height, const T &x, const T &y) { - const int32_t w_low = floorf(x); - const int32_t h_low = floorf(y); - const int32_t w_high = w_low + 1; - const int32_t h_high = h_low + 1; - const T lw = x - w_low; - const T lh = y - h_low; - const T hw = 1 - lw; - const T hh = 1 - lh; - const T w1 = hh * hw; - const T w2 = hh * lw; - const T w3 = lh * hw; - const T w4 = lh * lw; - - __bang_write_value((T *)sample_point_value, deal_num, (T)0); - - // top-left point - if (h_low >= 0 && w_low >= 0) { - // sample_point_value += v1 * w1 - __bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p1_nram, (T)w1, - deal_num); - __bang_add((T *)sample_point_value, (T *)sample_point_value, - (T *)auxiliary_b, deal_num); - } - // top-right point - if (h_low >= 0 && w_high <= width - 1) { - // sample_point_value += v2 * w2 - __bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p2_nram, (T)w2, - deal_num); - __bang_add((T *)sample_point_value, (T *)sample_point_value, - (T *)auxiliary_b, deal_num); - } - // bottom-left point - if (h_high <= height - 1 && w_low >= 0) { - // sample_point_value += v3 * w3 - __bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p3_nram, (T)w3, - deal_num); - __bang_add((T *)sample_point_value, (T *)sample_point_value, - (T *)auxiliary_b, deal_num); - } - // bottom-right point - if (h_high <= height - 1 && w_high <= width - 1) { - // sample_point_value += v4 * w4 - __bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p4_nram, (T)w4, - deal_num); - __bang_add((T *)sample_point_value, (T *)sample_point_value, - (T *)auxiliary_b, deal_num); - } - __bang_mul_scalar((T *)sample_point_value, (T *)sample_point_value, (T)weight, - deal_num); - __bang_add((T *)data_col_nram, (T *)data_col_nram, (T *)sample_point_value, - deal_num); -} - -template <typename T> -__mlu_global__ void MLUKernelMsDeformAttnForwardDefault( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char *data_col_gdram) { - if (__is_mpu()) { - return; - } - const size_t spatial_size = PAD_UP(2 * sizeof(int32_t), NFU_ALIGN_SIZE); - const size_t span_num_deal = - PAD_DOWN((MAX_NRAM_SIZE - spatial_size) / TWELVE_SPLIT / sizeof(T), - NFU_ALIGN_SIZE); - const size_t align_num = NFU_ALIGN_SIZE; - const int32_t channels_seg_num = channels / span_num_deal; - const size_t channels_rem = channels % span_num_deal; - const size_t channels_align_rem = CEIL_ALIGN(channels_rem, align_num); - char *data_spatial_shapes_nram = nram_buffer; - char *ping_data_value_p1_nram = data_spatial_shapes_nram + spatial_size; - char *ping_data_value_p2_nram = - ping_data_value_p1_nram + span_num_deal * sizeof(T); - char *ping_data_value_p3_nram = - ping_data_value_p2_nram + span_num_deal * sizeof(T); - char *ping_data_value_p4_nram = - ping_data_value_p3_nram + span_num_deal * sizeof(T); - char *ping_data_col_nram = - ping_data_value_p4_nram + span_num_deal * sizeof(T); - char *pong_data_value_p1_nram = - ping_data_col_nram + span_num_deal * sizeof(T); - char *pong_data_value_p2_nram = - pong_data_value_p1_nram + span_num_deal * sizeof(T); - char *pong_data_value_p3_nram = - pong_data_value_p2_nram + span_num_deal * sizeof(T); - char *pong_data_value_p4_nram = - pong_data_value_p3_nram + span_num_deal * sizeof(T); - char 
*pong_data_col_nram = - pong_data_value_p4_nram + span_num_deal * sizeof(T); - char *auxiliary_a = pong_data_col_nram + span_num_deal * sizeof(T); - char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T); - const size_t ping_pong_gap = 5 * span_num_deal * sizeof(T); - size_t data_col_ping_pong_idx = 0; - int32_t block_num_per_core = (batch_size * num_queries * num_heads) / taskDim; - const int32_t block_num_rem = - (batch_size * num_queries * num_heads) % taskDim; - const int32_t idx_start = taskId < (block_num_rem + 1) - ? taskId * (block_num_per_core + 1) - : taskId * block_num_per_core + block_num_rem; - block_num_per_core = - taskId < block_num_rem - ? (batch_size * num_queries * num_heads) / taskDim + 1 - : (batch_size * num_queries * num_heads) / taskDim; - for (int32_t cur_idx = idx_start; cur_idx < idx_start + block_num_per_core; - ++cur_idx) { - /* - cur_idx = batch_idx * num_queries * num_heads + - query_idx * num_heads + head_idx - */ - const int32_t head_idx = cur_idx % num_heads; - const int32_t batch_idx = (cur_idx / num_heads) / num_queries; - const char *data_value_gdram_start = - data_value_gdram + - batch_idx * num_keys * num_heads * channels * sizeof(T); - const char *data_sampling_loc_gdram_start = - data_sampling_loc_gdram + - cur_idx * num_levels * num_points * 2 * sizeof(T); - const char *data_attn_weight_gdram_start = - data_attn_weight_gdram + cur_idx * num_levels * num_points * sizeof(T); - char *data_col_gdram_start = - data_col_gdram + cur_idx * channels * sizeof(T); - for (int32_t c_seg_idx = 0; c_seg_idx < channels_seg_num; ++c_seg_idx) { - __bang_write_value( - (T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap), - span_num_deal, (T)0); - // load data - // level_idx = 0, point_idx = 0 - __memcpy(data_spatial_shapes_nram, data_spatial_shapes_gdram, - 2 * sizeof(int32_t), GDRAM2NRAM); - int32_t spatial_h = ((int32_t *)data_spatial_shapes_nram)[0]; - int32_t spatial_w = ((int32_t *)data_spatial_shapes_nram)[1]; - const char *data_value_ptr = - data_value_gdram_start + c_seg_idx * span_num_deal * sizeof(T); - T loc_w = ((T *)data_sampling_loc_gdram_start)[0]; - T loc_h = ((T *)data_sampling_loc_gdram_start)[1]; - T weight = ((T *)data_attn_weight_gdram_start)[0]; - T x = loc_w * spatial_w - 0.5; - T y = loc_h * spatial_h - 0.5; - if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) { - loadNeighborPointsData( - (T *)data_value_ptr, (T *)ping_data_value_p1_nram, - (T *)ping_data_value_p2_nram, (T *)ping_data_value_p3_nram, - (T *)ping_data_value_p4_nram, span_num_deal, spatial_w, spatial_h, - num_heads, channels, x, y, head_idx); - } - T spatial_h_next_point = 0; - T spatial_w_next_point = 0; - T weight_next_point = 0; - T x_next_point = 0; - T y_next_point = 0; - __sync(); - for (int32_t level_idx = 0; level_idx < num_levels; ++level_idx) { - for (int32_t point_idx = 0; point_idx < num_points; ++point_idx) { - // load data - if (point_idx == num_points - 1 && level_idx == num_levels - 1) { - // last point no need to load data, continue to compute - } else if (point_idx == num_points - 1) { - const int32_t level_start_id = - ((int32_t *)data_level_start_index_gdram)[level_idx + 1]; - const int32_t spatial_h_ptr = (level_idx + 1) << 1; - __memcpy( - data_spatial_shapes_nram, - data_spatial_shapes_gdram + spatial_h_ptr * sizeof(int32_t), - 2 * sizeof(int32_t), GDRAM2NRAM); - spatial_h_next_point = ((int32_t *)data_spatial_shapes_nram)[0]; - spatial_w_next_point = ((int32_t *)data_spatial_shapes_nram)[1]; - data_value_ptr = 
data_value_gdram_start + - (level_start_id * num_heads * channels + - c_seg_idx * span_num_deal) * - sizeof(T); - loc_w = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2]; - loc_h = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2 + 1]; - weight_next_point = - ((T *)data_attn_weight_gdram_start)[level_idx * num_points + - point_idx + 1]; - x_next_point = loc_w * spatial_w_next_point - 0.5; - y_next_point = loc_h * spatial_h_next_point - 0.5; - if (y_next_point > -1 && x_next_point > -1 && - y_next_point < spatial_h_next_point && - x_next_point < spatial_w_next_point) { - loadNeighborPointsData( - (T *)data_value_ptr, - (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - span_num_deal, spatial_w_next_point, spatial_h_next_point, - num_heads, channels, x_next_point, y_next_point, head_idx); - } - } else { - spatial_h_next_point = spatial_h; - spatial_w_next_point = spatial_w; - loc_w = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2]; - loc_h = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2 + 1]; - weight_next_point = - ((T *)data_attn_weight_gdram_start)[level_idx * num_points + - point_idx + 1]; - x_next_point = loc_w * spatial_w - 0.5; - y_next_point = loc_h * spatial_h - 0.5; - if (y_next_point > -1 && x_next_point > -1 && - y_next_point < spatial_h && x_next_point < spatial_w) { - loadNeighborPointsData( - (T *)data_value_ptr, - (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - span_num_deal, spatial_w, spatial_h, num_heads, channels, - x_next_point, y_next_point, head_idx); - } - } - // compute - if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) { - computeMsDeformAttn( - (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT - (T *)auxiliary_a, (T *)auxiliary_b, - (T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap), // NOLINT - weight, span_num_deal, spatial_w, spatial_h, x, y); - } - spatial_w = spatial_w_next_point; - spatial_h = spatial_h_next_point; - weight = weight_next_point; - x = x_next_point; - y = y_next_point; - __sync(); - } - } - // store - __memcpy_async( - data_col_gdram_start + c_seg_idx * span_num_deal * sizeof(T), - ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap, - span_num_deal * sizeof(T), NRAM2GDRAM); - data_col_ping_pong_idx = 
(data_col_ping_pong_idx + 1) % 2; - } - if (channels_rem > 0) { - __bang_write_value( - (T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap), - channels_align_rem, (T)0); - // load data - // level_idx = 0, point_idx = 0 - __memcpy(data_spatial_shapes_nram, data_spatial_shapes_gdram, - 2 * sizeof(int32_t), GDRAM2NRAM); - int32_t spatial_h = ((int32_t *)data_spatial_shapes_nram)[0]; - int32_t spatial_w = ((int32_t *)data_spatial_shapes_nram)[1]; - const char *data_value_ptr = - data_value_gdram_start + channels_seg_num * span_num_deal * sizeof(T); - T loc_w = ((T *)data_sampling_loc_gdram_start)[0]; - T loc_h = ((T *)data_sampling_loc_gdram_start)[1]; - T weight = ((T *)data_attn_weight_gdram_start)[0]; - T x = loc_w * spatial_w - 0.5; - T y = loc_h * spatial_h - 0.5; - if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) { - loadNeighborPointsData( - (T *)data_value_ptr, (T *)ping_data_value_p1_nram, - (T *)ping_data_value_p2_nram, (T *)ping_data_value_p3_nram, - (T *)ping_data_value_p4_nram, channels_rem, spatial_w, spatial_h, - num_heads, channels, x, y, head_idx); - } - T spatial_h_next_point = 0; - T spatial_w_next_point = 0; - T weight_next_point = 0; - T x_next_point = 0; - T y_next_point = 0; - __sync(); - for (int32_t level_idx = 0; level_idx < num_levels; ++level_idx) { - for (int32_t point_idx = 0; point_idx < num_points; ++point_idx) { - // load data - if (point_idx == num_points - 1 && level_idx == num_levels - 1) { - // last point no need to load data, continue to compute - } else if (point_idx == num_points - 1) { - const int32_t level_start_id = - ((int32_t *)data_level_start_index_gdram)[level_idx + 1]; - const int32_t spatial_h_ptr = (level_idx + 1) << 1; - __memcpy( - data_spatial_shapes_nram, - data_spatial_shapes_gdram + spatial_h_ptr * sizeof(int32_t), - 2 * sizeof(int32_t), GDRAM2NRAM); - spatial_h_next_point = ((int32_t *)data_spatial_shapes_nram)[0]; - spatial_w_next_point = ((int32_t *)data_spatial_shapes_nram)[1]; - data_value_ptr = data_value_gdram_start + - (level_start_id * num_heads * channels + - channels_seg_num * span_num_deal) * - sizeof(T); - loc_w = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2]; - loc_h = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2 + 1]; - weight_next_point = - ((T *)data_attn_weight_gdram_start)[level_idx * num_points + - point_idx + 1]; - x_next_point = loc_w * spatial_w_next_point - 0.5; - y_next_point = loc_h * spatial_h_next_point - 0.5; - if (y_next_point > -1 && x_next_point > -1 && - y_next_point < spatial_h_next_point && - x_next_point < spatial_w_next_point) { - loadNeighborPointsData( - (T *)data_value_ptr, - (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - channels_rem, spatial_w_next_point, spatial_h_next_point, - num_heads, channels, x_next_point, y_next_point, head_idx); - } - } else { - spatial_w_next_point = spatial_w; - spatial_h_next_point = spatial_h; - loc_w = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) * 2]; - loc_h = ((T *)data_sampling_loc_gdram_start) - [(level_idx * num_points + point_idx + 1) 
* 2 + 1]; - weight_next_point = - ((T *)data_attn_weight_gdram_start)[level_idx * num_points + - point_idx + 1]; - x_next_point = loc_w * spatial_w - 0.5; - y_next_point = loc_h * spatial_h - 0.5; - if (y_next_point > -1 && x_next_point > -1 && - y_next_point < spatial_h && x_next_point < spatial_w) { - loadNeighborPointsData( - (T *)data_value_ptr, - (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx + 1) % 2) * ping_pong_gap), // NOLINT - channels_rem, spatial_w, spatial_h, num_heads, channels, - x_next_point, y_next_point, head_idx); - } - } - // compute - if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) { - computeMsDeformAttn( - (T *)(ping_data_value_p1_nram + - ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p2_nram + - ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p3_nram + - ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT - (T *)(ping_data_value_p4_nram + - ((level_idx * num_points + point_idx) % 2) * ping_pong_gap), // NOLINT - (T *)auxiliary_a, (T *)auxiliary_b, - (T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap), // NOLINT - weight, channels_align_rem, spatial_w, spatial_h, x, y); - } - spatial_w = spatial_w_next_point; - spatial_h = spatial_h_next_point; - weight = weight_next_point; - x = x_next_point; - y = y_next_point; - __sync(); - } - } - // store - __memcpy_async( - data_col_gdram_start + channels_seg_num * span_num_deal * sizeof(T), - ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap, - channels_rem * sizeof(T), NRAM2GDRAM); - data_col_ping_pong_idx = (data_col_ping_pong_idx + 1) % 2; - } - } - __sync(); - return; -} - -template __mlu_global__ void MLUKernelMsDeformAttnForwardDefault( - const char *data_value_gdram, const char *data_spatial_shapes_gdram, - const char *data_level_start_index_gdram, - const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram, - const int32_t batch_size, const int32_t num_keys, const int32_t num_heads, - const int32_t channels, const int32_t num_levels, const int32_t num_queries, - const int32_t num_points, char *data_col_gdram); diff --git a/kernels/mutual_information_backward/mutual_information_backward.cpp b/kernels/mutual_information_backward/mutual_information_backward.cpp deleted file mode 100644 index e4a6883d2..000000000 --- a/kernels/mutual_information_backward/mutual_information_backward.cpp +++ /dev/null @@ -1,863 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
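For reference, the sampling that the pipelined kernel above performs per point is ordinary bilinear interpolation scaled by the attention weight. A minimal scalar sketch for a single level, head, and query, with a channel-contiguous value layout for clarity (the function name and layout are illustrative, not the patch's API):

#include <cmath>
#include <vector>

// out[c] accumulates attn_weight * bilinear(value, x, y, c), where
// x = loc_w * W - 0.5 and y = loc_h * H - 0.5; points falling outside
// (-1, W) x (-1, H) contribute nothing -- the same guard the kernel uses.
// The caller is expected to size out to C and call this once per point.
void msDeformAttnForwardRef(const std::vector<float>& value,  // [H * W * C]
                            int H, int W, int C,
                            float loc_w, float loc_h, float attn_weight,
                            std::vector<float>& out) {        // [C]
  const float x = loc_w * W - 0.5f;
  const float y = loc_h * H - 0.5f;
  if (!(y > -1 && x > -1 && y < H && x < W)) return;
  const int x_low = static_cast<int>(std::floor(x));
  const int y_low = static_cast<int>(std::floor(y));
  const float lx = x - x_low, ly = y - y_low;  // fractional offsets
  const float hx = 1.f - lx, hy = 1.f - ly;
  // Weights of the four neighbours (p1..p4 in the kernel's naming).
  const float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
  auto at = [&](int yy, int xx, int c) -> float {
    if (yy < 0 || yy >= H || xx < 0 || xx >= W) return 0.f;  // clipped corner
    return value[(yy * W + xx) * C + c];
  };
  for (int c = 0; c < C; ++c) {
    const float v = w1 * at(y_low, x_low, c) + w2 * at(y_low, x_low + 1, c) +
                    w3 * at(y_low + 1, x_low, c) + w4 * at(y_low + 1, x_low + 1, c);
    out[c] += attn_weight * v;
  }
}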
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "mutual_information_backward.h" - -#include -#include - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" -#include "kernels/utils/cnnl_helper.h" - -#define API_NAME "[mluOpMutualInformationBackward]" - -mluOpStatus_t MLUOP_WIN_API mluOpGetMutualInformationBackwardWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc, - const mluOpTensorDescriptor_t ans_grad_desc, const bool overwrite_ans_grad, - size_t *workspace_size) { - PARAM_CHECK(API_NAME, handle != nullptr); - PARAM_CHECK(API_NAME, px_desc != nullptr); - PARAM_CHECK(API_NAME, py_desc != nullptr); - PARAM_CHECK(API_NAME, p_desc != nullptr); - PARAM_CHECK(API_NAME, ans_grad_desc != nullptr); - PARAM_CHECK(API_NAME, workspace_size != nullptr); - // Use for p_grad size, only support float data type now - *workspace_size = mluOpGetTensorElementNum(p_desc) * sizeof(float); - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorDim( - const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc, - const mluOpTensorDescriptor_t ans_grad_desc, - const mluOpTensorDescriptor_t px_grad_desc, - const mluOpTensorDescriptor_t py_grad_desc) { - if (3 != px_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of px must be 3. " - << "But now the dim of px is " << px_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (3 != py_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of py must be 3. " - << "But now the dim of py is " << py_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (nullptr != opt_boundary_desc && 2 != opt_boundary_desc->dim) { - LOG(ERROR) << API_NAME - << " The dim of opt_boundary must be 2 when opt_boundary is " - << "not NULL. But now the dim of opt_boundary is " - << opt_boundary_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (3 != p_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of p must be 3. 
" - << "But now the dim of p is " << p_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (1 != ans_grad_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of ans_grad must be 1. " - << "But now the dim of ans_grad is " << ans_grad_desc->dim - << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (3 != px_grad_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of px_grad must be 3. " - << "But now the dim of px_grad is " << px_grad_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (3 != py_grad_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of py_grad must be 3. " - << "But now the dim of py_grad is " << py_grad_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorShape( - const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc, - const mluOpTensorDescriptor_t ans_grad_desc, - const mluOpTensorDescriptor_t px_grad_desc, - const mluOpTensorDescriptor_t py_grad_desc) { - const int B = px_desc->dims[0]; - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - if (B != py_desc->dims[0] || B != p_desc->dims[0] || - B != ans_grad_desc->dims[0] || B != px_grad_desc->dims[0] || - B != py_grad_desc->dims[0]) { - LOG(ERROR) << API_NAME - << " px.shape[0], py.shape[0], p.shape[0], ans_grad.shape[0], " - << "px_grad.shape[0] and py_grad.shape[0] must be same. But now " - << "px.shape[0] is " << px_desc->dims[0] << ", py.shape[0] is " - << py_desc->dims[0] << ", p.shape[0] is " << p_desc->dims[0] - << ", ans_grad.shape[0] is " << ans_grad_desc->dims[0] - << ", px_grad.shape[0] is " << px_grad_desc->dims[0] - << ", py_grad.shape[0] is " << py_grad_desc->dims[0] << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - // Currently only supports !modified, so the shape of px must be [B, S, T+1] - if (T + 1 != px_desc->dims[2]) { - LOG(ERROR) << API_NAME << " Currently only supports the case that " - << "px.shape[2] must be equal to py.shape[2] + 1. But now " - << "px.shape[2] is " << px_desc->dims[2] << ", py.shape[2] is " - << py_desc->dims[2] << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - // The shape of py must be [B, S+1, T] - if (S + 1 != py_desc->dims[1]) { - LOG(ERROR) << API_NAME << " py.shape[1] must be equal to px.shape[1] + 1. " - << "But now px.shape[1] is " << px_desc->dims[1] - << ", py.shape[1] is " << py_desc->dims[1] << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - // The shape of opt_boundary must be [B, 4] - if (nullptr != opt_boundary_desc && - (B != opt_boundary_desc->dims[0] || 4 != opt_boundary_desc->dims[1])) { - LOG(ERROR) << API_NAME << " When opt_boundary is not NULL, " - << "opt_boundary.shape[0] and px.shape[0] must be same, and " - << "opt_boundary.shape[1] must be 4. But now " - << "px.shape[0] is " << px_desc->dims[0] - << ", opt_boundary.shape[0] is " << opt_boundary_desc->dims[0] - << ", opt_boundary.shape[1] is " << opt_boundary_desc->dims[1] - << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - // The shape of p must be [B, S+1, T+1] - if (S + 1 != p_desc->dims[1] || T + 1 != p_desc->dims[2]) { - LOG(ERROR) << API_NAME << " p.shape[1] and py.shape[1] must be same, and " - << "p.shape[2] must be equal to py.shape[2] + 1. 
" - << "But now p.shape[1] is " << p_desc->dims[1] - << ", py.shape[1] is " << py_desc->dims[1] << ", p.shape[2] is " - << p_desc->dims[2] << ", py.shape[2] is " << py_desc->dims[2] - << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - // The shape of px and px_grad must be same: [B, S, T+1] - for (int i = 1; i < px_grad_desc->dim; ++i) { - if (px_grad_desc->dims[i] != px_desc->dims[i]) { - LOG(ERROR) << API_NAME - << " The shape of px and px_grad must be same. But now " - << "px.shape[" << i << "] is " << px_desc->dims[i] - << ", px_grad.shape[" << i << "] is " << px_grad_desc->dims[i] - << "."; - return MLUOP_STATUS_BAD_PARAM; - } - } - - // The shape of py and py_grad must be same: [B, S+1, T] - for (int i = 1; i < py_grad_desc->dim; ++i) { - if (py_grad_desc->dims[i] != py_desc->dims[i]) { - LOG(ERROR) << API_NAME - << " The shape of py and py_grad must be same. But now " - << "py.shape[" << i << "] is " << py_desc->dims[i] - << ", py_grad.shape[" << i << "] is " << py_grad_desc->dims[i] - << "."; - return MLUOP_STATUS_BAD_PARAM; - } - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorDatatype( - const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc, - const mluOpTensorDescriptor_t ans_grad_desc, - const mluOpTensorDescriptor_t px_grad_desc, - const mluOpTensorDescriptor_t py_grad_desc) { - if (MLUOP_DTYPE_FLOAT != px_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of px currently only support float. But now " - << "the data type of px is " - << mluOpGetNameOfDataType(px_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - if (MLUOP_DTYPE_FLOAT != py_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of py currently only support float. But now " - << "the data type of py is " - << mluOpGetNameOfDataType(py_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - if (nullptr != opt_boundary_desc && - MLUOP_DTYPE_INT64 != opt_boundary_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of opt_boundary currently only support int64." - << " But now the data type of opt_boundary is " - << mluOpGetNameOfDataType(opt_boundary_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - if (MLUOP_DTYPE_FLOAT != p_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of p currently only support float. But now " - << "the data type of p is " - << mluOpGetNameOfDataType(p_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - if (MLUOP_DTYPE_FLOAT != ans_grad_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of ans_grad currently only support float. " - << "But now the data type of ans_grad is " - << mluOpGetNameOfDataType(ans_grad_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - if (MLUOP_DTYPE_FLOAT != px_grad_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of px_grad currently only support float. " - << "But now the data type of px_grad is " - << mluOpGetNameOfDataType(px_grad_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - if (MLUOP_DTYPE_FLOAT != py_grad_desc->dtype) { - LOG(ERROR) << API_NAME - << "The data type of py_grad currently only support float. 
" - << "But now the data type of py_grad is " - << mluOpGetNameOfDataType(py_grad_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorScaleLimit( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc) { - // Check large tensor - // The shape of px and px_grad are the same, - // The shape of py and py_grad are the same, - // So there is no need to check the tensor num of px_grad and py_grad - if (mluOpGetTensorElementNum(px_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(py_desc) >= LARGE_TENSOR_NUM || - (nullptr != opt_boundary_desc && - mluOpGetTensorElementNum(opt_boundary_desc) >= LARGE_TENSOR_NUM) || - mluOpGetTensorElementNum(p_desc) >= LARGE_TENSOR_NUM) { - LOG(ERROR) << API_NAME << " Overflow max tensor num." - << " Current operator supports tensor num smaller than 2^31."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorPtr( - const void *px, const void *py, const void *p, const void *ans_grad, - const mluOpTensorDescriptor_t opt_boundary_desc, const void *opt_boundary, - const void *px_grad, const void *py_grad, const int S, const int T, - bool &has_boundary) { - if (S > 0) { - PARAM_CHECK(API_NAME, px != nullptr); - PARAM_CHECK(API_NAME, px_grad != nullptr); - } else { - VLOG(5) << API_NAME << " px.shape[1] is zero."; - } - - if (T > 0) { - PARAM_CHECK(API_NAME, py != nullptr); - PARAM_CHECK(API_NAME, py_grad != nullptr); - } else { - VLOG(5) << API_NAME << " py.shape[2] is zero."; - } - - PARAM_CHECK(API_NAME, p != nullptr); - PARAM_CHECK(API_NAME, ans_grad != nullptr); - - if (nullptr != opt_boundary_desc && nullptr != opt_boundary) { - has_boundary = true; - VLOG(5) << API_NAME << " opt_boundary is not NULL."; - - } else if (nullptr == opt_boundary_desc && nullptr == opt_boundary) { - has_boundary = false; - VLOG(5) << API_NAME << " opt_boundary is NULL."; - } else { - LOG(ERROR) << API_NAME - << " opt_boundary_desc and opt_boundary must both be NULL, " - << "or both not be NULL."; - return MLUOP_STATUS_BAD_PARAM; - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t mutualInformationBackwardParamCheck( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const mluOpTensorDescriptor_t opt_boundary_desc, const void *opt_boundary, - const mluOpTensorDescriptor_t p_desc, const void *p, - const mluOpTensorDescriptor_t ans_grad_desc, void *ans_grad, - void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t px_grad_desc, void *px_grad, - const mluOpTensorDescriptor_t py_grad_desc, void *py_grad, - bool &has_boundary, bool &zero_element) { - // 1. check handle and tensor_desc - PARAM_CHECK(API_NAME, handle != nullptr); - PARAM_CHECK(API_NAME, px_desc != nullptr); - PARAM_CHECK(API_NAME, py_desc != nullptr); - PARAM_CHECK(API_NAME, p_desc != nullptr); - PARAM_CHECK(API_NAME, ans_grad_desc != nullptr); - PARAM_CHECK(API_NAME, px_grad_desc != nullptr); - PARAM_CHECK(API_NAME, py_grad_desc != nullptr); - - // Since the layout of all tensors are ARRAY, so skip check tensor layout - - // 2. check mlu platform - if (handle->arch < 372) { - LOG(ERROR) << API_NAME << " Only mlu300 and above devices are supported." 
- << " Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - - // 3. check tensor dim - mluOpStatus_t check_status = - checkTensorDim(px_desc, py_desc, opt_boundary_desc, p_desc, ans_grad_desc, - px_grad_desc, py_grad_desc); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - // 4. check tensor shape - check_status = checkTensorShape(px_desc, py_desc, opt_boundary_desc, p_desc, - ans_grad_desc, px_grad_desc, py_grad_desc); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - // 5. check tensor dtype - check_status = - checkTensorDatatype(px_desc, py_desc, opt_boundary_desc, p_desc, - ans_grad_desc, px_grad_desc, py_grad_desc); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - // 6. check scale limit, for large tensor - check_status = checkTensorScaleLimit(handle, px_desc, py_desc, - opt_boundary_desc, p_desc); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - const int B = px_desc->dims[0]; - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - - // 7. check zero element. - if (0 == B || (0 == S && 0 == T)) { - zero_element = true; - VLOG(5) << API_NAME << " Skip zero element tensor when px.shape[0] is zero " - << "or px.shape[1] and py.shape[2] are both zero."; - return MLUOP_STATUS_SUCCESS; - } - - // 8 check workspace - if (workspace_size > 0) { - PARAM_CHECK(API_NAME, workspace != nullptr); - } - - // 9. check tensor ptr - check_status = - checkTensorPtr(px, py, p, ans_grad, opt_boundary_desc, opt_boundary, - px_grad, py_grad, S, T, has_boundary); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - return MLUOP_STATUS_SUCCESS; -} - -static void mutualInformationBackwardGencase( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const mluOpTensorDescriptor_t opt_boundary_desc, const void *opt_boundary, - const mluOpTensorDescriptor_t p_desc, const void *p, - const mluOpTensorDescriptor_t ans_grad_desc, void *ans_grad, - const bool overwrite_ans_grad, const mluOpTensorDescriptor_t px_grad_desc, - void *px_grad, const mluOpTensorDescriptor_t py_grad_desc, void *py_grad) { - GEN_CASE_START("mutual_information_backward"); - GEN_CASE_HANDLE(handle); - - GEN_CASE_DATA(true, "px", px, px_desc, -1, 1); - GEN_CASE_DATA(true, "py", py, py_desc, -1, 1); - if (nullptr != opt_boundary) { - GEN_CASE_DATA_REAL(true, "opt_boundary", opt_boundary, opt_boundary_desc); - } - GEN_CASE_DATA(true, "p", p, p_desc, -1, 1); - GEN_CASE_DATA(true, "ans_grad", ans_grad, ans_grad_desc, -1, 1); - GEN_CASE_DATA(false, "ans_grad", ans_grad, ans_grad_desc, -1, 1); - GEN_CASE_DATA(false, "px_grad", px_grad, px_grad_desc, -1, 1); - GEN_CASE_DATA(false, "py_grad", py_grad, py_grad_desc, -1, 1); - - GEN_CASE_OP_PARAM_SINGLE(0, "mutual_information_backward", - "overwrite_ans_grad", overwrite_ans_grad); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); -} - -static void policyFunc3Pipeline(const mluOpHandle_t handle, cnrtDim3_t *k_dim, - cnrtFunctionType_t *k_type, int batch_size) { - int core_num = mluop::runtime::getClusterLimitCapability(handle) * - mluop::runtime::getCoreNumOfEachUnionCapability(handle); - *k_type = CNRT_FUNC_TYPE_BLOCK; - k_dim->x = 1; - k_dim->y = batch_size < core_num ? 
batch_size : core_num; - k_dim->z = 1; -} - -static mluOpStatus_t launchMutualInformationBackward3PipelineKernel( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const bool has_boundary, const void *opt_boundary, const void *p, - const bool overwrite_ans_grad, void *ans_grad, void *px_grad, - void *py_grad) { - const int B = px_desc->dims[0]; - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - policyFunc3Pipeline(handle, &k_dim, &k_type, B); - VLOG(5) << "Launch Kernel 3PipelineMutualInformationBackward<<>>"; - CHECK_RETURN( - "[MutualInformationBackward]", - kernel3PipelineMutualInformationBackward( - k_dim, k_type, handle->queue, B, S, T, px, py, has_boundary, - opt_boundary, p, overwrite_ans_grad, ans_grad, px_grad, py_grad)); - - return MLUOP_STATUS_SUCCESS; -} - -// Calculate computing diagonal number of partition mode for default kernel -static void calComputingDiags(const int S, const int T, - int64_t *computing_diag_num, int *s_block_size, - int *t_block_size, int *s_repeat, int *t_repeat, - int *s_remainder, int *t_remainder, - const int mode) { - // If has remainder part, rearrange block size to balance work load - s_repeat[mode] = S / s_block_size[mode]; - s_remainder[mode] = S % s_block_size[mode]; - if (s_remainder[mode] > 0) { - s_block_size[mode] = S / (s_repeat[mode] + 1); - s_repeat[mode] = S / s_block_size[mode]; - s_remainder[mode] = S % s_block_size[mode]; - } - - t_repeat[mode] = T / t_block_size[mode]; - t_remainder[mode] = T % t_block_size[mode]; - if (t_remainder[mode] > 0) { - t_block_size[mode] = T / (t_repeat[mode] + 1); - t_repeat[mode] = T / t_block_size[mode]; - t_remainder[mode] = T % t_block_size[mode]; - } - - // Accumulate all block's computing diagonal numbers - computing_diag_num[mode] = s_repeat[mode] * t_repeat[mode] * - (s_block_size[mode] + t_block_size[mode] - 1); - if (s_remainder[mode] > 0) { - computing_diag_num[mode] += - t_repeat[mode] * (t_block_size[mode] + s_remainder[mode] - 1); - } - - if (t_remainder[mode] > 0) { - computing_diag_num[mode] += - s_repeat[mode] * (s_block_size[mode] + t_remainder[mode] - 1); - } - - if (s_remainder[mode] > 0 && t_remainder[mode] > 0) { - computing_diag_num[mode] += s_remainder[mode] + t_remainder[mode] - 1; - } -} - -static void assignPartitionParams(const int *s_block_size, - const int *t_block_size, const int *s_repeat, - const int *t_repeat, const int *s_remainder, - const int *t_remainder, - int &final_s_block_size, - int &final_t_block_size, int &final_s_repeat, - int &final_t_repeat, int &final_s_remainder, - int &final_t_remainder, const int mode) { - final_s_block_size = s_block_size[mode]; - final_t_block_size = t_block_size[mode]; - final_s_repeat = s_repeat[mode]; - final_t_repeat = t_repeat[mode]; - final_s_remainder = s_remainder[mode]; - final_t_remainder = t_remainder[mode]; -} - -static void calDefaultPartition(const int S, const int T, const int N_size, - const int nram_size, int &job_diag_num, - int &final_s_block_size, - int &final_t_block_size, int &final_s_repeat, - int &final_t_repeat, int &final_s_remainder, - int &final_t_remainder) { - // Compute each partition's job diagonal number, - // and choose the partition method with the least job diagonal number: - // 1) all S and T, no partition, launch once in one batch; - // 2) S < max_N_size, compare with (S, t) and (S/2, t); - // 3) T < max_N_size, compare with (s, T) and (s, 
T/2); - // 4) both S and T > max_N_size, compare with (N, N), (S, t), (s, T), if - // exist; - if (S <= N_size && T <= N_size) { - // once can compute all SxT onchip - job_diag_num = 1; - final_s_block_size = S; - final_t_block_size = T; - final_s_repeat = 1; - final_t_repeat = 1; - final_s_remainder = 0; - final_t_remainder = 0; - return; - } else { - // Sum of each partition's number of computing diagonals - // at most 3 arrays of candidate partition mode - int mode; - int64_t computing_diag_num[3] = {0}; - int s_block_size[3] = {0}; - int t_block_size[3] = {0}; - int s_repeat[3] = {0}; - int t_repeat[3] = {0}; - int s_remainder[3] = {0}; - int t_remainder[3] = {0}; - - if (S <= N_size && T > N_size) { - // compare with (S, t) and (S/2, t) - // 1) deal_s = S; min(s, t) = s; - mode = 0; - s_block_size[0] = S; - t_block_size[0] = (nram_size / sizeof(float) - 8 * s_block_size[0]) / - (4 * s_block_size[0] + 2); - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - // 2) deal_s = S/2; min(s, t) = s; - mode = 1; - s_block_size[1] = std::max(S / 2, 1); // at least 1 number in s_block - t_block_size[1] = (nram_size / sizeof(float) - 8 * s_block_size[1]) / - (4 * s_block_size[1] + 2); - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - - if (computing_diag_num[0] <= computing_diag_num[1]) { - assignPartitionParams( - s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, - t_remainder, final_s_block_size, final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, final_t_remainder, 0); - } else { - assignPartitionParams( - s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, - t_remainder, final_s_block_size, final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, final_t_remainder, 1); - } - } else if (S > N_size && T <= N_size) { - // compare with (s, T) and (s, T/2) - // 1) deal_t = T; min(s, t) = t; - mode = 0; - t_block_size[0] = T; - s_block_size[0] = (nram_size / sizeof(float) - 8 * t_block_size[0]) / - (4 * t_block_size[0] + 2); - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - // 2) deal_t = T/2; min(s, t) = t; - mode = 1; - t_block_size[1] = std::max(T / 2, 1); // at least 1 number in t_block - s_block_size[1] = (nram_size / sizeof(float) - 8 * t_block_size[1]) / - (4 * t_block_size[1] + 2); - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - - if (computing_diag_num[0] <= computing_diag_num[1]) { - assignPartitionParams( - s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, - t_remainder, final_s_block_size, final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, final_t_remainder, 0); - } else { - assignPartitionParams( - s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, - t_remainder, final_s_block_size, final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, final_t_remainder, 1); - } - } else { // S > N_size, T > N_size, choose between (N,N), (S,t), (s,T) - // 1) deal_s = deal_t = N_size; min(s,t) = s = t; - mode = 0; - s_block_size[0] = N_size; - t_block_size[0] = N_size; - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - // 2) deal_s = S, deal_t = t; min(s,t) = t; - mode = 1; - s_block_size[1] 
= N_size; - t_block_size[1] = (nram_size / sizeof(float) - 2 * s_block_size[1]) / - (4 * s_block_size[1] + 8); - if (t_block_size[1] > 0) { - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - } else { - computing_diag_num[1] = -1; // not support on this partition - } - // 3) deal_t = T, deal_s = s; min(s,t) = s; - mode = 2; - t_block_size[2] = T; - s_block_size[2] = (nram_size / sizeof(float) - 2 * t_block_size[2]) / - (4 * t_block_size[2] + 8); - if (s_block_size[2] > 0) { - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - } else { - computing_diag_num[2] = -1; // not support on this partition - } - - if (computing_diag_num[0] > 0 && // mode 0 is valid - ((computing_diag_num[1] <= 0) || // mode 1 is invalid or - computing_diag_num[0] <= - computing_diag_num[1])) { // mode 0 is better than mode 1 - if (computing_diag_num[2] > 0 && // mode 2 is valid and - computing_diag_num[2] < - computing_diag_num[0]) { // mode 2 is better than mode 0 - // choose mode 2 - assignPartitionParams(s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder, final_s_block_size, - final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, - final_t_remainder, 2); - } else { - // choose mode 0 - assignPartitionParams(s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder, final_s_block_size, - final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, - final_t_remainder, 0); - } - } else { // mode 1 is valid and mode 1 is better than mode 0 - if (computing_diag_num[2] > 0 && // mode 2 is valid - computing_diag_num[2] < - computing_diag_num[1]) { // mode 2 is better than mode 1 - // choose mode 2 - assignPartitionParams(s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder, final_s_block_size, - final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, - final_t_remainder, 2); - } else { - // choose mode 1 - assignPartitionParams(s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder, final_s_block_size, - final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, - final_t_remainder, 1); - } - } - } - // total job diagonal number in parallel - job_diag_num = final_s_repeat + (int)(final_s_remainder > 0) + - final_t_repeat + (int)(final_t_remainder > 0) - 1; - } -} - -static mluOpStatus_t launchMutualInformationBackwardDefaultKernel( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const bool has_boundary, const void *opt_boundary, const void *p, - const bool overwrite_ans_grad, void *ans_grad, void *px_grad, void *py_grad, - void *p_grad) { - // At first, use Fill Op to set px_grad, py_grad to all 0 - VLOG(5) << API_NAME << " cnnlFill_v3 start."; - uint64_t fill_value = 0x0; - if (mluOpGetTensorElementNum(px_desc) > 0) { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(px_desc, cnnl_output_desc); - CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, px_grad)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - if (mluOpGetTensorElementNum(py_desc) > 0) { - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(py_desc, cnnl_output_desc); - 
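// Zero-fill py_grad (as done for px_grad just above) before launching: tiles
// the partitioned kernel never writes -- empty jobs, and batches whose
// boundary is invalid -- are then already holding the correct all-zero
// gradient.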
CALL_CNNL(cnnlFill_v3(cnnl_handle, CNNL_POINTER_MODE_HOST, &fill_value, - cnnl_output_desc, py_grad)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } - VLOG(5) << API_NAME << " cnnlFill_v3 end."; - - // When S and T is too large, launch default kernel with partition of S and T - // 1. Compute current arch max N size, according to NRAM size and device RAM - // 2. Use max_N_size to calculate different partition mode computing diagonal - // numbers and choose the partition mode, which has the least computing - // diagonal number - // 3. Launch default kernels by diagonal in parallel, with check of MaxDimX - - const int B = px_desc->dims[0]; - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - // 1. According to on-chip RAM size, calculate current arch partition block - // size by square, Use max_N_size to partition on S and T dimension RAM space: - // 2*S*T + 2*(S+1)*(T+1) + 2*min(S,T) + 4*min(S,T)+1 - int max_N_size = (int)(std::sqrt(handle->nram_size / sizeof(float) / 4)) - 2; - // Use max square size N, partition on T and S dimension, launch by diagonal: - // -|------T--------| - // :| N1| N2| N3| N4| - // :|---|---|---|---| - // S| N2| N3| N4| N5| - // :|---|---|---|---| - // :| N3| N4| N5| N6| - // -|---------------| - - VLOG(5) << "Current arch Max square N size is " << max_N_size; - - int job_diag_num; // number of default kernel launch steps by diagonal - int s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, t_remainder; - - // 2. Choose the partition mode, which has the least computing diagonal number - // NOTE: p_grad has dimension (S+1, T+1), in function directly use (S, T) - // instead - calDefaultPartition(S + 1, T + 1, max_N_size, handle->nram_size, job_diag_num, - s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder); - int s_block_num = s_repeat + (int)(s_remainder > 0); - int t_block_num = t_repeat + (int)(t_remainder > 0); - int max_s_t_block_num = std::max(s_block_num, t_block_num); - int min_s_t_block_num = std::min(s_block_num, t_block_num); - - k_type = CNRT_FUNC_TYPE_BLOCK; - k_dim.y = 1; - k_dim.z = 1; - // Get current arch support max dim_x value - int task_dim_x_limit; - cnDeviceGetAttribute(&task_dim_x_limit, - CN_DEVICE_ATTRIBUTE_MAX_BLOCK_TASK_DIM_X, - handle->device); - VLOG(5) << "Current arch MAX_BLOCK_TASK_DIM_X is " << task_dim_x_limit; - - // 3. Traverse step_i from 0 to (job_diag_num - 1) - for (int step_i = 0; step_i < job_diag_num; step_i++) { - int job_num_on_step = B * (step_i < max_s_t_block_num - ? 
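// Wavefront schedule: on diagonal step_i the number of live (s, t) tiles
// grows by one per step, saturates at min(s_block_num, t_block_num), then
// shrinks toward the opposite corner; every live tile runs for all B batches
// at once, hence the factor of B: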
std::min(step_i + 1, min_s_t_block_num) - : s_block_num + t_block_num - step_i - 1); - k_dim.x = job_num_on_step; - // Make sure not exceed max dim x limit - if (k_dim.x > task_dim_x_limit) { - int task_dim_change = (k_dim.x + task_dim_x_limit - 1) / task_dim_x_limit; - k_dim.x = (k_dim.x + task_dim_x_limit - 1) / task_dim_change; - k_dim.y = k_dim.y * task_dim_change; - } - - VLOG(5) << "Launch Kernel DefaultMutualInformationBackward<<< step " - << step_i << " of Batch Block: " << k_dim.x << ", " << k_dim.y - << ", " << k_dim.z << ">>>"; - CHECK_RETURN("[MutualInformationBackward]", - kernelDefaultMutualInformationBackward( - k_dim, k_type, handle->queue, B, S, T, step_i, - job_num_on_step, s_block_num, t_block_num, s_block_size, - t_block_size, px, py, has_boundary, opt_boundary, p, - overwrite_ans_grad, ans_grad, px_grad, py_grad, p_grad)); - } - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpMutualInformationBackward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const mluOpTensorDescriptor_t opt_boundary_desc, const void *opt_boundary, - const mluOpTensorDescriptor_t p_desc, const void *p, - const mluOpTensorDescriptor_t ans_grad_desc, void *ans_grad, - const bool overwrite_ans_grad, void *workspace, const size_t workspace_size, - const mluOpTensorDescriptor_t px_grad_desc, void *px_grad, - const mluOpTensorDescriptor_t py_grad_desc, void *py_grad) { - // 1. Paramcheck - bool has_boundary = false; - bool zero_element = false; - mluOpStatus_t check_status = mutualInformationBackwardParamCheck( - handle, px_desc, px, py_desc, py, opt_boundary_desc, opt_boundary, p_desc, - p, ans_grad_desc, ans_grad, workspace, workspace_size, px_grad_desc, - px_grad, py_grad_desc, py_grad, has_boundary, zero_element); - - if (MLUOP_STATUS_SUCCESS != check_status || zero_element) { - return check_status; - } - - // 2. Generate case - if (MLUOP_GEN_CASE_ON_NEW) { - mutualInformationBackwardGencase( - handle, px_desc, px, py_desc, py, opt_boundary_desc, opt_boundary, - p_desc, p, ans_grad_desc, ans_grad, overwrite_ans_grad, px_grad_desc, - px_grad, py_grad_desc, py_grad); - } - - // Choose to launch 3pipeline or default kernel - const int B = px_desc->dims[0]; - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - - bool is_launch_3pipeline = true; - // check 3pipeline scale limit for computing term1 and term2 - int current_size = T * (S + 1) + (T + 1) * S + 5 * (T + 1); - if (current_size > handle->nram_size / sizeof(float)) { - is_launch_3pipeline = false; - } - - // check 3pipeline scale limit for computing p_grad - current_size = - T * (S + 1) + (T + 1) * S + (T + 1) * (S + 1) + 3 * std::min(S, T) + 4; - if (current_size > handle->nram_size / sizeof(float)) { - is_launch_3pipeline = false; - } - - // 3. 
launch kernel - mluOpStatus_t return_status; - if (is_launch_3pipeline) { - // launch 3pipeline kernel when satisfy scale limit - return_status = launchMutualInformationBackward3PipelineKernel( - handle, px_desc, px, py_desc, py, has_boundary, opt_boundary, p, - overwrite_ans_grad, ans_grad, px_grad, py_grad); - } else { - // launch default kernel, workspace is for p_grad - return_status = launchMutualInformationBackwardDefaultKernel( - handle, px_desc, px, py_desc, py, has_boundary, opt_boundary, p, - overwrite_ans_grad, ans_grad, px_grad, py_grad, workspace); - } - - GEN_CASE_END(); - return return_status; -} diff --git a/kernels/mutual_information_backward/mutual_information_backward.h b/kernels/mutual_information_backward/mutual_information_backward.h deleted file mode 100644 index 5a4a477e6..000000000 --- a/kernels/mutual_information_backward/mutual_information_backward.h +++ /dev/null @@ -1,45 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
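The dispatch above reduces to two capacity tests against NRAM. A self-contained sketch of the same arithmetic, with an assumed 384 KB NRAM budget standing in for handle->nram_size:

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  const int S = 512, T = 512;  // example sequence lengths
  const int nram_floats = static_cast<int>(384 * 1024 / sizeof(float));
  // 3-pipeline fits only if term1/term2 plus the working diagonals fit at
  // once -- the two checks mirror the ones in mluOpMutualInformationBackward.
  const bool pipeline3 =
      T * (S + 1) + (T + 1) * S + 5 * (T + 1) <= nram_floats &&
      T * (S + 1) + (T + 1) * S + (T + 1) * (S + 1) + 3 * std::min(S, T) + 4 <=
          nram_floats;
  // Otherwise the default kernel tiles (S+1) x (T+1) into N x N squares:
  // 2*S*T + 2*(S+1)*(T+1) + 6*min(S,T) + 1 ~= 4*N^2 floats per tile, so
  // N ~= sqrt(nram_floats / 4) - 2. A tile of s x t elements is then swept in
  // s + t - 1 anti-diagonal steps, the cost calComputingDiags accumulates
  // when ranking candidate tilings.
  const int max_N = static_cast<int>(std::sqrt(nram_floats / 4.0)) - 2;
  printf("3pipeline=%d max_N=%d\n", pipeline3 ? 1 : 0, max_N);
  return 0;
}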
- *************************************************************************/ -#ifndef KERNELS_MUTUAL_INFORMATION_BACKWARD_MUTUAL_INFORMATION_BACKWARD_H_ -#define KERNELS_MUTUAL_INFORMATION_BACKWARD_MUTUAL_INFORMATION_BACKWARD_H_ - -#include "mlu_op.h" -#include "kernels/kernel.h" - -mluOpStatus_t MLUOP_WIN_API kernel3PipelineMutualInformationBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const int B, - const int S, const int T, const void *px, const void *py, - const bool has_boundary, const void *opt_boundary, const void *p, - const bool overwrite_ans_grad, void *ans_grad, void *px_grad, - void *py_grad); - -mluOpStatus_t MLUOP_WIN_API kernelDefaultMutualInformationBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const int B, - const int S, const int T, const int step_i, const int job_num_on_step, - const int s_block_num, const int t_block_num, const int s_block_size, - const int t_block_size, const void *px, const void *py, - const bool has_boundary, const void *opt_boundary, const void *p, - const bool overwrite_ans_grad, void *ans_grad, void *px_grad, void *py_grad, - void *p_grad); - -#endif // KERNELS_MUTUAL_INFORMATION_BACKWARD_MUTUAL_INFORMATION_BACKWARD_H_ diff --git a/kernels/mutual_information_backward/mutual_information_backward_3pipeline_block.mlu b/kernels/mutual_information_backward/mutual_information_backward_3pipeline_block.mlu deleted file mode 100644 index b55829d11..000000000 --- a/kernels/mutual_information_backward/mutual_information_backward_3pipeline_block.mlu +++ /dev/null @@ -1,289 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "mutual_information_backward.h" - -#include "core/logging.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" -#include "kernels/mutual_information_backward/mutual_information_backward_utils.h" - -__mlu_func__ void computeTerm1AndTerm2(const int b, const int S, const int T, - const int s_begin, const int s_end, - const int t_begin, const int t_end, - const float *px, const float *py, - const float *p) { - /* *********************nram space split********************** */ - /* | term1 | term2 | cur_p | next_p | large_neg | mask |*/ - /* | S*(T+1) | (S+1)*T | t_len | t_len | 2*t_len | t_len |*/ - float *nram_term1 = (float *)nram_buffer; - float *nram_term2 = nram_term1 + S * (T + 1); - float *nram_cur_p = nram_term2 + (S + 1) * T; - - int t_len = t_end - t_begin + 1; - - float *nram_next_p = nram_cur_p + t_len; - float *nram_large_neg = nram_next_p + t_len; - float *nram_mask = nram_large_neg + 2 * t_len; - - __bang_write_value(nram_large_neg, 2 * t_len, (float)-1.0e+30); - - for (int i = s_begin; i < s_end; ++i) { - // load p to cur_p and next_p - __memcpy(nram_cur_p, p + b * (S + 1) * (T + 1) + i * (T + 1) + t_begin, - t_len * sizeof(float), GDRAM2NRAM, t_len * sizeof(float), - (T + 1) * sizeof(float), 1); - __bang_nan_maximum(nram_cur_p, nram_cur_p, nram_large_neg, 2 * t_len); - - // load px to term1 - __memcpy(nram_term1 + i * (T + 1) + t_begin, - px + b * S * (T + 1) + i * (T + 1) + t_begin, - t_len * sizeof(float), GDRAM2NRAM); - __bang_fusion(FUSION_FAS, nram_term1 + i * (T + 1) + t_begin, - nram_term1 + i * (T + 1) + t_begin, nram_cur_p, nram_next_p, - t_len, t_len); - safeExp(nram_term1 + i * (T + 1) + t_begin, - nram_term1 + i * (T + 1) + t_begin, nram_mask, t_len); - - if (t_len > 1) { - // load py to term2 - __memcpy(nram_term2 + i * T + t_begin, - py + b * (S + 1) * T + i * T + t_begin, - (t_len - 1) * sizeof(float), GDRAM2NRAM); - __bang_fusion(FUSION_FAS, nram_term2 + i * T + t_begin, - nram_term2 + i * T + t_begin, nram_cur_p, nram_cur_p + 1, - t_len - 1, t_len - 1); - safeExp(nram_term2 + i * T + t_begin, nram_term2 + i * T + t_begin, - nram_mask, t_len - 1); - } - } - - if (t_len > 1) { - if (s_begin == s_end) { - // load p to next_p - __memcpy(nram_next_p, - p + b * (S + 1) * (T + 1) + s_end * (T + 1) + t_begin, - t_len * sizeof(float), GDRAM2NRAM); - __bang_nan_maximum(nram_next_p, nram_next_p, nram_large_neg, t_len); - } - // compute term2[s_end][:] - __memcpy(nram_term2 + s_end * T + t_begin, - py + b * (S + 1) * T + s_end * T + t_begin, - (t_len - 1) * sizeof(float), GDRAM2NRAM); - __bang_fusion(FUSION_FAS, nram_term2 + s_end * T + t_begin, - nram_term2 + s_end * T + t_begin, nram_next_p, - nram_next_p + 1, t_len - 1, t_len - 1); - safeExp(nram_term2 + s_end * T + t_begin, nram_term2 + s_end * T + t_begin, - nram_mask, t_len - 1); - } -} - -__mlu_func__ void computePGrad(const int b, const int S, const int T, - const int s_begin, const int s_end, - const int t_begin, const int t_end, - const bool overwrite_ans_grad, float *ans_grad) { - /* ***************************nram space split*************************** */ - /* | term1 | term2 | p_grad | cur_term1|zero|cur_term2|cur_p_grad| */ - /* | S*(T+1) | (S+1)*T |(S+1)*(T+1)| min_len | 1 | min_len | min_len | */ - float *nram_term1 = (float *)nram_buffer; - float *nram_term2 = nram_term1 + S * (T + 1); - float *nram_p_grad = nram_term2 + (S + 1) * T; - float *nram_cur_term1 = nram_p_grad + (S + 1) * (T + 1); - - int s_len = 
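// p_grad has no dependencies along an anti-diagonal: p_grad[s][t] needs only
// p_grad[s+1][t] (via term1) and p_grad[s][t+1] (via term2). The code below
// therefore seeds the corner p_grad[s_end][t_end] = ans_grad[b] and walks the
// tile one anti-diagonal at a time, vectorizing over the data_num elements of
// each diagonal: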
s_end - s_begin + 1; - int t_len = t_end - t_begin + 1; - int max_len = __mluop_max(s_len, t_len); - int min_len = __mluop_min(s_len, t_len); - - float *nram_cur_term2 = nram_cur_term1 + min_len + 1; - float *nram_cur_p_grad = nram_cur_term2 + min_len; - __bang_write_zero(nram_cur_term1, 3 * min_len + 1); - - // compute the last one: p_grad[b][s_end][t_end] = ans_grad[b] - __memcpy_async(nram_p_grad + s_end * (T + 1) + t_end, ans_grad + b, - sizeof(float), GDRAM2NRAM); - __sync(); - nram_cur_p_grad[0] = nram_p_grad[s_end * (T + 1) + t_end]; - - int data_num = 0; - int s = 0; - int t = 0; - int term2_s = 0; - int term2_t = 0; - int term1_num = 0; - int term2_num = 0; - float *nram_p_grad_for_compute_term1 = nram_cur_p_grad; - float *nram_compute_term2 = nram_cur_term2; - - int loop_time = s_len + t_len - 1; - for (int i = 1; i < loop_time; ++i) { - data_num = i < max_len ? __mluop_min(i + 1, min_len) : loop_time - i; - s = i < s_len ? s_end - i : s_begin; - t = i < s_len ? t_end : t_end + s_len - i - 1; - - term1_num = i < t_len ? data_num - 1 : data_num; - if (term1_num > 0) { - __memcpy(nram_cur_term1, nram_term1 + s * (T + 1) + t, sizeof(float), - NRAM2NRAM, sizeof(float), T * sizeof(float), term1_num - 1); - nram_p_grad_for_compute_term1 = - i >= s_len ? nram_cur_p_grad + 1 : nram_cur_p_grad; - __bang_mul(nram_cur_term1, nram_cur_term1, nram_p_grad_for_compute_term1, - term1_num); - } - - term2_num = data_num; - nram_compute_term2 = nram_cur_term2; - term2_s = s; - term2_t = t; - if (i < s_len) { - term2_num -= 1; - nram_compute_term2 -= 1; - term2_s += 1; - term2_t -= 1; - } - if (term2_num > 0) { - __memcpy(nram_cur_term2, nram_term2 + term2_s * T + term2_t, - sizeof(float), NRAM2NRAM, sizeof(float), (T - 1) * sizeof(float), - term2_num - 1); - __bang_mul(nram_cur_term2, nram_cur_term2, nram_cur_p_grad, term2_num); - } - - __bang_add(nram_cur_p_grad, nram_cur_term1, nram_compute_term2, data_num); - __memcpy(nram_p_grad + s * (T + 1) + t, nram_cur_p_grad, sizeof(float), - NRAM2NRAM, T * sizeof(float), sizeof(float), data_num - 1); - } - - if (overwrite_ans_grad) { - __memcpy(ans_grad + b, nram_p_grad + s_begin * (T + 1) + t_begin, - sizeof(float), NRAM2GDRAM); - } -} - -__mlu_func__ void computePxGradAndPyGrad(const int b, const int S, const int T, - const int s_begin, const int s_end, - const int t_begin, const int t_end, - float *px_grad, float *py_grad) { - /* ***********nram space split********** */ - /* | term1 | term2 | p_grad | */ - /* | S*(T+1) | (S+1)*T | (S+1)*(T+1) | */ - float *nram_term1 = (float *)nram_buffer; - float *nram_term2 = nram_term1 + S * (T + 1); - float *nram_p_grad = nram_term2 + (S + 1) * T; - - int t_len = t_end - t_begin + 1; - - for (int i = s_begin; i < s_end; ++i) { - // compute term1 - __bang_mul(nram_term1 + i * (T + 1) + t_begin, - nram_term1 + i * (T + 1) + t_begin, - nram_p_grad + (i + 1) * (T + 1) + t_begin, t_len); - - if (t_len > 1) { - // compute term2 - __bang_mul(nram_term2 + i * T + t_begin, nram_term2 + i * T + t_begin, - nram_p_grad + i * (T + 1) + t_begin + 1, t_len - 1); - } - } - - if (t_len > 1) { - // compute term2[s_end][:] - __bang_mul(nram_term2 + s_end * T + t_begin, - nram_term2 + s_end * T + t_begin, - nram_p_grad + s_end * (T + 1) + t_begin + 1, t_len - 1); - } - - if (S > 0) { - __memcpy(px_grad + b * S * (T + 1), nram_term1, S * (T + 1) * sizeof(float), - NRAM2GDRAM); - } - if (T > 0) { - __memcpy(py_grad + b * (S + 1) * T, nram_term2, (S + 1) * T * sizeof(float), - NRAM2GDRAM); - } -} - -__mlu_global__ void 
mluBlock3PipelineMutualInformationBackward( - const int B, const int S, const int T, const float *px, const float *py, - const bool has_boundary, const int64_t *opt_boundary, const float *p, - const bool overwrite_ans_grad, float *ans_grad, float *px_grad, - float *py_grad) { - const int num_per_core = B / taskDim; - const int num_rem = B % taskDim; - const int num_cur_core = num_per_core + (taskId < num_rem); - const int b_offset = taskId * num_cur_core + (taskId >= num_rem) * num_rem; - - int s_begin = 0; - int t_begin = 0; - int s_end = S; - int t_end = T; - if (has_boundary) { - int64_t *boundary = (int64_t *)nram_buffer; - for (int b = b_offset; b < b_offset + num_cur_core; ++b) { - __memcpy(boundary, opt_boundary + 4 * b, 4 * sizeof(int64_t), GDRAM2NRAM); - s_begin = boundary[0]; - t_begin = boundary[1]; - s_end = boundary[2]; - t_end = boundary[3]; - __bang_write_zero((float *)nram_buffer, S * (T + 1) + (S + 1) * T); - - if (s_begin > s_end || t_begin > t_end) { - if (S > 0) { - __memcpy(px_grad + b * S * (T + 1), (float *)nram_buffer, - S * (T + 1) * sizeof(float), NRAM2GDRAM); - } - if (T > 0) { - __memcpy(py_grad + b * (S + 1) * T, - (float *)nram_buffer + S * (T + 1), - (S + 1) * T * sizeof(float), NRAM2GDRAM); - } - continue; - } - computeTerm1AndTerm2(b, S, T, s_begin, s_end, t_begin, t_end, px, py, p); - computePGrad(b, S, T, s_begin, s_end, t_begin, t_end, overwrite_ans_grad, - ans_grad); - computePxGradAndPyGrad(b, S, T, s_begin, s_end, t_begin, t_end, px_grad, - py_grad); - } - } else { - for (int b = b_offset; b < b_offset + num_cur_core; ++b) { - computeTerm1AndTerm2(b, S, T, s_begin, s_end, t_begin, t_end, px, py, p); - computePGrad(b, S, T, s_begin, s_end, t_begin, t_end, overwrite_ans_grad, - ans_grad); - computePxGradAndPyGrad(b, S, T, s_begin, s_end, t_begin, t_end, px_grad, - py_grad); - } - } -} - -mluOpStatus_t MLUOP_WIN_API kernel3PipelineMutualInformationBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const int B, - const int S, const int T, const void *px, const void *py, - const bool has_boundary, const void *opt_boundary, const void *p, - const bool overwrite_ans_grad, void *ans_grad, void *px_grad, - void *py_grad) { - KERNEL_CHECK( - mluBlock3PipelineMutualInformationBackward<<>>( - B, S, T, (float *)px, (float *)py, has_boundary, - (int64_t *)opt_boundary, (float *)p, overwrite_ans_grad, - (float *)ans_grad, (float *)px_grad, (float *)py_grad)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/mutual_information_backward/mutual_information_backward_default_block.mlu b/kernels/mutual_information_backward/mutual_information_backward_default_block.mlu deleted file mode 100644 index 3f3a5e3f2..000000000 --- a/kernels/mutual_information_backward/mutual_information_backward_default_block.mlu +++ /dev/null @@ -1,455 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
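Before the default (tiled) kernel that follows, it is worth spelling out the recurrence both kernels implement. A scalar reference for one batch over the full [0,S]x[0,T] range (illustrative names; opt_boundary handling, the large-negative clamp on p, and safeExp's NaN guards are omitted):

#include <cmath>
#include <vector>

// term1[s][t] = exp(px[s][t] + p[s][t] - p[s+1][t])  for s in [0,S), t in [0,T]
// term2[s][t] = exp(py[s][t] + p[s][t] - p[s][t+1])  for s in [0,S], t in [0,T)
// Seed p_grad[S][T] = ans_grad, then walking s and t downward:
//   px_grad[s][t] = term1[s][t] * p_grad[s+1][t]
//   py_grad[s][t] = term2[s][t] * p_grad[s][t+1]
//   p_grad[s][t]  = px_grad[s][t] + py_grad[s][t]
// The caller pre-sizes px_grad to [S][T+1] and py_grad to [S+1][T].
void mutualInformationBackwardRef(
    int S, int T,
    const std::vector<std::vector<float>>& px,   // [S][T+1]
    const std::vector<std::vector<float>>& py,   // [S+1][T]
    const std::vector<std::vector<float>>& p,    // [S+1][T+1]
    float ans_grad,
    std::vector<std::vector<float>>& px_grad,
    std::vector<std::vector<float>>& py_grad) {
  std::vector<std::vector<float>> p_grad(S + 1, std::vector<float>(T + 1, 0.f));
  p_grad[S][T] = ans_grad;
  for (int s = S; s >= 0; --s) {
    for (int t = T; t >= 0; --t) {
      if (s == S && t == T) continue;  // seeded corner
      float g = 0.f;
      if (s < S) {  // term1 path: gradient flowing up from row s+1
        const float term1 = std::exp(px[s][t] + p[s][t] - p[s + 1][t]);
        px_grad[s][t] = term1 * p_grad[s + 1][t];
        g += px_grad[s][t];
      }
      if (t < T) {  // term2 path: gradient flowing left from column t+1
        const float term2 = std::exp(py[s][t] + p[s][t] - p[s][t + 1]);
        py_grad[s][t] = term2 * p_grad[s][t + 1];
        g += py_grad[s][t];
      }
      p_grad[s][t] = g;
    }
  }
}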
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "mutual_information_backward.h" - -#include "core/logging.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" -#include "kernels/mutual_information_backward/mutual_information_backward_utils.h" - -__mlu_func__ bool calPartitionJobScope( - bool has_boundary, const int64_t *opt_boundary, const int B, const int S, - const int T, const int step_i, const int job_num_on_step, - const int s_block_num, const int t_block_num, const int s_block_size, - const int t_block_size, int &batch_idx, int &batch_s_begin, - int &batch_t_begin, int &batch_s_end, int &batch_t_end, int &cur_s_begin, - int &cur_t_begin, int &cur_s_end, int &cur_t_end, int &cur_s_size, - int &cur_t_size, bool &need_compute_ans_grad, bool overwrite_ans_grad, - float *px_grad, float *py_grad) { - int job_num_on_batch = job_num_on_step / B; // Each batch job num - batch_idx = taskId / job_num_on_batch; // Current job on which batch - int block_id_in_batch = - taskId - batch_idx * job_num_on_batch; // Current job id in batch - - // taskDim is not always job num, because of TASK_DIM_X limit - if (batch_idx >= B) { - return true; - } - - // Compute s and t block id in batch - int s_block_id, t_block_id; - s_block_id = __mluop_max(0, s_block_num - 1 - step_i) + block_id_in_batch; - t_block_id = - __mluop_min(t_block_num - 1, s_block_num + t_block_num - 2 - step_i) - - block_id_in_batch; - - // Compute current job id scope - cur_s_begin = s_block_id * s_block_size; - cur_t_begin = t_block_id * t_block_size; - cur_s_end = (s_block_id + 1) * s_block_size - 1; - cur_t_end = (t_block_id + 1) * t_block_size - 1; - - // Deal with boundary and decide current job if need to compute - if (has_boundary) { - int64_t *boundary = (int64_t *)nram_buffer; - __memcpy(boundary, opt_boundary + 4 * batch_idx, 4 * sizeof(int64_t), - GDRAM2NRAM); - batch_s_begin = boundary[0]; - batch_t_begin = boundary[1]; - batch_s_end = boundary[2]; - batch_t_end = boundary[3]; - // invalid boundary, already use cnnlFill to set px_grad and py_grad to 0 - if (batch_s_begin > batch_s_end || batch_t_begin > batch_t_end) { - return true; - } - } - - // Compare current job scope with batch scope, if empty job, return - if (cur_s_begin > batch_s_end || cur_t_begin > batch_t_end || - cur_s_end < batch_s_begin || cur_t_end < batch_t_begin) { - return true; - } - - // Reset s and t begin and end to valid 
boundary - if (cur_s_begin < batch_s_begin) { - cur_s_begin = batch_s_begin; - } - if (cur_t_begin < batch_t_begin) { - cur_t_begin = batch_t_begin; - } - if (cur_s_end > batch_s_end) { - cur_s_end = batch_s_end; - } - if (cur_t_end > batch_t_end) { - cur_t_end = batch_t_end; - } - - cur_s_size = cur_s_end - cur_s_begin + 1; - cur_t_size = cur_t_end - cur_t_begin + 1; - - // At last compute step and overwrite, need to memcpy back to ans_grad - if (overwrite_ans_grad && cur_s_begin == batch_s_begin && - cur_t_begin == batch_t_begin) { - need_compute_ans_grad = true; - } else { - need_compute_ans_grad = false; - } - - return false; -} - -__mlu_func__ void loadInit(const float *gdram_px, const float *gdram_py, - const float *gdram_p, float *gdram_p_grad, - float *nram_px, float *nram_py, float *nram_p, - float *nram_p_grad, const int S, const int T, - const int batch_s_end, const int batch_t_end, - const int cur_s_begin, const int cur_t_begin, - const int cur_s_end, const int cur_t_end, - const int cur_s_size, const int cur_t_size) { - // Load p(s, t) - __memcpy_async(nram_p, gdram_p + cur_s_begin * (T + 1) + cur_t_begin, - cur_t_size * sizeof(float), GDRAM2NRAM, - (cur_t_size + 1) * sizeof(float), (T + 1) * sizeof(float), - cur_s_size - 1); - - // Compare current s_end and batch_s_end to decide: - // load px or write -inf, load p or write large_neg, load p_grad or write 0 - if (cur_s_end < batch_s_end) { - // Load px(s, t) - __memcpy_async(nram_px, gdram_px + cur_s_begin * (T + 1) + cur_t_begin, - cur_t_size * sizeof(float), GDRAM2NRAM, - cur_t_size * sizeof(float), (T + 1) * sizeof(float), - cur_s_size - 1); - // Load p(s+1, t), one row - __memcpy_async(nram_p + cur_s_size * (cur_t_size + 1), - gdram_p + (cur_s_end + 1) * (T + 1) + cur_t_begin, - cur_t_size * sizeof(float), GDRAM2NRAM, 0, 0, 0); - // load p_grad(s+1, t), one row - __memcpy_async(nram_p_grad + cur_s_size * (cur_t_size + 1), - gdram_p_grad + (cur_s_end + 1) * (T + 1) + cur_t_begin, - cur_t_size * sizeof(float), GDRAM2NRAM, 0, 0, 0); - } else { // cur_s_end == batch_s_end, skip last row, write value - if (cur_s_size > 1) { - __memcpy_async(nram_px, gdram_px + cur_s_begin * (T + 1) + cur_t_begin, - cur_t_size * sizeof(float), GDRAM2NRAM, - cur_t_size * sizeof(float), (T + 1) * sizeof(float), - cur_s_size - 2); - } - // write -inf at px last row - __nramset_async(nram_px + (cur_s_size - 1) * cur_t_size, cur_t_size, - (float)(-INFINITY), 0, 0); - // write large_neg at p last row - __nramset_async(nram_p + cur_s_size * (cur_t_size + 1), cur_t_size, - (float)-1.0e+30, 0, 0); - // write 0 at p_grad last row - __nramset_async(nram_p_grad + cur_s_size * (cur_t_size + 1), cur_t_size, - (float)0.0, 0, 0); - } - - // Compare current t_end and batch_t_end to decide: - // load py or write -inf, load p or write large_neg, load p_grad or write 0 - if (cur_t_end < batch_t_end) { - // Load py(s, t) - __memcpy_async(nram_py, gdram_py + cur_s_begin * T + cur_t_begin, - cur_t_size * sizeof(float), GDRAM2NRAM, - cur_t_size * sizeof(float), T * sizeof(float), - cur_s_size - 1); - // Load p(s, t+1), one column - __memcpy_async(nram_p + cur_t_size, - gdram_p + cur_s_begin * (T + 1) + cur_t_end + 1, - sizeof(float), GDRAM2NRAM, (cur_t_size + 1) * sizeof(float), - (T + 1) * sizeof(float), cur_s_size - 1); - // Load p_grad(s, t+1), one column - __memcpy_async(nram_p_grad + cur_t_size, - gdram_p_grad + cur_s_begin * (T + 1) + cur_t_end + 1, - sizeof(float), GDRAM2NRAM, (cur_t_size + 1) * sizeof(float), - (T + 1) * sizeof(float), cur_s_size - 1); - } else 
{ // cur_t_end == batch_t_end, skip last column, write value - // Load py(s, t) - if (cur_t_size > 1) { - __memcpy_async(nram_py, gdram_py + cur_s_begin * T + cur_t_begin, - (cur_t_size - 1) * sizeof(float), GDRAM2NRAM, - cur_t_size * sizeof(float), T * sizeof(float), - cur_s_size - 1); - } - // write -inf at py last column - __nramset_async(nram_py + cur_t_size - 1, 1, (float)(-INFINITY), - cur_t_size * sizeof(float), cur_s_size - 1); - // write large_neg at p last column - __nramset_async(nram_p + cur_t_size, 1, (float)-1.0e+30, - (cur_t_size + 1) * sizeof(float), cur_s_size - 1); - // write 0 at p_grad last column - __nramset_async(nram_p_grad + cur_t_size, 1, (float)0.0, - (cur_t_size + 1) * sizeof(float), cur_s_size - 1); - } -} - -__mlu_func__ void computeByDiagonal( - float *nram_px, float *nram_py, float *nram_p, float *nram_p_grad, - float *nram_cur_px, float *nram_cur_py, float *nram_cur_p, - float *nram_next_p, float *nram_large_neg, float *nram_mask, - float *gdram_ans_grad, const int batch_s_end, const int batch_t_end, - const int cur_s_end, const int cur_t_end, const int cur_s_size, - const int cur_t_size) { - const int repeat = cur_s_size + cur_t_size - 1; - const int max_s_t = __mluop_max(cur_s_size, cur_t_size); - const int min_s_t = __mluop_min(cur_s_size, cur_t_size); - - for (int i = 0; i < repeat; ++i) { - int data_num = i < max_s_t ? __mluop_min(i + 1, min_s_t) - : cur_s_size + cur_t_size - i - 1; - - // px, py use same s, t index on nram, - int first_s = __mluop_max(0, cur_s_size - 1 - i); - int first_t = __mluop_min(cur_t_size - 1, cur_s_size + cur_t_size - 2 - i); - - // memcpy_async cur_px, cur_py, - // memcpy cur_p(same index, data_num), next_p(next index, data_num+1) - __memcpy(nram_cur_p, nram_p + first_s * (cur_t_size + 1) + first_t, - sizeof(float), NRAM2NRAM, sizeof(float), - cur_t_size * sizeof(float), data_num - 1); - __memcpy(nram_next_p, nram_p + first_s * (cur_t_size + 1) + first_t + 1, - sizeof(float), NRAM2NRAM, sizeof(float), - cur_t_size * sizeof(float), data_num); - __memcpy_async(nram_cur_px, nram_px + first_s * cur_t_size + first_t, - sizeof(float), NRAM2NRAM, sizeof(float), - (cur_t_size - 1) * sizeof(float), data_num - 1); - __memcpy_async(nram_cur_py, nram_py + first_s * cur_t_size + first_t, - sizeof(float), NRAM2NRAM, sizeof(float), - (cur_t_size - 1) * sizeof(float), data_num - 1); - - // make cur_p and next_p number < -1.0e+30 to -1.0e+30 - __bang_nan_maximum(nram_cur_p, nram_cur_p, nram_large_neg, data_num); - __bang_nan_maximum(nram_next_p, nram_next_p, nram_large_neg, data_num + 1); - - // sync for cur_px and cur_py - __sync(); - - // Compute term1 and term2, reuse cur_px, cur_py RAM - // cur_term1(s, t) = exp(cur_p(s, t) + cur_px(s, t) - next_p(s + 1, t)); - __bang_fusion(FUSION_FAS, nram_cur_px, nram_cur_px, nram_cur_p, - nram_next_p + 1, data_num, data_num); - // cur_term2(s, t) = exp(cur_p(s, t) + cur_py(s, t) - next_p(s, t + 1)); - __bang_fusion(FUSION_FAS, nram_cur_py, nram_cur_py, nram_cur_p, nram_next_p, - data_num, data_num); - - // sync for next_p - __sync(); - // memcpy_async next_p_grad to nram_next_p - __memcpy_async(nram_next_p, - nram_p_grad + first_s * (cur_t_size + 1) + first_t + 1, - sizeof(float), NRAM2NRAM, sizeof(float), - cur_t_size * sizeof(float), data_num); - - // safeExp for term1 and term2 - safeExp(nram_cur_px, nram_cur_px, nram_mask, data_num); - safeExp(nram_cur_py, nram_cur_py, nram_mask, data_num); - - // sync for next_p_grad - __sync(); - - // Compute px_grad and py_grad - // cur_px_grad = cur_term1 * 
next_p_grad(s + 1, t)
-      __bang_mul(nram_cur_px, nram_cur_px, nram_next_p + 1, data_num);
-      // cur_py_grad = cur_term2 * next_p_grad(s, t + 1)
-      __bang_mul(nram_cur_py, nram_cur_py, nram_next_p, data_num);
-
-      // sync for cur_px_grad and cur_py_grad
-      __sync();
-
-      // memcpy_async back to px_grad, py_grad
-      __memcpy_async(nram_px + first_s * cur_t_size + first_t, nram_cur_px,
-                     sizeof(float), NRAM2NRAM, (cur_t_size - 1) * sizeof(float),
-                     sizeof(float), data_num - 1);
-      __memcpy_async(nram_py + first_s * cur_t_size + first_t, nram_cur_py,
-                     sizeof(float), NRAM2NRAM, (cur_t_size - 1) * sizeof(float),
-                     sizeof(float), data_num - 1);
-
-      // Compute p_grad
-      if (cur_s_end == batch_s_end && cur_t_end == batch_t_end && i == 0) {
-        // step 0: initialize p_grad[s_end][t_end] = ans_grad[b]
-        __memcpy(nram_p_grad + first_s * (cur_t_size + 1) + first_t,
-                 gdram_ans_grad, sizeof(float), GDRAM2NRAM);
-      } else {
-        // otherwise, compute cur_p_grad:
-        // cur_p_grad(cur_p) = cur_px_grad + cur_py_grad
-        __bang_add(nram_cur_p, nram_cur_px, nram_cur_py, data_num);
-        // memcpy back to p_grad
-        __memcpy(nram_p_grad + first_s * (cur_t_size + 1) + first_t, nram_cur_p,
-                 sizeof(float), NRAM2NRAM, cur_t_size * sizeof(float),
-                 sizeof(float), data_num - 1);
-      }
-    }
-  }
-}
-
-__mlu_global__ void mluBlockDefaultMutualInformationBackward(
-    const int B, const int S, const int T, const int step_i,
-    const int job_num_on_step, const int s_block_num, const int t_block_num,
-    const int s_block_size, const int t_block_size, const float *px,
-    const float *py, const bool has_boundary, const int64_t *opt_boundary,
-    const float *p, const bool overwrite_ans_grad, float *ans_grad,
-    float *px_grad, float *py_grad, float *p_grad) {
-  /******************************** NRAM SPACE ******************************/
-  /* Load Init */
-  /*|---------------------------------------------------------------------|*/
-  /*| px,py | p, p_grad |large_neg | | | |*/
-  /*| 2*S*T |2*(S+1)*(T+1)| 2*min_len+1 | min_len | min_len | 2*min_len+1 |*/
-  /*|---------------------------------------------------------------------|*/
-  /* Compute term1 and term2 */
-  /*|------------------------------------------------------------------|*/
-  /*| px,py | p |large_neg,mask|cur_term1,2| cur_p | next_p |*/
-  /*| 2*S*T |2*(S+1)*(T+1)| 2*min_len+1 | 2*min_len |min_len|min_len+1|*/
-  /*|------------------------------------------------------------------|*/
-  /* Compute px_grad, py_grad, p_grad */
-  /*|------------------------------------------------------------------------|*/
-  /*|px/y_grad| p_grad | | cur_term1,2 |cur_p_grad|next_p_grad|*/
-  /*| | | |cur_px/y_grad| | |*/
-  /*| 2*S*T |2*(S+1)*(T+1)|2*min_len+1| 2*min_len | min_len | min_len+1 |*/
-  /*|------------------------------------------------------------------------|*/
-
-  // NOTE: s_block_size and t_block_size were computed on S + 1 and T + 1
-  int min_s_t_block_size = __mluop_min(s_block_size, t_block_size);
-
-  // px, term1, px_grad
-  float *nram_px_buf = (float *)nram_buffer;
-  // py, term2, py_grad
-  float *nram_py_buf = nram_px_buf + s_block_size * t_block_size;
-  // p block
-  float *nram_p = nram_py_buf + s_block_size * t_block_size;
-  // p_grad block
-  float *nram_p_grad = nram_p + (s_block_size + 1) * (t_block_size + 1);
-  // Initialized to -1.0e+30, used as the floor when taking the maximum with p
-  float *nram_large_neg = nram_p_grad + (s_block_size + 1) * (t_block_size + 1);
-  // mask
-  float *nram_mask = nram_large_neg + min_s_t_block_size + 1;
-  // cur_px, cur_term1, cur_px_grad
-  float *nram_cur_px_buf = nram_mask + min_s_t_block_size;
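-  // Size sketch (added note, not in the original source): with the pointers
-  // above and the three declared below, the NRAM budget is
-  //   2 * s_block_size * t_block_size                 (px/py, reused as grads)
-  //   + 2 * (s_block_size + 1) * (t_block_size + 1)   (p and p_grad)
-  //   + 6 * min_s_t_block_size + 2                    (large_neg, mask, cur/next scratch)
-  // floats in total, which the host-side partition is expected to keep
-  // within the NRAM capacity when it chooses the block sizes.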
-  // cur_py, cur_term2, cur_py_grad
-  float *nram_cur_py_buf = nram_cur_px_buf + min_s_t_block_size;
-  // cur_p, cur_p_grad
-  float *nram_cur_p = nram_cur_py_buf + min_s_t_block_size;
-  // next_p, next_p_grad
-  float *nram_next_p = nram_cur_p + min_s_t_block_size;
-
-  int batch_idx;
-  int batch_s_begin = 0;
-  int batch_t_begin = 0;
-  int batch_s_end = S;
-  int batch_t_end = T;
-  int cur_s_begin, cur_t_begin, cur_s_end, cur_t_end, cur_s_size, cur_t_size;
-  bool need_compute_ans_grad;
-
-  // According to has_boundary, calculate current job scope
-  bool need_return = calPartitionJobScope(
-      has_boundary, opt_boundary, B, S, T, step_i, job_num_on_step, s_block_num,
-      t_block_num, s_block_size, t_block_size, batch_idx, batch_s_begin,
-      batch_t_begin, batch_s_end, batch_t_end, cur_s_begin, cur_t_begin,
-      cur_s_end, cur_t_end, cur_s_size, cur_t_size, need_compute_ans_grad,
-      overwrite_ans_grad, px_grad, py_grad);
-  // taskDim.x may have been folded into taskDim.y to satisfy the dim-x limit,
-  // so taskDim can exceed the real job number and some tasks have nothing to
-  // compute
-  if (need_return) {
-    return;
-  }
-
-  // px_grad and px, py_grad and py, p_grad and p have the same shapes
-  const int px_one_batch_num = S * (T + 1);
-  const int py_one_batch_num = (S + 1) * T;
-  const int p_one_batch_num = (S + 1) * (T + 1);
-
-  const float *gdram_px = px + batch_idx * px_one_batch_num;
-  const float *gdram_py = py + batch_idx * py_one_batch_num;
-  const float *gdram_p = p + batch_idx * p_one_batch_num;
-
-  float *gdram_px_grad = px_grad + batch_idx * px_one_batch_num;
-  float *gdram_py_grad = py_grad + batch_idx * py_one_batch_num;
-  float *gdram_p_grad = p_grad + batch_idx * p_one_batch_num;
-  float *gdram_ans_grad = ans_grad + batch_idx;
-
-  const int min_s_t = __mluop_min(cur_s_size, cur_t_size);
-  // loadInit: load px, py, other block p,
-  // or write -inf at last row of px, last column of py,
-  // write large_neg at last row and column of p,
-  // load other block p_grad,
-  // or write 0 at last row/column of p_grad
-  loadInit(gdram_px, gdram_py, gdram_p, gdram_p_grad, nram_px_buf, nram_py_buf,
-           nram_p, nram_p_grad, S, T, batch_s_end, batch_t_end, cur_s_begin,
-           cur_t_begin, cur_s_end, cur_t_end, cur_s_size, cur_t_size);
-
-  // Initialize large_neg with value -1e+30
-  __nramset_async(nram_large_neg, min_s_t + 1, (float)-1.0e+30, 0, 0);
-  // sync for initialization async instructions
-  __sync();
-
-  // Compute term1, term2, p_grad, px_grad, py_grad
-  computeByDiagonal(nram_px_buf, nram_py_buf, nram_p, nram_p_grad,
-                    nram_cur_px_buf, nram_cur_py_buf, nram_cur_p, nram_next_p,
-                    nram_large_neg, nram_mask, gdram_ans_grad, batch_s_end,
-                    batch_t_end, cur_s_end, cur_t_end, cur_s_size, cur_t_size);
-
-  // Store:
-  // memcpy back p_grad(workspace)
-  __memcpy(gdram_p_grad + cur_s_begin * (T + 1) + cur_t_begin, nram_p_grad,
-           cur_t_size * sizeof(float), NRAM2GDRAM, (T + 1) * sizeof(float),
-           (cur_t_size + 1) * sizeof(float), cur_s_size - 1);
-  // memcpy back px_grad
-  if (cur_s_end < batch_s_end) {
-    // memcpy all px_grad data back
-    __memcpy(gdram_px_grad + cur_s_begin * (T + 1) + cur_t_begin, nram_px_buf,
-             cur_t_size * sizeof(float), NRAM2GDRAM, (T + 1) * sizeof(float),
-             cur_t_size * sizeof(float), cur_s_size - 1);
-  } else {
-    // memcpy px_grad data except last row
-    if (cur_s_size > 1) {
-      __memcpy(gdram_px_grad + cur_s_begin * (T + 1) + cur_t_begin, nram_px_buf,
-               cur_t_size * sizeof(float), NRAM2GDRAM, (T + 1) * sizeof(float),
-               cur_t_size * sizeof(float), cur_s_size - 2);
-    }
-  }
-  // memcpy back py_grad
-  if (cur_t_end < batch_t_end) {
-    // memcpy all py_grad data 
back - __memcpy(gdram_py_grad + cur_s_begin * T + cur_t_begin, nram_py_buf, - cur_t_size * sizeof(float), NRAM2GDRAM, T * sizeof(float), - cur_t_size * sizeof(float), cur_s_size - 1); - } else { - // memcpy py_grad data except last column - if (cur_t_size > 1) { - __memcpy(gdram_py_grad + cur_s_begin * T + cur_t_begin, nram_py_buf, - (cur_t_size - 1) * sizeof(float), NRAM2GDRAM, T * sizeof(float), - cur_t_size * sizeof(float), cur_s_size - 1); - } - } - - // If last compute step, need store p_grad[s_begin, t_begin] to ans_grad - if (need_compute_ans_grad) { - ans_grad[batch_idx] = nram_p_grad[0]; - } -} - -mluOpStatus_t MLUOP_WIN_API kernelDefaultMutualInformationBackward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const int B, - const int S, const int T, const int step_i, const int job_num_on_step, - const int s_block_num, const int t_block_num, const int s_block_size, - const int t_block_size, const void *px, const void *py, - const bool has_boundary, const void *opt_boundary, const void *p, - const bool overwrite_ans_grad, void *ans_grad, void *px_grad, void *py_grad, - void *p_grad) { - KERNEL_CHECK( - mluBlockDefaultMutualInformationBackward<<>>( - B, S, T, step_i, job_num_on_step, s_block_num, t_block_num, - s_block_size, t_block_size, (float *)px, (float *)py, has_boundary, - (int64_t *)opt_boundary, (float *)p, overwrite_ans_grad, - (float *)ans_grad, (float *)px_grad, (float *)py_grad, - (float *)p_grad)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/mutual_information_backward/mutual_information_backward_utils.h b/kernels/mutual_information_backward/mutual_information_backward_utils.h deleted file mode 100644 index 19bbe306c..000000000 --- a/kernels/mutual_information_backward/mutual_information_backward_utils.h +++ /dev/null @@ -1,49 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#ifndef KERNELS_MUTUAL_INFORMATION_BACKWARD_MUTUAL_INFORMATION_BACKWARD_UTILS_H_ -#define KERNELS_MUTUAL_INFORMATION_BACKWARD_MUTUAL_INFORMATION_BACKWARD_UTILS_H_ - -#include "mlu_op.h" - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -__mlu_func__ void setNanInfToZero(float *src, float *mask, const int num) { - // band with 0x7F800000, exp bits are not all 1, mask -> 0xffffffff - __asm__ volatile( - "fuse.nram.s32 [%[dst]], %[size], [%[src0]]," - ".and(%[src1]), .ne(%[src2]), .mul(%[src3]);\n" ::[dst] "r"( - (int32_t *)mask), - [ size ] "r"(num), [ src0 ] "r"((int32_t *)src), [ src1 ] "r"(0x7f800000), - [ src2 ] "r"(0x7f800000), [ src3 ] "r"(-1)); - __bang_band((char *)src, (char *)src, (char *)mask, num * sizeof(float)); -} - -__mlu_func__ void safeExp(float *dst, float *src, float *mask, const int num) { - setNanInfToZero(src, mask, num); - __mluop_exp(dst, src, NULL, 0, num); - // erase exp(0) to 0 with mask - __bang_band((char *)dst, (char *)dst, (char *)mask, num * sizeof(float)); - setNanInfToZero(dst, mask, num); -} - -#endif // KERNELS_MUTUAL_INFORMATION_BACKWARD_MUTUAL_INFORMATION_BACKWARD_UTILS_H_ // NOLINT diff --git a/kernels/mutual_information_forward/mutual_information_forward.cpp b/kernels/mutual_information_forward/mutual_information_forward.cpp deleted file mode 100644 index 8b50f6164..000000000 --- a/kernels/mutual_information_forward/mutual_information_forward.cpp +++ /dev/null @@ -1,741 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "mutual_information_forward.h" - -#include -#include - -#include "core/context.h" -#include "core/gen_case.h" -#include "core/logging.h" -#include "core/runtime/device.h" -#include "core/tensor.h" -#include "core/type.h" - -#define API_NAME "[mluOpMutualInformationForward]" - -mluOpStatus_t MLUOP_WIN_API mluOpGetMutualInformationForwardWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc, - const mluOpTensorDescriptor_t ans_desc, size_t *workspace_size) { - PARAM_CHECK(API_NAME, handle != nullptr); - PARAM_CHECK(API_NAME, px_desc != nullptr); - PARAM_CHECK(API_NAME, py_desc != nullptr); - PARAM_CHECK(API_NAME, p_desc != nullptr); - PARAM_CHECK(API_NAME, ans_desc != nullptr); - PARAM_CHECK(API_NAME, workspace_size != nullptr); - // Workspace is not required in the current implementation. - *workspace_size = 0; - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorDim( - const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc, - const mluOpTensorDescriptor_t ans_desc) { - if (3 != px_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of px must be 3. " - << "But now the dim of px is " << px_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (3 != py_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of py must be 3. " - << "But now the dim of py is " << py_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (nullptr != opt_boundary_desc && 2 != opt_boundary_desc->dim) { - LOG(ERROR) << API_NAME - << " The dim of opt_boundary must be 2 when opt_boundary is " - << "not NULL. But now the dim of opt_boundary is " - << opt_boundary_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (3 != p_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of p must be 3. " - << "But now the dim of p is " << p_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - if (1 != ans_desc->dim) { - LOG(ERROR) << API_NAME << " The dim of ans must be 1. " - << "But now the dim of ans is " << ans_desc->dim << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorShape( - const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc, - const mluOpTensorDescriptor_t ans_desc) { - const int B = px_desc->dims[0]; - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - if (B != py_desc->dims[0] || B != p_desc->dims[0] || B != ans_desc->dims[0]) { - LOG(ERROR) << API_NAME - << " px.shape[0], py.shape[0], p.shape[0], ans.shape[0], " - << "must be same. But now " - << "px.shape[0] is " << px_desc->dims[0] << ", py.shape[0] is " - << py_desc->dims[0] << ", p.shape[0] is " << p_desc->dims[0] - << ", ans.shape[0] is " << ans_desc->dims[0] << "."; - return MLUOP_STATUS_BAD_PARAM; - } - - // Currently only supports !modified, so the shape of px must be [B, S, T+1] - if (T + 1 != px_desc->dims[2]) { - LOG(ERROR) << API_NAME << " Currently only supports the case that " - << "px.shape[2] must be equal to py.shape[2] + 1. 
But now "
-               << "px.shape[2] is " << px_desc->dims[2] << ", py.shape[2] is "
-               << py_desc->dims[2] << ".";
-    return MLUOP_STATUS_NOT_SUPPORTED;
-  }
-
-  // The shape of py must be [B, S+1, T]
-  if (S + 1 != py_desc->dims[1]) {
-    LOG(ERROR) << API_NAME << " py.shape[1] must be equal to px.shape[1] + 1. "
-               << "But now px.shape[1] is " << px_desc->dims[1]
-               << ", py.shape[1] is " << py_desc->dims[1] << ".";
-    return MLUOP_STATUS_BAD_PARAM;
-  }
-
-  // The shape of opt_boundary must be [B, 4]
-  if (nullptr != opt_boundary_desc &&
-      (B != opt_boundary_desc->dims[0] || 4 != opt_boundary_desc->dims[1])) {
-    LOG(ERROR) << API_NAME << " When opt_boundary is not NULL, "
-               << "opt_boundary.shape[0] and px.shape[0] must be the same, and "
-               << "opt_boundary.shape[1] must be 4. But now "
-               << "px.shape[0] is " << px_desc->dims[0]
-               << ", opt_boundary.shape[0] is " << opt_boundary_desc->dims[0]
-               << ", opt_boundary.shape[1] is " << opt_boundary_desc->dims[1]
-               << ".";
-    return MLUOP_STATUS_BAD_PARAM;
-  }
-
-  // The shape of p must be [B, S+1, T+1]
-  if (S + 1 != p_desc->dims[1] || T + 1 != p_desc->dims[2]) {
-    LOG(ERROR) << API_NAME << " p.shape[1] and py.shape[1] must be the same, "
-               << "and p.shape[2] must be equal to py.shape[2] + 1. "
-               << "But now p.shape[1] is " << p_desc->dims[1]
-               << ", py.shape[1] is " << py_desc->dims[1] << ", p.shape[2] is "
-               << p_desc->dims[2] << ", py.shape[2] is " << py_desc->dims[2]
-               << ".";
-    return MLUOP_STATUS_BAD_PARAM;
-  }
-
-  return MLUOP_STATUS_SUCCESS;
-}
-
-static mluOpStatus_t checkTensorDatatype(
-    const mluOpTensorDescriptor_t px_desc,
-    const mluOpTensorDescriptor_t py_desc,
-    const mluOpTensorDescriptor_t opt_boundary_desc,
-    const mluOpTensorDescriptor_t p_desc,
-    const mluOpTensorDescriptor_t ans_desc) {
-  if (MLUOP_DTYPE_FLOAT != px_desc->dtype) {
-    LOG(ERROR) << API_NAME
-               << "The data type of px currently only supports float. But now "
-               << "the data type of px is "
-               << mluOpGetNameOfDataType(px_desc->dtype) << ".";
-    return MLUOP_STATUS_NOT_SUPPORTED;
-  }
-  if (MLUOP_DTYPE_FLOAT != py_desc->dtype) {
-    LOG(ERROR) << API_NAME
-               << "The data type of py currently only supports float. But now "
-               << "the data type of py is "
-               << mluOpGetNameOfDataType(py_desc->dtype) << ".";
-    return MLUOP_STATUS_NOT_SUPPORTED;
-  }
-  if (nullptr != opt_boundary_desc &&
-      MLUOP_DTYPE_INT64 != opt_boundary_desc->dtype) {
-    LOG(ERROR) << API_NAME
-               << "The data type of opt_boundary currently only supports int64."
-               << " But now the data type of opt_boundary is "
-               << mluOpGetNameOfDataType(opt_boundary_desc->dtype) << ".";
-    return MLUOP_STATUS_NOT_SUPPORTED;
-  }
-  if (MLUOP_DTYPE_FLOAT != p_desc->dtype) {
-    LOG(ERROR) << API_NAME
-               << "The data type of p currently only supports float. But now "
-               << "the data type of p is "
-               << mluOpGetNameOfDataType(p_desc->dtype) << ".";
-    return MLUOP_STATUS_NOT_SUPPORTED;
-  }
-  if (MLUOP_DTYPE_FLOAT != ans_desc->dtype) {
-    LOG(ERROR) << API_NAME
-               << "The data type of ans currently only supports float. 
" - << "But now the data type of ans is " - << mluOpGetNameOfDataType(ans_desc->dtype) << "."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorScaleLimit( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, - const mluOpTensorDescriptor_t py_desc, - const mluOpTensorDescriptor_t opt_boundary_desc, - const mluOpTensorDescriptor_t p_desc) { - // check large tensor - if (mluOpGetTensorElementNum(px_desc) >= LARGE_TENSOR_NUM || - mluOpGetTensorElementNum(py_desc) >= LARGE_TENSOR_NUM || - (nullptr != opt_boundary_desc && - mluOpGetTensorElementNum(opt_boundary_desc) >= LARGE_TENSOR_NUM) || - mluOpGetTensorElementNum(p_desc) >= LARGE_TENSOR_NUM) { - LOG(ERROR) << API_NAME << " Overflow max tensor num." - << " Current operator supports tensor num smaller than 2^31."; - return MLUOP_STATUS_NOT_SUPPORTED; - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t checkTensorPtr( - const void *px, const void *py, const void *p, const void *ans, - const mluOpTensorDescriptor_t opt_boundary_desc, const void *opt_boundary, - const int S, const int T, bool &has_boundary) { - if (S > 0) { - PARAM_CHECK(API_NAME, px != nullptr); - } else { - VLOG(5) << API_NAME << " px.shape[1] is zero."; - } - - if (T > 0) { - PARAM_CHECK(API_NAME, py != nullptr); - } else { - VLOG(5) << API_NAME << " py.shape[2] is zero."; - } - - PARAM_CHECK(API_NAME, p != nullptr); - PARAM_CHECK(API_NAME, ans != nullptr); - - if (nullptr != opt_boundary_desc && nullptr != opt_boundary) { - has_boundary = true; - VLOG(5) << API_NAME << " opt_boundary is not NULL."; - } else if (nullptr == opt_boundary_desc && nullptr == opt_boundary) { - has_boundary = false; - VLOG(5) << API_NAME << " opt_boundary is NULL."; - } else { - LOG(ERROR) << API_NAME - << " opt_boundary_desc and opt_boundary must both be NULL, " - << "or both not be NULL."; - return MLUOP_STATUS_BAD_PARAM; - } - - return MLUOP_STATUS_SUCCESS; -} - -static mluOpStatus_t mutualInformationForwardParamCheck( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const mluOpTensorDescriptor_t opt_boundary_desc, const void *opt_boundary, - const mluOpTensorDescriptor_t p_desc, const void *p, void *workspace, - const size_t workspace_size, const mluOpTensorDescriptor_t ans_desc, - void *ans, bool &has_boundary, bool &zero_element) { - // 1. check handle and tensor_desc - PARAM_CHECK(API_NAME, handle != nullptr); - PARAM_CHECK(API_NAME, px_desc != nullptr); - PARAM_CHECK(API_NAME, py_desc != nullptr); - PARAM_CHECK(API_NAME, p_desc != nullptr); - PARAM_CHECK(API_NAME, ans_desc != nullptr); - - // since the layout of all tensor is ARRAY, so skip check tensor layout - - // 2. check mlu platform - if (handle->arch < 372) { - LOG(ERROR) << API_NAME << " Only mlu300 and above devices are supported." - << " Please check the device version!"; - return MLUOP_STATUS_ARCH_MISMATCH; - } - - // 3. check tensor dim - mluOpStatus_t check_status = - checkTensorDim(px_desc, py_desc, opt_boundary_desc, p_desc, ans_desc); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - // 4. check tensor shape - check_status = - checkTensorShape(px_desc, py_desc, opt_boundary_desc, p_desc, ans_desc); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - // 5. 
check tensor dtype - check_status = checkTensorDatatype(px_desc, py_desc, opt_boundary_desc, - p_desc, ans_desc); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - // 6. check scale limit, for large tensor - check_status = checkTensorScaleLimit(handle, px_desc, py_desc, - opt_boundary_desc, p_desc); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - const int B = px_desc->dims[0]; - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - - // 7. check zero element. - if (0 == B) { - zero_element = true; - VLOG(5) << API_NAME - << " Skip zero element tensor when px.shape[0] is zero."; - return MLUOP_STATUS_SUCCESS; - } - - // 8 check workspace - if (workspace_size > 0) { - PARAM_CHECK(API_NAME, workspace != nullptr); - } - - // 9. check tensor ptr - check_status = checkTensorPtr(px, py, p, ans, opt_boundary_desc, opt_boundary, - S, T, has_boundary); - if (MLUOP_STATUS_SUCCESS != check_status) { - return check_status; - } - - return MLUOP_STATUS_SUCCESS; -} - -static void mutualInformationForwardGencase( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const mluOpTensorDescriptor_t opt_boundary_desc, const void *opt_boundary, - const mluOpTensorDescriptor_t p_desc, const void *p, - const mluOpTensorDescriptor_t ans_desc, void *ans) { - GEN_CASE_START("mutual_information_forward"); - GEN_CASE_HANDLE(handle); - - GEN_CASE_DATA(true, "px", px, px_desc, -1, 1); - GEN_CASE_DATA(true, "py", py, py_desc, -1, 1); - if (nullptr != opt_boundary) { - GEN_CASE_DATA_REAL(true, "opt_boundary", opt_boundary, opt_boundary_desc); - } - GEN_CASE_DATA(true, "p", p, p_desc, -1, 1); - GEN_CASE_DATA(false, "p", p, p_desc, -1, 1); - GEN_CASE_DATA(false, "ans", ans, ans_desc, -1, 1); - GEN_CASE_TEST_PARAM_NEW(true, true, false, 0.003, 0.003, 0); -} - -static void policyFunc3Pipeline(const mluOpHandle_t handle, cnrtDim3_t *k_dim, - cnrtFunctionType_t *k_type, int batch_size) { - int core_num = mluop::runtime::getClusterLimitCapability(handle) * - mluop::runtime::getCoreNumOfEachUnionCapability(handle); - *k_type = CNRT_FUNC_TYPE_BLOCK; - k_dim->x = 1; - k_dim->y = batch_size < core_num ? 
batch_size : core_num; - k_dim->z = 1; -} - -static mluOpStatus_t launchMutualInformationForward3PipelineKernel( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const bool has_boundary, const void *opt_boundary, void *p, void *ans) { - const int B = px_desc->dims[0]; - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - policyFunc3Pipeline(handle, &k_dim, &k_type, B); - VLOG(5) << "Launch Kernel 3PipelineMutualInformationForward<<>>"; - CHECK_RETURN("[MutualInformationForward]", - kernel3PipelineMutualInformationForward( - k_dim, k_type, handle->queue, B, S, T, px, py, has_boundary, - opt_boundary, p, ans)); - - return MLUOP_STATUS_SUCCESS; -} - -// Calculate computing diagonal number of partition mode for default kernel -static void calComputingDiags(const int S, const int T, - int64_t *computing_diag_num, int *s_block_size, - int *t_block_size, int *s_repeat, int *t_repeat, - int *s_remainder, int *t_remainder, - const int mode) { - // If has remainder part, rearrange block size to balance work load - s_repeat[mode] = S / s_block_size[mode]; - s_remainder[mode] = S % s_block_size[mode]; - if (s_remainder[mode] > 0) { - s_block_size[mode] = S / (s_repeat[mode] + 1); - s_repeat[mode] = S / s_block_size[mode]; - s_remainder[mode] = S % s_block_size[mode]; - } - - t_repeat[mode] = T / t_block_size[mode]; - t_remainder[mode] = T % t_block_size[mode]; - if (t_remainder[mode] > 0) { - t_block_size[mode] = T / (t_repeat[mode] + 1); - t_repeat[mode] = T / t_block_size[mode]; - t_remainder[mode] = T % t_block_size[mode]; - } - - // Accumulate all block's computing diagonal numbers - computing_diag_num[mode] = s_repeat[mode] * t_repeat[mode] * - (s_block_size[mode] + t_block_size[mode] - 1); - if (s_remainder[mode] > 0) { - computing_diag_num[mode] += - t_repeat[mode] * (t_block_size[mode] + s_remainder[mode] - 1); - } - - if (t_remainder[mode] > 0) { - computing_diag_num[mode] += - s_repeat[mode] * (s_block_size[mode] + t_remainder[mode] - 1); - } - - if (s_remainder[mode] > 0 && t_remainder[mode] > 0) { - computing_diag_num[mode] += s_remainder[mode] + t_remainder[mode] - 1; - } -} - -static void assignPartitionParams(const int *s_block_size, - const int *t_block_size, const int *s_repeat, - const int *t_repeat, const int *s_remainder, - const int *t_remainder, - int &final_s_block_size, - int &final_t_block_size, int &final_s_repeat, - int &final_t_repeat, int &final_s_remainder, - int &final_t_remainder, const int mode) { - final_s_block_size = s_block_size[mode]; - final_t_block_size = t_block_size[mode]; - final_s_repeat = s_repeat[mode]; - final_t_repeat = t_repeat[mode]; - final_s_remainder = s_remainder[mode]; - final_t_remainder = t_remainder[mode]; -} - -static void calDefaultPartition(const int S, const int T, const int N_size, - const int nram_size, int &job_diag_num, - int &final_s_block_size, - int &final_t_block_size, int &final_s_repeat, - int &final_t_repeat, int &final_s_remainder, - int &final_t_remainder) { - // Compute each partition's job diagonal number, - // and choose the partition method with the least job diagonal number: - // 1) all S and T, no partition, launch once in one batch; - // 2) S < max_N_size, compare with (S, t) and (S/2, t); - // 3) T < max_N_size, compare with (s, T) and (s, T/2); - // 4) both S and T > max_N_size, compare with (N, N), (S, t), (s, T), if - // exist; - if (S <= N_size && T <= N_size) { 
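-    // Illustrative case (assumed numbers, not from the original source):
-    // with max_N_size = 1000 and inputs of size S = 800, T = 900, both
-    // dimensions fit on chip at once, so this branch issues a single job
-    // per batch and job_diag_num = 1.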
- // once can compute all SxT onchip - job_diag_num = 1; - final_s_block_size = S; - final_t_block_size = T; - final_s_repeat = 1; - final_t_repeat = 1; - final_s_remainder = 0; - final_t_remainder = 0; - return; - } else { - // Sum of each partition's number of computing diagonals - // at most 3 arrays of candidate partition mode - int mode; - int64_t computing_diag_num[3] = {0}; - int s_block_size[3] = {0}; - int t_block_size[3] = {0}; - int s_repeat[3] = {0}; - int t_repeat[3] = {0}; - int s_remainder[3] = {0}; - int t_remainder[3] = {0}; - - if (S <= N_size && T > N_size) { - // compare with (S, t) and (S/2, t) - // 1) deal_s = S; min(s, t) = s; - mode = 0; - s_block_size[0] = S; - t_block_size[0] = (nram_size / sizeof(float) - 7 * s_block_size[0]) / - (3 * s_block_size[0] + 1); - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - // 2) deal_s = S/2; min(s, t) = s; - mode = 1; - s_block_size[1] = std::max(S / 2, 1); // at least 1 number in s_block - t_block_size[1] = (nram_size / sizeof(float) - 7 * s_block_size[1]) / - (3 * s_block_size[1] + 1); - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - - if (computing_diag_num[0] <= computing_diag_num[1]) { - assignPartitionParams( - s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, - t_remainder, final_s_block_size, final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, final_t_remainder, 0); - } else { - assignPartitionParams( - s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, - t_remainder, final_s_block_size, final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, final_t_remainder, 1); - } - } else if (S > N_size && T <= N_size) { - // compare with (s, T) and (s, T/2) - // 1) deal_t = T; min(s, t) = t; - mode = 0; - t_block_size[0] = T; - s_block_size[0] = (nram_size / sizeof(float) - 7 * t_block_size[0]) / - (3 * t_block_size[0] + 1); - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - // 2) deal_t = T/2; min(s, t) = t; - mode = 1; - t_block_size[1] = std::max(T / 2, 1); // at least 1 number in t_block - s_block_size[1] = (nram_size / sizeof(float) - 7 * t_block_size[1]) / - (3 * t_block_size[1] + 1); - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - - if (computing_diag_num[0] <= computing_diag_num[1]) { - assignPartitionParams( - s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, - t_remainder, final_s_block_size, final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, final_t_remainder, 0); - } else { - assignPartitionParams( - s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, - t_remainder, final_s_block_size, final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, final_t_remainder, 1); - } - } else { // S > N_size, T > N_size, choose between (N,N), (S,t), (s,T) - // 1) deal_s = deal_t = N_size; min(s,t) = s = t; - mode = 0; - s_block_size[0] = N_size; - t_block_size[0] = N_size; - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - // 2) deal_s = S, deal_t = t; min(s,t) = t; - mode = 1; - s_block_size[1] = N_size; - t_block_size[1] = (nram_size / sizeof(float) - 1 * s_block_size[1]) / - (3 * s_block_size[1] + 7); - if 
(t_block_size[1] > 0) { - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - } else { - computing_diag_num[1] = -1; // not support on this partition - } - // 3) deal_t = T, deal_s = s; min(s,t) = s; - mode = 2; - t_block_size[2] = T; - s_block_size[2] = (nram_size / sizeof(float) - 1 * t_block_size[2]) / - (3 * t_block_size[2] + 7); - if (s_block_size[2] > 0) { - calComputingDiags(S, T, computing_diag_num, s_block_size, t_block_size, - s_repeat, t_repeat, s_remainder, t_remainder, mode); - } else { - computing_diag_num[2] = -1; // not support on this partition - } - - if (computing_diag_num[0] > 0 && // mode 0 is valid - ((computing_diag_num[1] <= 0) || // mode 1 is invalid or - computing_diag_num[0] <= - computing_diag_num[1])) { // mode 0 is better than mode 1 - if (computing_diag_num[2] > 0 && // mode 2 is valid and - computing_diag_num[2] < - computing_diag_num[0]) { // mode 2 is better than mode 0 - // choose mode 2 - assignPartitionParams(s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder, final_s_block_size, - final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, - final_t_remainder, 2); - } else { - // choose mode 0 - assignPartitionParams(s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder, final_s_block_size, - final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, - final_t_remainder, 0); - } - } else { // mode 1 is valid and mode 1 is better than mode 0 - if (computing_diag_num[2] > 0 && // mode 2 is valid - computing_diag_num[2] < - computing_diag_num[1]) { // mode 2 is better than mode 1 - // choose mode 2 - assignPartitionParams(s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder, final_s_block_size, - final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, - final_t_remainder, 2); - } else { - // choose mode 1 - assignPartitionParams(s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder, final_s_block_size, - final_t_block_size, final_s_repeat, - final_t_repeat, final_s_remainder, - final_t_remainder, 1); - } - } - } - // total job diagonal number in parallel - job_diag_num = final_s_repeat + (int)(final_s_remainder > 0) + - final_t_repeat + (int)(final_t_remainder > 0) - 1; - } -} - -static mluOpStatus_t launchMutualInformationForwardDefaultKernel( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const bool has_boundary, const void *opt_boundary, void *p, void *ans) { - // When S and T is too large, launch default kernel with partition of S and T - // 1. Compute current arch max N size, according to NRAM size and device RAM - // 2. Use max_N_size to calculate different partition mode computing diagonal - // numbers and choose the partition mode, which has the least computing - // diagonal number - // 3. Launch default kernels by diagonal in parallel, with check of MaxDimX - - const int B = px_desc->dims[0]; - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - - cnrtDim3_t k_dim; - cnrtFunctionType_t k_type; - // 1. 
According to on-chip RAM size, calculate current arch partition block - // size by square, Use max_N_size to partition on S and T dimension RAM space: - // (S+1)*(T+1) + S*T + S*T + 3*min(S,T) + 3*min(S,T)+1 - int max_N_size = (int)(std::sqrt(handle->nram_size / sizeof(float) / 3)) - 2; - // Use max square size N, partition on T and S dimension, launch by diagonal: - // -|------T--------| - // :| N1| N2| N3| N4| - // :|---|---|---|---| - // S| N2| N3| N4| N5| - // :|---|---|---|---| - // :| N3| N4| N5| N6| - // -|---------------| - - VLOG(5) << "Current arch Max square N size is " << max_N_size; - - int job_diag_num; // number of default kernel launch steps by diagonal - int s_block_size, t_block_size, s_repeat, t_repeat, s_remainder, t_remainder; - - // 2. Choose the partition mode, which has the least computing diagonal number - // NOTE: p has dimension (S+1, T+1), in function directly use (S, T) instead - calDefaultPartition(S + 1, T + 1, max_N_size, handle->nram_size, job_diag_num, - s_block_size, t_block_size, s_repeat, t_repeat, - s_remainder, t_remainder); - int s_block_num = s_repeat + (int)(s_remainder > 0); - int t_block_num = t_repeat + (int)(t_remainder > 0); - int max_s_t_block_num = std::max(s_block_num, t_block_num); - int min_s_t_block_num = std::min(s_block_num, t_block_num); - - k_type = CNRT_FUNC_TYPE_BLOCK; - k_dim.y = 1; - k_dim.z = 1; - // Get current arch support max dim_x value - int task_dim_x_limit; - cnDeviceGetAttribute(&task_dim_x_limit, - CN_DEVICE_ATTRIBUTE_MAX_BLOCK_TASK_DIM_X, - handle->device); - VLOG(5) << "Current arch MAX_BLOCK_TASK_DIM_X is " << task_dim_x_limit; - - // 3. Traverse step_i from 0 to (job_diag_num - 1) - for (int step_i = 0; step_i < job_diag_num; step_i++) { - int job_num_on_step = B * (step_i < max_s_t_block_num - ? std::min(step_i + 1, min_s_t_block_num) - : s_block_num + t_block_num - step_i - 1); - k_dim.x = job_num_on_step; - // Make sure not exceed max dim x limit - if (k_dim.x > task_dim_x_limit) { - int task_dim_change = (k_dim.x + task_dim_x_limit - 1) / task_dim_x_limit; - k_dim.x = (k_dim.x + task_dim_x_limit - 1) / task_dim_change; - k_dim.y = k_dim.y * task_dim_change; - } - - VLOG(5) << "Launch Kernel DefaultMutualInformationForward<<< step " - << step_i << " of Batch Block: " << k_dim.x << ", " << k_dim.y - << ", " << k_dim.z << ">>>"; - CHECK_RETURN("[MutualInformationForward]", - kernelDefaultMutualInformationForward( - k_dim, k_type, handle->queue, B, S, T, step_i, - job_num_on_step, s_block_num, t_block_num, s_block_size, - t_block_size, px, py, has_boundary, opt_boundary, p, ans)); - } - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpMutualInformationForward( - mluOpHandle_t handle, const mluOpTensorDescriptor_t px_desc, const void *px, - const mluOpTensorDescriptor_t py_desc, const void *py, - const mluOpTensorDescriptor_t opt_boundary_desc, const void *opt_boundary, - const mluOpTensorDescriptor_t p_desc, void *p, void *workspace, - const size_t workspace_size, const mluOpTensorDescriptor_t ans_desc, - void *ans) { - // 1. Paramcheck - bool has_boundary = false; - bool zero_element = false; - mluOpStatus_t check_status = mutualInformationForwardParamCheck( - handle, px_desc, px, py_desc, py, opt_boundary_desc, opt_boundary, p_desc, - p, workspace, workspace_size, ans_desc, ans, has_boundary, zero_element); - - if (MLUOP_STATUS_SUCCESS != check_status || zero_element) { - return check_status; - } - - // 2. 
Generate case - if (MLUOP_GEN_CASE_ON_NEW) { - mutualInformationForwardGencase(handle, px_desc, px, py_desc, py, - opt_boundary_desc, opt_boundary, p_desc, p, - ans_desc, ans); - } - - // Choose to launch 3pipeline kernel or default kernel - const int S = px_desc->dims[1]; - const int T = py_desc->dims[2]; - bool is_launch_3pipeline = true; - - // Check 3pipeline kernel scale limit for computing p - // 9: max_val, mask, temp, ping(py, px, p) and pong(py, px, p) - // 11: max_val, mask, temp, ping(py, px, p), pong(py, px, p) and 2*(-inf) - int current_size = - T * (S + 1) + (T + 1) * S + (T + 1) * (S + 1) + 9 * std::min(S, T) + 11; - if (current_size > handle->nram_size / sizeof(float)) { - is_launch_3pipeline = false; - } - - // 3. Launch kernel - mluOpStatus_t return_status; - if (is_launch_3pipeline) { - // launch 3pipeline kernel when satisfy scale limit - return_status = launchMutualInformationForward3PipelineKernel( - handle, px_desc, px, py_desc, py, has_boundary, opt_boundary, p, ans); - } else { - // launch default kernel - return_status = launchMutualInformationForwardDefaultKernel( - handle, px_desc, px, py_desc, py, has_boundary, opt_boundary, p, ans); - } - - GEN_CASE_END(); - return return_status; -} diff --git a/kernels/mutual_information_forward/mutual_information_forward.h b/kernels/mutual_information_forward/mutual_information_forward.h deleted file mode 100644 index 42df0dc9b..000000000 --- a/kernels/mutual_information_forward/mutual_information_forward.h +++ /dev/null @@ -1,41 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#ifndef KERNELS_MUTUAL_INFORMATION_FORWARD_MUTUAL_INFORMATION_FORWARD_H_ -#define KERNELS_MUTUAL_INFORMATION_FORWARD_MUTUAL_INFORMATION_FORWARD_H_ - -#include "mlu_op.h" -#include "kernels/kernel.h" - -mluOpStatus_t MLUOP_WIN_API kernel3PipelineMutualInformationForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const int B, - const int S, const int T, const void *px, const void *py, - const bool has_boundary, const void *opt_boundary, void *p, void *ans); - -mluOpStatus_t MLUOP_WIN_API kernelDefaultMutualInformationForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const int B, - const int S, const int T, const int step_i, const int job_num_on_step, - const int s_block_num, const int t_block_num, const int s_block_size, - const int t_block_size, const void *px, const void *py, - const bool has_boundary, const void *opt_boundary, void *p, void *ans); - -#endif // KERNELS_MUTUAL_INFORMATION_FORWARD_MUTUAL_INFORMATION_FORWARD_H_ diff --git a/kernels/mutual_information_forward/mutual_information_forward_3pipeline_block.mlu b/kernels/mutual_information_forward/mutual_information_forward_3pipeline_block.mlu deleted file mode 100644 index f3075cb32..000000000 --- a/kernels/mutual_information_forward/mutual_information_forward_3pipeline_block.mlu +++ /dev/null @@ -1,227 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "mutual_information_forward.h" - -#include "core/logging.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" -#include "kernels/mutual_information_forward/mutual_information_forward_utils.h" - -__mlu_func__ void pipelineLoad(float *nram_px, float *nram_py, const int T, - const int s_len, const int t_len, - const int max_len, const int min_len, - const int s_begin, const int t_begin, - const int t_end, const int diagonal_num, - const int i, float *ping) { - int data_num = i < max_len ? __mluop_min(i + 1, min_len) : diagonal_num - i; - int s = i - 1 < t_len ? s_begin : i - t_len + s_begin; - int t = i - 1 < t_len ? i + t_begin - 1 : t_end; - int px_num = i < t_len ? 
data_num - 1 : data_num; - if (px_num > 0) { - __memcpy_async(ping + min_len + 1, nram_px + s * (T + 1) + t, sizeof(float), - NRAM2NRAM, sizeof(float), T * sizeof(float), px_num - 1); - } - - int py_num = i < s_len ? data_num - 1 : data_num; - if (i >= t_len) { - s += 1; - t -= 1; - } - - if (py_num > 0) { - __memcpy_async(ping, nram_py + s * T + t, sizeof(float), NRAM2NRAM, - sizeof(float), (T - 1) * sizeof(float), py_num - 1); - } -} - -__mlu_func__ void pipelineCompute(const int s_len, const int t_len, - const int max_len, const int min_len, - const int diagonal_num, const int i, - float *max_value, float *mask, float *temp, - float *pong_p, float *ping) { - float *ping_py = ping; - float *ping_px = ping_py + min_len + 1; - float *ping_p = ping_px + min_len; - - int data_num = i < max_len ? __mluop_min(i + 1, min_len) : diagonal_num - i; - - int px_num = i < t_len ? data_num - 1 : data_num; - if (px_num > 0) { - __bang_add(ping_px, ping_px, pong_p, px_num); - } - - int py_num = i < s_len ? data_num - 1 : data_num; - if (i >= t_len) { - pong_p += 1; - } - if (py_num > 0) { - __bang_add(ping_py, ping_py, pong_p, py_num); - } - - ping_px = i < t_len ? ping_px - 1 : ping_px; - logAddVector(ping_p, ping_px, ping_py, max_value, mask, temp, data_num); - - __bang_write_value(ping, 2 * min_len + 1, -INFINITY); -} - -__mlu_func__ void pipelineStore(float *nram_p, const int T, const int t_len, - const int max_len, const int min_len, - const int s_begin, const int t_begin, - const int t_end, const int diagonal_num, - const int i, float *ping_p) { - int data_num = i < max_len ? __mluop_min(i + 1, min_len) : diagonal_num - i; - int s = i < t_len ? s_begin : i - t_len + 1 + s_begin; - int t = i < t_len ? i + t_begin : t_end; - __memcpy_async(nram_p + s * (T + 1) + t, ping_p, sizeof(float), NRAM2NRAM, - T * sizeof(float), sizeof(float), data_num - 1); -} - -__mlu_func__ void compute3PipelineMutualInformation( - const int b, const int S, const int T, const bool has_boundary, - const int s_begin, const int s_end, const int t_begin, const int t_end, - const float *px, const float *py, float *p, float *ans) { - /* *********************nram space split********************** */ - /* |--------------------------COMMON-------------------------| */ - /* | px | py | p | max_val | mask | temp | */ - /* |S*(T+1)|(S+1)*T|(S+1)*(T+1)| min_len | min_len | min_len | */ - /* |------------PING------------|------------PONG------------| */ - /* | cur_py|-inf|cur_px | cur_p | cur_py|-inf|cur_px | cur_p | */ - /* |min_len| 1 |min_len|min_len|min_len| 1 |min_len|min_len| */ - const int px_one_batch_size = S * (T + 1); - const int py_one_batch_size = (S + 1) * T; - const int p_one_batch_size = (S + 1) * (T + 1); - - float *nram_px = (float *)nram_buffer; - float *nram_py = nram_px + px_one_batch_size; - float *nram_p = nram_py + py_one_batch_size; - - if (S > 0) { - __memcpy(nram_px, px + b * px_one_batch_size, - px_one_batch_size * sizeof(float), GDRAM2NRAM); - } - - if (T > 0) { - __memcpy(nram_py, py + b * py_one_batch_size, - py_one_batch_size * sizeof(float), GDRAM2NRAM); - } - - if (has_boundary) { - __memcpy(nram_p, p + b * p_one_batch_size, p_one_batch_size * sizeof(float), - GDRAM2NRAM); - } - - const int s_len = s_end - s_begin + 1; - const int t_len = t_end - t_begin + 1; - const int max_len = __mluop_max(s_len, t_len); - const int min_len = __mluop_min(s_len, t_len); - const int ping_pong_gap = 3 * min_len + 1; - - float *nram_max_value = nram_p + p_one_batch_size; - float *nram_mask = nram_max_value + min_len; - 
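// Ping-pong sketch (added note, derived from the layout comments above):
-  // each half holds [cur_py | -inf | cur_px | cur_p], i.e.
-  // min_len + 1 + min_len + min_len floats, hence
-  // ping_pong_gap = 3 * min_len + 1; both halves are later filled with
-  // -INFINITY so the log-add treats untouched slots as log(0).
- 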
float *nram_temp = nram_mask + min_len; - - float *ping = nram_temp + min_len; - float *ping_p = ping + 2 * min_len + 1; - - __bang_write_value(ping, ping_pong_gap * 2, -INFINITY); - - nram_p[s_begin * (T + 1) + t_begin] = (float)0; - ping_p[ping_pong_gap] = (float)0; - - __sync(); - - int repeat = s_len + t_len - 2; - for (int i = 0; i < repeat + 2; ++i) { - if (i < repeat) { - pipelineLoad(nram_px, nram_py, T, s_len, t_len, max_len, min_len, s_begin, - t_begin, t_end, repeat + 1, i + 1, - ping + (i % 2) * ping_pong_gap); - } - - if (i > 0 && i <= repeat) { - pipelineCompute(s_len, t_len, max_len, min_len, repeat + 1, i, - nram_max_value, nram_mask, nram_temp, - ping_p + (i % 2) * ping_pong_gap, - ping + ((i - 1) % 2) * ping_pong_gap); - } - - if (i > 1) { - pipelineStore(nram_p, T, t_len, max_len, min_len, s_begin, t_begin, t_end, - repeat + 1, i - 1, ping_p + (i % 2) * ping_pong_gap); - } - __sync(); - } - - __memcpy(ans + b, nram_p + s_end * (T + 1) + t_end, sizeof(float), - NRAM2GDRAM); - __memcpy(p + b * p_one_batch_size, nram_p, p_one_batch_size * sizeof(float), - NRAM2GDRAM); -} - -__mlu_global__ void mluBlock3PipelineMutualInformationForward( - const int B, const int S, const int T, const float *px, const float *py, - const bool has_boundary, const int64_t *opt_boundary, float *p, - float *ans) { - const int num_per_core = B / taskDim; - const int num_rem = B % taskDim; - const int num_cur_core = num_per_core + (taskId < num_rem); - const int b_offset = taskId * num_cur_core + (taskId >= num_rem) * num_rem; - - int s_begin = 0; - int t_begin = 0; - int s_end = S; - int t_end = T; - if (has_boundary) { - int64_t *boundary = (int64_t *)nram_buffer; - for (int b = b_offset; b < b_offset + num_cur_core; ++b) { - __memcpy(boundary, opt_boundary + 4 * b, 4 * sizeof(int64_t), GDRAM2NRAM); - s_begin = boundary[0]; - t_begin = boundary[1]; - s_end = boundary[2]; - t_end = boundary[3]; - - if (s_begin > s_end || t_begin > t_end) { - ans[b] = 0.0; - continue; - } - compute3PipelineMutualInformation(b, S, T, has_boundary, s_begin, s_end, - t_begin, t_end, px, py, p, ans); - } - } else { - for (int b = b_offset; b < b_offset + num_cur_core; ++b) { - compute3PipelineMutualInformation(b, S, T, has_boundary, s_begin, s_end, - t_begin, t_end, px, py, p, ans); - } - } -} - -mluOpStatus_t MLUOP_WIN_API kernel3PipelineMutualInformationForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const int B, - const int S, const int T, const void *px, const void *py, - const bool has_boundary, const void *opt_boundary, void *p, void *ans) { - KERNEL_CHECK( - mluBlock3PipelineMutualInformationForward<<>>( - B, S, T, (float *)px, (float *)py, has_boundary, - (int64_t *)opt_boundary, (float *)p, (float *)ans)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/mutual_information_forward/mutual_information_forward_default_block.mlu b/kernels/mutual_information_forward/mutual_information_forward_default_block.mlu deleted file mode 100644 index f66fc93c8..000000000 --- a/kernels/mutual_information_forward/mutual_information_forward_default_block.mlu +++ /dev/null @@ -1,307 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "mutual_information_forward.h" - -#include "core/logging.h" -#include "kernels/kernel.h" -#include "kernels/utils/common.h" -#include "kernels/mutual_information_forward/mutual_information_forward_utils.h" - -__mlu_func__ bool calPartitionJobScope( - bool has_boundary, const int64_t *opt_boundary, const int B, const int S, - const int T, const int step_i, const int job_num_on_step, - const int s_block_num, const int t_block_num, const int s_block_size, - const int t_block_size, int &batch_idx, int &batch_s_begin, - int &batch_t_begin, int &batch_s_end, int &batch_t_end, int &cur_s_begin, - int &cur_t_begin, int &cur_s_end, int &cur_t_end, int &cur_s_size, - int &cur_t_size, bool &need_compute_ans, float *ans) { - int job_num_on_batch = job_num_on_step / B; // Each batch job num - batch_idx = taskId / job_num_on_batch; // Current job on which batch - int block_id_in_batch = - taskId - batch_idx * job_num_on_batch; // Current job id in batch - - // taskDim is not always job num, because of task dim x limit - if (batch_idx >= B) { - return true; - } - - // Compute s and t block id in batch - int s_block_id, t_block_id; - s_block_id = __mluop_max(0, step_i - (t_block_num - 1)) + block_id_in_batch; - t_block_id = __mluop_min(step_i, t_block_num - 1) - block_id_in_batch; - - // Compute current job id scope - cur_s_begin = s_block_id * s_block_size; - cur_t_begin = t_block_id * t_block_size; - cur_s_end = (s_block_id + 1) * s_block_size - 1; - cur_t_end = (t_block_id + 1) * t_block_size - 1; - - // Deal with boundary and decide current job if need to compute - if (has_boundary) { - int64_t *boundary = (int64_t *)nram_buffer; - __memcpy(boundary, opt_boundary + 4 * batch_idx, 4 * sizeof(int64_t), - GDRAM2NRAM); - batch_s_begin = boundary[0]; - batch_t_begin = boundary[1]; - batch_s_end = boundary[2]; - batch_t_end = boundary[3]; - // invalid boundary, first launch step set ans to 0 - if (step_i == 0 && - (batch_s_begin > batch_s_end || batch_t_begin > batch_t_end)) { - ans[batch_idx] = 0; - return true; - } - } - - // Compare current job scope with batch scope, if empty job, return - if (cur_s_begin > batch_s_end || cur_t_begin > batch_t_end || - cur_s_end < batch_s_begin || cur_t_end < batch_t_begin) { - return true; - } - - // Reset s and t begin and end to valid boundary - if (cur_s_begin < batch_s_begin) { - cur_s_begin = 
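
calPartitionJobScope maps a flat job id to one tile on the current step's anti-diagonal of the block grid, using the two index formulas above. A small sketch that enumerates the active (s_block_id, t_block_id) pairs per step; the per-step job count here is derived for the demo, whereas the kernel receives it precomputed as job_num_on_step:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
  const int s_block_num = 4, t_block_num = 3;  // illustrative tile grid
  for (int step_i = 0; step_i < s_block_num + t_block_num - 1; ++step_i) {
    // Tiles with s_block + t_block == step_i that fall inside the grid.
    const int jobs = std::min(
        std::min(step_i + 1, s_block_num + t_block_num - 1 - step_i),
        std::min(s_block_num, t_block_num));
    std::printf("step %d:", step_i);
    for (int j = 0; j < jobs; ++j) {
      // Same formulas as the deleted kernel, with j as block_id_in_batch.
      const int s_block_id = std::max(0, step_i - (t_block_num - 1)) + j;
      const int t_block_id = std::min(step_i, t_block_num - 1) - j;
      std::printf(" (%d,%d)", s_block_id, t_block_id);
    }
    std::printf("\n");
  }
}
```
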
batch_s_begin; - } - if (cur_t_begin < batch_t_begin) { - cur_t_begin = batch_t_begin; - } - if (cur_s_end > batch_s_end) { - cur_s_end = batch_s_end; - } - if (cur_t_end > batch_t_end) { - cur_t_end = batch_t_end; - } - - cur_s_size = cur_s_end - cur_s_begin + 1; - cur_t_size = cur_t_end - cur_t_begin + 1; - - // At last compute step, need compute ans - if (cur_s_end == batch_s_end && cur_t_end == batch_t_end) { - need_compute_ans = true; - } else { - need_compute_ans = false; - } - - return false; -} - -__mlu_func__ void loadInitP(const float *gdram_px, const float *gdram_py, - const float *gdram_p, float *nram_px, - float *nram_py, float *nram_p, const int S, - const int T, const int batch_s_begin, - const int batch_t_begin, const int cur_s_begin, - const int cur_t_begin, const int cur_s_size, - const int cur_t_size) { - // Compare current s_begin and batch_s_begin to decide load px or write -inf - if (cur_s_begin > batch_s_begin) { - // Load px(s-1, t) - __memcpy_async( - nram_px, gdram_px + (cur_s_begin - 1) * (T + 1) + cur_t_begin, - cur_t_size * sizeof(float), GDRAM2NRAM, cur_t_size * sizeof(float), - (T + 1) * sizeof(float), cur_s_size - 1); - // Load p(s-1, t), one row - __memcpy_async(nram_p + 1, - gdram_p + (cur_s_begin - 1) * (T + 1) + cur_t_begin, - cur_t_size * sizeof(float), GDRAM2NRAM, 0, 0, 0); - } else { // cur_s_begin == batch_s_begin, skip first row, and write -inf - if (cur_s_size > 1) { - __memcpy_async( - nram_px + cur_t_size, gdram_px + cur_s_begin * (T + 1) + cur_t_begin, - cur_t_size * sizeof(float), GDRAM2NRAM, cur_t_size * sizeof(float), - (T + 1) * sizeof(float), cur_s_size - 2); - } - __nramset_async(nram_px, cur_t_size, (float)(-INFINITY), 0, 0); - // p(s-1, t) first row write -inf - __nramset_async(nram_p + 1, cur_t_size, (float)(-INFINITY), 0, 0); - } - - // Compare current t_begin and batch_t_begin to decide load py or write -inf - if (cur_t_begin > batch_t_begin) { - // Load py(s, t-1) - __memcpy_async(nram_py, gdram_py + cur_s_begin * T + cur_t_begin - 1, - cur_t_size * sizeof(float), GDRAM2NRAM, - cur_t_size * sizeof(float), T * sizeof(float), - cur_s_size - 1); - // Load p(s, t-1) - __memcpy_async(nram_p + cur_t_size + 1, - gdram_p + cur_s_begin * (T + 1) + cur_t_begin - 1, - sizeof(float), GDRAM2NRAM, (cur_t_size + 1) * sizeof(float), - (T + 1) * sizeof(float), cur_s_size - 1); - } else { // cur_t_begin == batch_t_begin, skip first column, and write -inf - if (cur_t_size > 1) { - __memcpy_async(nram_py + 1, gdram_py + cur_s_begin * T + cur_t_begin, - (cur_t_size - 1) * sizeof(float), GDRAM2NRAM, - cur_t_size * sizeof(float), T * sizeof(float), - cur_s_size - 1); - } - __nramset_async(nram_py, 1, (float)(-INFINITY), cur_t_size * sizeof(float), - cur_s_size - 1); - // p(s, t-1) first column write -inf - __nramset_async(nram_p + cur_t_size + 1, 1, (float)(-INFINITY), - (cur_t_size + 1) * sizeof(float), cur_s_size - 1); - } - - // sync for memcpy async - __sync(); -} - -__mlu_func__ void computePByDiagonal( - float *nram_px, float *nram_py, float *nram_p, float *nram_cur_px, - float *nram_cur_py, float *nram_cur_p, float *max_val, float *mask, - float *temp, const int batch_s_begin, const int batch_t_begin, - const int cur_s_begin, const int cur_t_begin, const int cur_s_size, - const int cur_t_size) { - // Compute P by diagonal - const int repeat = cur_s_size + cur_t_size - 1; - const int max_s_t = __mluop_max(cur_s_size, cur_t_size); - const int min_s_t = __mluop_min(cur_s_size, cur_t_size); - - for (int i = 0; i < repeat; ++i) { - // Initialize 
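
loadInitP seeds the phantom first row and column with -inf rather than branching per cell: -inf is the identity element of log-add, so an out-of-range predecessor simply drops out of the recurrence. A two-line demonstration of that identity (the `log_add` lambda restates the math from the sketch earlier):

```cpp
#include <cmath>
#include <cstdio>
#include <limits>

int main() {
  const float ninf = -std::numeric_limits<float>::infinity();
  auto log_add = [](float a, float b) {
    if (std::isinf(a) && a < 0.0f) return b;
    if (std::isinf(b) && b < 0.0f) return a;
    const float mx = std::fmax(a, b);
    return mx + std::log1p(std::exp(std::fmin(a, b) - mx));
  };
  // The -inf operand vanishes: only the in-range predecessor contributes.
  std::printf("log_add(-inf, 1.5) = %f\n", log_add(ninf, 1.5f));  // 1.500000
  std::printf("log_add(0, 0)      = %f\n", log_add(0.0f, 0.0f));  // log(2)
}
```
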
p(batch_s_begin, batch_t_begin) to 0 - if (cur_s_begin == batch_s_begin && cur_t_begin == batch_t_begin && - i == 0) { - nram_p[cur_t_size + 2] = 0.0; - continue; - } - - int data_num = i < max_s_t ? __mluop_min(i + 1, min_s_t) - : cur_s_size + cur_t_size - i - 1; - - // px, py use same s, t index on nram, - // different -1 offset of row and column is considered when load - int first_s = __mluop_max(0, i - (cur_t_size - 1)); - int first_t = __mluop_min(i, cur_t_size - 1); - - // Move p(s-1, t), p(s, t-1), px(s-1, t), py(s, t-1) - __memcpy(nram_cur_p, nram_p + first_s * (cur_t_size + 1) + first_t + 1, - sizeof(float), NRAM2NRAM, sizeof(float), - cur_t_size * sizeof(float), data_num); - __memcpy(nram_cur_px, nram_px + first_s * cur_t_size + first_t, - sizeof(float), NRAM2NRAM, sizeof(float), - (cur_t_size - 1) * sizeof(float), data_num - 1); - __memcpy(nram_cur_py, nram_py + first_s * cur_t_size + first_t, - sizeof(float), NRAM2NRAM, sizeof(float), - (cur_t_size - 1) * sizeof(float), data_num - 1); - - // Compute current p - __bang_add(nram_cur_px, nram_cur_px, nram_cur_p, data_num); - __bang_add(nram_cur_py, nram_cur_py, nram_cur_p + 1, data_num); - logAddVector(nram_cur_p, nram_cur_px, nram_cur_py, max_val, mask, temp, - data_num); - - // Move p back - __memcpy(nram_p + (first_s + 1) * (cur_t_size + 1) + first_t + 1, - nram_cur_p, sizeof(float), NRAM2NRAM, cur_t_size * sizeof(float), - sizeof(float), data_num - 1); - } -} - -__mlu_global__ void mluBlockDefaultMutualInformationForward( - const int B, const int S, const int T, const int step_i, - const int job_num_on_step, const int s_block_num, const int t_block_num, - const int s_block_size, const int t_block_size, const float *px, - const float *py, const bool has_boundary, const int64_t *opt_boundary, - float *p, float *ans) { - /************************* NRAM SPACE *******************************/ - /*|----------------------------------------------------------------|*/ - /*| px, py | p |max_val,mask,temp|cur_px |cur_py | cur_p |*/ - /*| 2*S*T |(S+1)*(T+1)| 3 * min_len |min_len|min_len|min_len+1|*/ - /*|----------------------------------------------------------------|*/ - - // NOTE: s and t block size has already + 1 on S and T - int min_s_t_block_size = __mluop_min(s_block_size, t_block_size); - float *nram_px = (float *)nram_buffer; - float *nram_py = nram_px + s_block_size * t_block_size; - float *nram_p = nram_py + s_block_size * t_block_size; - float *nram_max_val = nram_p + (s_block_size + 1) * (t_block_size + 1); - float *nram_mask = nram_max_val + min_s_t_block_size; - float *nram_temp = nram_mask + min_s_t_block_size; - float *nram_cur_px = nram_temp + min_s_t_block_size; - float *nram_cur_py = nram_cur_px + min_s_t_block_size; - float *nram_cur_p = nram_cur_py + min_s_t_block_size; - - int batch_idx; - int batch_s_begin = 0; - int batch_t_begin = 0; - int batch_s_end = S; - int batch_t_end = T; - int cur_s_begin, cur_t_begin, cur_s_end, cur_t_end, cur_s_size, cur_t_size; - bool need_compute_ans; - - // According to has_boundary, calculate current job scope - bool need_return = calPartitionJobScope( - has_boundary, opt_boundary, B, S, T, step_i, job_num_on_step, s_block_num, - t_block_num, s_block_size, t_block_size, batch_idx, batch_s_begin, - batch_t_begin, batch_s_end, batch_t_end, cur_s_begin, cur_t_begin, - cur_s_end, cur_t_end, cur_s_size, cur_t_size, need_compute_ans, ans); - // Because taskDimX could change to taskDimY, so not all jobs need to compute - if (need_return) { - return; - } - - const int px_one_batch_num = S 
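
computePByDiagonal gathers each diagonal into a contiguous buffer with a strided copy: on a row-major tile of width W, a source stride of W - 1 elements steps one row down and one column left, i.e. along an anti-diagonal. A host-side model of that addressing, with an illustrative tile shape and start cell:

```cpp
#include <cstdio>
#include <vector>

int main() {
  const int W = 5, H = 4;  // row-major tile, illustrative sizes
  std::vector<int> m(W * H);
  for (int i = 0; i < W * H; ++i) m[i] = i;

  const int first_row = 0, first_col = 3, n = 4;  // diagonal start and length
  std::vector<int> diag(n);
  // Models __memcpy with element stride W - 1: linear index grows by W - 1
  // per step, which is (row + 1, col - 1) in 2-D coordinates.
  for (int k = 0; k < n; ++k)
    diag[k] = m[(first_row + k) * W + (first_col - k)];

  for (int k = 0; k < n; ++k) std::printf("%d ", diag[k]);  // 3 7 11 15
  std::printf("\n");
}
```
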
* (T + 1); - const int py_one_batch_num = (S + 1) * T; - const int p_one_batch_num = (S + 1) * (T + 1); - - const float *gdram_px = px + batch_idx * px_one_batch_num; - const float *gdram_py = py + batch_idx * py_one_batch_num; - float *gdram_p = p + batch_idx * p_one_batch_num; - - // LoadInitP, load px, py, other block p, or write -inf at first row/column - loadInitP(gdram_px, gdram_py, gdram_p, nram_px, nram_py, nram_p, S, T, - batch_s_begin, batch_t_begin, cur_s_begin, cur_t_begin, cur_s_size, - cur_t_size); - - // ComputeP by diagonal - // p[b,s,t] = log_add(p[b,s-1,t] + px[b,s-1,t], p[b,s,t-1] + py[b,s,t-1]) - computePByDiagonal(nram_px, nram_py, nram_p, nram_cur_px, nram_cur_py, - nram_cur_p, nram_max_val, nram_mask, nram_temp, - batch_s_begin, batch_t_begin, cur_s_begin, cur_t_begin, - cur_s_size, cur_t_size); - - // StoreP - __memcpy(gdram_p + cur_s_begin * (T + 1) + cur_t_begin, - nram_p + cur_t_size + 2, cur_t_size * sizeof(float), NRAM2GDRAM, - (T + 1) * sizeof(float), (cur_t_size + 1) * sizeof(float), - cur_s_size - 1); - - // If last compute step, need store p[s_end, t_end] to ans - if (need_compute_ans) { - ans[batch_idx] = nram_p[cur_s_size * (cur_t_size + 1) + cur_t_size]; - } -} - -mluOpStatus_t MLUOP_WIN_API kernelDefaultMutualInformationForward( - cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, const int B, - const int S, const int T, const int step_i, const int job_num_on_step, - const int s_block_num, const int t_block_num, const int s_block_size, - const int t_block_size, const void *px, const void *py, - const bool has_boundary, const void *opt_boundary, void *p, void *ans) { - KERNEL_CHECK( - mluBlockDefaultMutualInformationForward<<>>( - B, S, T, step_i, job_num_on_step, s_block_num, t_block_num, - s_block_size, t_block_size, (float *)px, (float *)py, has_boundary, - (int64_t *)opt_boundary, (float *)p, (float *)ans)); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/mutual_information_forward/mutual_information_forward_utils.h b/kernels/mutual_information_forward/mutual_information_forward_utils.h deleted file mode 100644 index fc743a95b..000000000 --- a/kernels/mutual_information_forward/mutual_information_forward_utils.h +++ /dev/null @@ -1,73 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#ifndef KERNELS_MUTUAL_INFORMATION_FORWARD_MUTUAL_INFORMATION_FORWARD_UTILS_H_ -#define KERNELS_MUTUAL_INFORMATION_FORWARD_MUTUAL_INFORMATION_FORWARD_UTILS_H_ - -#include "mlu_op.h" - -#define MIN_LOG_DIFF_FLOAT -15.9423847198486328125f - -__nram__ char nram_buffer[MAX_NRAM_SIZE]; - -__mlu_func__ void logAddVector(float *dst, float *src1, float *src2, - float *max_value, float *mask, float *temp, - int data_num) { - __bang_nan_minimum(dst, src1, src2, data_num); - __bang_maximum(max_value, src1, src2, data_num); - - // If src1 is nan, then max_value = src1 = nan - // use band with exp and mantissa bits, then compare ge with 0x7f800001 - __asm__ volatile( - "fuse.nram.s32 [%[dst]], %[size], [%[src0]]," - ".and(%[src1]), .ge(%[src2]), .mul(%[src3])," - ".and([%[src4]]);\n" ::[dst] "r"((int32_t *)temp), - [ size ] "r"(data_num), [ src0 ] "r"((int32_t *)src1), - [ src1 ] "r"(0x7fffffff), [ src2 ] "r"(0x7f800001), [ src3 ] "r"(-1), - [ src4 ] "r"((int32_t *)src1)); - __bang_add(max_value, max_value, temp, data_num); - - // Compute log sum exp: max_value + log1p(exp(min_value - max_value)) - __bang_sub(dst, dst, max_value, data_num); // min_value - max_value - __bang_ge_scalar(mask, dst, MIN_LOG_DIFF_FLOAT, data_num); - __mluop_exp(dst, dst, nullptr, 0, data_num); - __bang_add_scalar(dst, dst, 1.f, data_num); - __mluop_log(dst, dst, nullptr, 0, data_num); - __bang_add(dst, dst, max_value, data_num); - - // If min_value - max_value < MIN_LOG_DIFF_FLOAT, return the larger one - // mask eq with 0x3f800000(float32(1.0)), -> 0xffffffff - __asm__ volatile( - "fuse.nram.s32 [%[dst]], %[size], [%[src0]]," - ".eq(%[src1]), .mul(%[src2]);\n" ::[dst] "r"((int32_t *)mask), - [ size ] "r"(data_num), [ src0 ] "r"((int32_t *)mask), - [ src1 ] "r"(0x3f800000), [ src2 ] "r"(-1)); - __bang_band((char *)dst, (char *)dst, (char *)mask, data_num * sizeof(float)); - - // Reverse the mask bits, ((int)mask+1)*(-1), 0->-1, -1->0 - __bang_fusion(FUSION_FAM, (int *)mask, (int *)mask, 1, -1, data_num); - __bang_band((char *)max_value, (char *)max_value, (char *)mask, - data_num * sizeof(float)); - __bang_add(dst, dst, max_value, data_num); -} - -#endif // KERNELS_MUTUAL_INFORMATION_FORWARD_MUTUAL_INFORMATION_FORWARD_UTILS_H_ // NOLINT diff --git a/kernels/roi_align_backward/roi_align_backward.cpp b/kernels/roi_align_backward/roi_align_backward.cpp deleted file mode 100644 index eda7a1641..000000000 --- a/kernels/roi_align_backward/roi_align_backward.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
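
The vectorized logAddVector above computes, per lane, max + log1p(exp(min - max)), and returns the larger input outright once min - max falls below MIN_LOG_DIFF_FLOAT, which is what the mask selects. A scalar C++ rendering of the same math, with the inline-assembly NaN plumbing deliberately simplified:

```cpp
#include <algorithm>
#include <cmath>

// Same cutoff the deleted header defines as MIN_LOG_DIFF_FLOAT.
constexpr float kMinLogDiffFloat = -15.9423847198486328125f;

// Scalar model of logAddVector: log(exp(a) + exp(b)) via the larger input.
float log_add_scalar(float a, float b) {
  const float mn = std::min(a, b);
  const float mx = std::max(a, b);
  const float diff = mn - mx;            // <= 0, possibly -inf or NaN
  if (!(diff >= kMinLogDiffFloat))       // catches underflow, -inf, and NaN
    return mx;                           // exp(diff) ~ 0: answer is the max
  return mx + std::log1p(std::exp(diff));
}
```
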
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API mluOpRoiAlignBackward( - mluOpHandle_t handle, const float spatial_scale, const int sampling_ratio, - const bool aligned, const mluOpTensorDescriptor_t grads_desc, - const void *grads, const mluOpTensorDescriptor_t boxes_desc, - const void *boxes, const mluOpTensorDescriptor_t grads_image_desc, - void *grads_image) { - PARAM_CHECK("mluOpRoiAlignBackward", handle != NULL); - PARAM_CHECK("mluOpRoiAlignBackward", grads_desc != NULL); - PARAM_CHECK("mluOpRoiAlignBackward", grads != NULL); - PARAM_CHECK("mluOpRoiAlignBackward", boxes_desc != NULL); - PARAM_CHECK("mluOpRoiAlignBackward", boxes != NULL); - PARAM_CHECK("mluOpRoiAlignBackward", grads_image_desc != NULL); - PARAM_CHECK("mluOpRoiAlignBackward", grads_image != NULL); - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grads_desc, cnnl_grads_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(boxes_desc, cnnl_boxes_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grads_image_desc, - cnnl_grads_image_desc); - CHECK_FUNC_RETURN( - cnnlRoiAlignBackward(cnnl_handle, spatial_scale, sampling_ratio, aligned, - cnnl_grads_desc, grads, cnnl_boxes_desc, boxes, - cnnl_grads_image_desc, grads_image), - CNNL_STATUS_SUCCESS, - "[mluOpRoiAlignBackward] Internal error accured in " - "mluOpRoiAlignBackward.", - MLUOP_STATUS_INTERNAL_ERROR); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grads_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_boxes_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grads_image_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpRoiAlignBackward_v2( - mluOpHandle_t handle, const mluOpTensorDescriptor_t grads_desc, - const void *grads, const mluOpTensorDescriptor_t boxes_desc, - const void *boxes, const mluOpTensorDescriptor_t argmax_x_desc, - const void *argmax_x, const mluOpTensorDescriptor_t argmax_y_desc, - const void *argmax_y, const float spatial_scale, const int sampling_ratio, - const bool aligned, const int pool_mode, - const mluOpTensorDescriptor_t grads_image_desc, void *grads_image) { - PARAM_CHECK("mluOpRoiAlignBackward_v2", handle != NULL); - PARAM_CHECK("mluOpRoiAlignBackward_v2", grads_desc != NULL); - PARAM_CHECK("mluOpRoiAlignBackward_v2", grads != NULL); - PARAM_CHECK("mluOpRoiAlignBackward_v2", boxes_desc != NULL); - PARAM_CHECK("mluOpRoiAlignBackward_v2", boxes != NULL); - PARAM_CHECK("mluOpRoiAlignBackward_v2", grads_image_desc != NULL); - PARAM_CHECK("mluOpRoiAlignBackward_v2", grads_image != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grads_desc, cnnl_grads_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(boxes_desc, cnnl_boxes_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grads_image_desc, - cnnl_grads_image_desc); - - cnnlTensorDescriptor_t cnnl_argmax_x_desc = NULL; - cnnlTensorDescriptor_t cnnl_argmax_y_desc = NULL; - - if (pool_mode == 0) { - PARAM_CHECK("mluOpRoiAlignBackward_v2", argmax_x_desc != NULL); - PARAM_CHECK("mluOpRoiAlignBackward_v2", argmax_x != NULL); - 
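
Every CNNL binding removed by this patch follows the same four-step shape. A hedged schematic using the helper macros from cnnl_helper.h that the deleted sources themselves use; mluOpFoo and cnnlFoo are placeholders, not real entry points:

```cpp
#include "kernels/utils/cnnl_helper.h"

mluOpStatus_t MLUOP_WIN_API mluOpFoo(mluOpHandle_t handle,
                                     const mluOpTensorDescriptor_t x_desc,
                                     const void *x,
                                     const mluOpTensorDescriptor_t y_desc,
                                     void *y) {
  // 1) Null-check every handle, descriptor, and data pointer.
  PARAM_CHECK("[mluOpFoo]", handle != NULL);
  PARAM_CHECK("[mluOpFoo]", x_desc != NULL);
  PARAM_CHECK("[mluOpFoo]", x != NULL);
  PARAM_CHECK("[mluOpFoo]", y_desc != NULL);
  PARAM_CHECK("[mluOpFoo]", y != NULL);

  // 2) Wrap the mluOp handle and descriptors into CNNL counterparts.
  DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle);
  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(x_desc, cnnl_x_desc);
  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(y_desc, cnnl_y_desc);

  // 3) Delegate to CNNL and map its status onto an mluOp status.
  CHECK_FUNC_RETURN(cnnlFoo(cnnl_handle, cnnl_x_desc, x, cnnl_y_desc, y),
                    CNNL_STATUS_SUCCESS,
                    "[mluOpFoo] Internal error occurred in cnnlFoo.",
                    MLUOP_STATUS_INTERNAL_ERROR);

  // 4) Destroy the temporaries and hand back success.
  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc);
  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_y_desc);
  DESTROY_CNNL_HANDLE(cnnl_handle);
  return MLUOP_STATUS_SUCCESS;
}
```
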
PARAM_CHECK("mluOpRoiAlignBackward_v2", argmax_y_desc != NULL); - PARAM_CHECK("mluOpRoiAlignBackward_v2", argmax_y != NULL); - CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(argmax_x_desc, cnnl_argmax_x_desc); - CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(argmax_y_desc, cnnl_argmax_y_desc); - } - CHECK_FUNC_RETURN(cnnlRoiAlignBackward_v2( - cnnl_handle, cnnl_grads_desc, grads, cnnl_boxes_desc, - boxes, cnnl_argmax_x_desc, argmax_x, cnnl_argmax_y_desc, - argmax_y, spatial_scale, sampling_ratio, aligned, - pool_mode, cnnl_grads_image_desc, grads_image), - CNNL_STATUS_SUCCESS, - "[mluOpRoiAlignBackward_v2] Internal error accured in " - "mluOpRoiAlignBackward_v2.", - MLUOP_STATUS_INTERNAL_ERROR); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grads_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_boxes_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grads_image_desc); - if (pool_mode == 0) { - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_argmax_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_argmax_y_desc); - } - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/roi_pooling/sync_batchnorm/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp b/kernels/roi_pooling/sync_batchnorm/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp deleted file mode 100644 index 9f5068212..000000000 --- a/kernels/roi_pooling/sync_batchnorm/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp +++ /dev/null @@ -1,160 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API mluOpGetSyncBatchnormBackwardReduceWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t desc_x, - size_t *workspace_size) { - PARAM_CHECK("mluOpSyncBatchnormBackwardReduce_v2", handle != NULL); - PARAM_CHECK("mluOpSyncBatchnormBackwardReduce_v2", desc_x != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_x, cnnl_desc_x); - - CHECK_FUNC_RETURN( - cnnlGetSyncBatchnormBackwardReduceWorkspaceSize(cnnl_handle, cnnl_desc_x, - workspace_size), - CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchnormBackwardReduce_v2] Internal error" - " accured in mluOpGetSyncBatchnormBackwardReduceWorkspaceSize.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_x); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchnormBackwardReduce( - mluOpHandle_t handle, const mluOpTensorDescriptor_t desc_dz, const void *dz, - const mluOpTensorDescriptor_t desc_x, const void *x, - const mluOpTensorDescriptor_t desc_mean, const void *mean, - const mluOpTensorDescriptor_t desc_invstd, const void *invstd, - const mluOpTensorDescriptor_t desc_dfilter, void *dfilter, - const mluOpTensorDescriptor_t desc_dbias, void *dbias, - const mluOpTensorDescriptor_t desc_sum_dy, void *sum_dy, - const mluOpTensorDescriptor_t desc_sum_dy_xmu, void *sum_dy_xmu, - const bool needs_input_grad0, const bool needs_input_grad1, - const bool needs_input_grad2) { - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", handle != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_dz != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_x != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_mean != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_invstd != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", dz != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", x != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", mean != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", invstd != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_dz, cnnl_desc_dz); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_x, cnnl_desc_x); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_mean, cnnl_desc_mean); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_invstd, cnnl_desc_invstd); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_dfilter, cnnl_desc_dfilter); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_dbias, cnnl_desc_dbias); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_sum_dy, cnnl_desc_sum_dy); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_sum_dy_xmu, - cnnl_desc_sum_dy_xmu); - - CHECK_FUNC_RETURN( - cnnlSyncBatchnormBackwardReduce( - cnnl_handle, cnnl_desc_dz, dz, cnnl_desc_x, x, cnnl_desc_mean, mean, - cnnl_desc_invstd, invstd, cnnl_desc_dfilter, dfilter, cnnl_desc_dbias, - dbias, cnnl_desc_sum_dy, sum_dy, cnnl_desc_sum_dy_xmu, sum_dy_xmu, - needs_input_grad0, needs_input_grad1, needs_input_grad2), - CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchnormBackwardReduce] Internal error" - " accured in mluOpSyncBatchnormBackwardReduce.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_dz); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_x); - 
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_mean); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_invstd); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_dfilter); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_dbias); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_sum_dy); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_sum_dy_xmu); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchnormBackwardReduce_v2( - mluOpHandle_t handle, const mluOpTensorDescriptor_t desc_dz, const void *dz, - const mluOpTensorDescriptor_t desc_x, const void *x, - const mluOpTensorDescriptor_t desc_mean, const void *mean, - const mluOpTensorDescriptor_t desc_invstd, const void *invstd, - void *workspace, size_t workspace_size, - const mluOpTensorDescriptor_t desc_dfilter, void *dfilter, - const mluOpTensorDescriptor_t desc_dbias, void *dbias, - const mluOpTensorDescriptor_t desc_sum_dy, void *sum_dy, - const mluOpTensorDescriptor_t desc_sum_dy_xmu, void *sum_dy_xmu, - const bool needs_input_grad0, const bool needs_input_grad1, - const bool needs_input_grad2) { - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", handle != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_dz != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_x != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_mean != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", desc_invstd != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", dz != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", x != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", mean != NULL); - PARAM_CHECK("[mluOpSyncBatchnormBackwardReduce]", invstd != NULL); - if (workspace_size > 0) { - PARAM_CHECK("mluOpSyncBatchnormBackwardReduce_v2", workspace != NULL); - } - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_dz, cnnl_desc_dz); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_x, cnnl_desc_x); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_mean, cnnl_desc_mean); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_invstd, cnnl_desc_invstd); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_dfilter, cnnl_desc_dfilter); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_dbias, cnnl_desc_dbias); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_sum_dy, cnnl_desc_sum_dy); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(desc_sum_dy_xmu, - cnnl_desc_sum_dy_xmu); - - CHECK_FUNC_RETURN( - cnnlSyncBatchnormBackwardReduce_v2( - cnnl_handle, cnnl_desc_dz, dz, cnnl_desc_x, x, cnnl_desc_mean, mean, - cnnl_desc_invstd, invstd, workspace, workspace_size, - cnnl_desc_dfilter, dfilter, cnnl_desc_dbias, dbias, cnnl_desc_sum_dy, - sum_dy, cnnl_desc_sum_dy_xmu, sum_dy_xmu, needs_input_grad0, - needs_input_grad1, needs_input_grad2), - CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchnormBackwardReduce] Internal error" - " accured in mluOpSyncBatchnormBackwardReduce_v2.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_dz); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_x); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_mean); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_invstd); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_dfilter); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_dbias); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_sum_dy); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_desc_sum_dy_xmu); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git 
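
The _v2 reduce above is the workspace-taking variant and pairs with the GetWorkspaceSize query. A hedged host-side usage sketch: all descriptors and device pointers are assumed to be created and filled by the caller, and cnrtMalloc/cnrtFree are taken to be the CNRT device allocator calls; error handling is elided.

```cpp
#include "mlu_op.h"

mluOpStatus_t backward_reduce_v2_example(
    mluOpHandle_t handle, mluOpTensorDescriptor_t desc_dz, const void *dz,
    mluOpTensorDescriptor_t desc_x, const void *x,
    mluOpTensorDescriptor_t desc_mean, const void *mean,
    mluOpTensorDescriptor_t desc_invstd, const void *invstd,
    mluOpTensorDescriptor_t desc_dfilter, void *dfilter,
    mluOpTensorDescriptor_t desc_dbias, void *dbias,
    mluOpTensorDescriptor_t desc_sum_dy, void *sum_dy,
    mluOpTensorDescriptor_t desc_sum_dy_xmu, void *sum_dy_xmu) {
  // Query first, then allocate exactly that much device scratch.
  size_t workspace_size = 0;
  void *workspace = nullptr;
  mluOpGetSyncBatchnormBackwardReduceWorkspaceSize(handle, desc_x,
                                                   &workspace_size);
  if (workspace_size > 0) {
    cnrtMalloc(&workspace, workspace_size);
  }
  mluOpStatus_t status = mluOpSyncBatchnormBackwardReduce_v2(
      handle, desc_dz, dz, desc_x, x, desc_mean, mean, desc_invstd, invstd,
      workspace, workspace_size, desc_dfilter, dfilter, desc_dbias, dbias,
      desc_sum_dy, sum_dy, desc_sum_dy_xmu, sum_dy_xmu,
      /*needs_input_grad0=*/true, /*needs_input_grad1=*/true,
      /*needs_input_grad2=*/true);
  if (workspace != nullptr) cnrtFree(workspace);
  return status;
}
```
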
a/kernels/roi_pooling_backward/roi_pooling_backward.cpp b/kernels/roi_pooling_backward/roi_pooling_backward.cpp deleted file mode 100644 index 3fbcf6c57..000000000 --- a/kernels/roi_pooling_backward/roi_pooling_backward.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API mluOpRoiPoolingBackward( - mluOpHandle_t handle, mluOpPoolingMode_t pooling_mode, - const mluOpTensorDescriptor_t grads_desc, const void *grads, - const mluOpTensorDescriptor_t rois_desc, const void *rois, - const mluOpTensorDescriptor_t argmax_desc, const int *argmax, - const float spatial_scale, const mluOpTensorDescriptor_t grads_image_desc, - void *grads_image) { - PARAM_CHECK("[mluOpRoiPoolingBackward]", handle != NULL); - PARAM_CHECK("[mluOpRoiPoolingBackward]", grads_desc != NULL); - PARAM_CHECK("[mluOpRoiPoolingBackward]", grads != NULL); - PARAM_CHECK("[mluOpRoiPoolingBackward]", rois_desc != NULL); - PARAM_CHECK("[mluOpRoiPoolingBackward]", rois != NULL); - PARAM_CHECK("[mluOpRoiPoolingBackward]", argmax_desc != NULL); - PARAM_CHECK("[mluOpRoiPoolingBackward]", argmax != NULL); - PARAM_CHECK("[mluOpRoiPoolingBackward]", grads_image_desc != NULL); - PARAM_CHECK("[mluOpRoiPoolingBackward]", grads_image != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grads_desc, cnnl_grads_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(rois_desc, cnnl_rois_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(argmax_desc, cnnl_argmax_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(grads_image_desc, - cnnl_grads_image_desc); - - CHECK_FUNC_RETURN( - cnnlRoiPoolingBackward(cnnl_handle, cnnlPoolingMode_t(pooling_mode), - cnnl_grads_desc, grads, cnnl_rois_desc, rois, - cnnl_argmax_desc, argmax, spatial_scale, - cnnl_grads_image_desc, grads_image), - CNNL_STATUS_SUCCESS, - "[mluOpRoiPoolingBackward] Internal error" - " accured in mluOpRoiPoolingBackward.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grads_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_rois_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_argmax_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_grads_image_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - 
return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/roi_pooling_forward/roi_pooling_forward.cpp b/kernels/roi_pooling_forward/roi_pooling_forward.cpp deleted file mode 100644 index 389842d64..000000000 --- a/kernels/roi_pooling_forward/roi_pooling_forward.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API mluOpRoiPoolingForward( - mluOpHandle_t handle, mluOpPoolingMode_t pooling_mode, - const mluOpTensorDescriptor_t input_desc, const void *input, - const mluOpTensorDescriptor_t rois_desc, const void *rois, - float spatial_scale, const mluOpTensorDescriptor_t output_desc, - void *output, int *argmax) { - PARAM_CHECK("[mluOpRoiPoolingForward]", handle != NULL); - PARAM_CHECK("[mluOpRoiPoolingForward]", input_desc != NULL); - PARAM_CHECK("[mluOpRoiPoolingForward]", input != NULL); - PARAM_CHECK("[mluOpRoiPoolingForward]", rois_desc != NULL); - PARAM_CHECK("[mluOpRoiPoolingForward]", rois != NULL); - PARAM_CHECK("[mluOpRoiPoolingForward]", output_desc != NULL); - PARAM_CHECK("[mluOpRoiPoolingForward]", output != NULL); - PARAM_CHECK("[mluOpRoiPoolingForward]", argmax != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(rois_desc, cnnl_rois_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_desc, cnnl_output_desc); - - CHECK_FUNC_RETURN( - cnnlRoiPoolingForward(cnnl_handle, cnnlPoolingMode_t(pooling_mode), - cnnl_input_desc, input, cnnl_rois_desc, rois, - spatial_scale, cnnl_output_desc, output, argmax), - CNNL_STATUS_SUCCESS, - "[mluOpRoiPoolingForward] Internal error" - " accured in mluOpRoiPoolingForward.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_rois_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/roialign_forward/roialign_forward.cpp b/kernels/roialign_forward/roialign_forward.cpp deleted file mode 100644 index 6e42efde7..000000000 --- a/kernels/roialign_forward/roialign_forward.cpp +++ /dev/null @@ -1,102 +0,0 @@ 
-/************************************************************************* - * Copyright (C) [2022] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API -mluOpCreateRoiAlignForwardDescriptor(mluOpRoiAlignForwardDescriptor_t *desc) { - PARAM_CHECK("[mluOpRoiAlignForward_v2]", desc != NULL); - CHECK_FUNC_RETURN(cnnlCreateRoiAlignDescriptor(desc), CNNL_STATUS_SUCCESS, - "[mluOpRoiAlignForward_v2] Internal error accured in " - "mluOpCreateRoiAlignForwardDescriptor.", - MLUOP_STATUS_INTERNAL_ERROR); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API -mluOpDestroyRoiAlignForwardDescriptor(mluOpRoiAlignForwardDescriptor_t desc) { - PARAM_CHECK("[mluOpRoiAlignForward_v2]", desc != NULL); - CHECK_FUNC_RETURN(cnnlDestroyRoiAlignDescriptor(desc), CNNL_STATUS_SUCCESS, - "[mluOpRoiAlignForward_v2] Internal error accured in " - "mluOpDestroyRoiAlignForwardDescriptor.", - MLUOP_STATUS_INTERNAL_ERROR); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpSetRoiAlignForwardDescriptor_v2( - mluOpRoiAlignForwardDescriptor_t desc, const int pooled_height, - const int pooled_width, const int sampling_ratio, const float spatial_scale, - const int pool_mode, const bool aligned) { - PARAM_CHECK("[mluOpRoiAlignForward_v2]", desc != NULL); - CHECK_FUNC_RETURN(cnnlSetRoiAlignDescriptor_v2( - desc, pooled_height, pooled_width, sampling_ratio, - spatial_scale, pool_mode, aligned), - CNNL_STATUS_SUCCESS, - "[mluOpRoiAlignForward_v2] Internal error accured in " - "mluOpSetRoiAlignForwardDescriptor_v2.", - MLUOP_STATUS_INTERNAL_ERROR); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpRoiAlignForward_v2( - mluOpHandle_t handle, const mluOpRoiAlignForwardDescriptor_t roialign_desc, - const mluOpTensorDescriptor_t input_desc, const void *input, - const mluOpTensorDescriptor_t boxes_desc, const void *boxes, - const mluOpTensorDescriptor_t output_desc, void *output, - const mluOpTensorDescriptor_t argmax_x_desc, void *argmax_x, - const mluOpTensorDescriptor_t argmax_y_desc, void *argmax_y) { - PARAM_CHECK("mluOpRoiAlignForward_v2", handle != NULL); - PARAM_CHECK("mluOpRoiAlignForward_v2", roialign_desc != NULL); - PARAM_CHECK("mluOpRoiAlignForward_v2", input_desc != NULL); - PARAM_CHECK("mluOpRoiAlignForward_v2", boxes_desc != NULL); - PARAM_CHECK("mluOpRoiAlignForward_v2", output_desc 
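
The create/set/destroy trio above manages an opaque RoiAlign descriptor. A hedged lifecycle sketch with illustrative pooling parameters; per the deleted backward code, pool_mode == 0 is the variant that additionally requires argmax_x/argmax_y tensors:

```cpp
#include "mlu_op.h"

void roialign_descriptor_example() {
  mluOpRoiAlignForwardDescriptor_t roialign_desc = nullptr;
  mluOpCreateRoiAlignForwardDescriptor(&roialign_desc);
  // Parameter values here are illustrative, not mandated by the API.
  mluOpSetRoiAlignForwardDescriptor_v2(roialign_desc,
                                       /*pooled_height=*/7,
                                       /*pooled_width=*/7,
                                       /*sampling_ratio=*/2,
                                       /*spatial_scale=*/0.25f,
                                       /*pool_mode=*/1,
                                       /*aligned=*/true);
  // ... mluOpRoiAlignForward_v2(handle, roialign_desc, ...) goes here ...
  mluOpDestroyRoiAlignForwardDescriptor(roialign_desc);
}
```
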
!= NULL); - PARAM_CHECK("mluOpRoiAlignForward_v2", input != NULL); - PARAM_CHECK("mluOpRoiAlignForward_v2", boxes != NULL); - PARAM_CHECK("mluOpRoiAlignForward_v2", output != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(boxes_desc, cnnl_boxes_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(output_desc, cnnl_output_desc); - - cnnlTensorDescriptor_t cnnl_argmax_x_desc = NULL; - cnnlTensorDescriptor_t cnnl_argmax_y_desc = NULL; - CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(argmax_x_desc, cnnl_argmax_x_desc); - CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(argmax_y_desc, cnnl_argmax_y_desc); - CHECK_FUNC_RETURN( - cnnlRoiAlign_v2(cnnl_handle, roialign_desc, cnnl_input_desc, input, - cnnl_boxes_desc, boxes, cnnl_output_desc, output, - cnnl_argmax_x_desc, argmax_x, cnnl_argmax_y_desc, - argmax_y), - CNNL_STATUS_SUCCESS, - "[mluOpRoiAlignForward_v2] Internal error" - " accured in mluOpRoiAlignForward_v2.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_boxes_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_argmax_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_argmax_y_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/roi_pooling/sync_batchnorm/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp b/kernels/sync_batch_norm/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp similarity index 100% rename from kernels/roi_pooling/sync_batchnorm/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp rename to kernels/sync_batch_norm/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp diff --git a/kernels/roi_pooling/sync_batchnorm/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp b/kernels/sync_batch_norm/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp similarity index 100% rename from kernels/roi_pooling/sync_batchnorm/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp rename to kernels/sync_batch_norm/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp diff --git a/kernels/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp b/kernels/sync_batch_norm/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp similarity index 100% rename from kernels/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp rename to kernels/sync_batch_norm/sync_batchnorm_backward_reduce/sync_batchnorm_backward_reduce.cpp diff --git a/kernels/roi_pooling/sync_batchnorm/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp b/kernels/sync_batch_norm/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp similarity index 100% rename from kernels/roi_pooling/sync_batchnorm/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp rename to kernels/sync_batch_norm/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp diff --git a/kernels/roi_pooling/sync_batchnorm/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp b/kernels/sync_batch_norm/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp similarity index 100% rename from kernels/roi_pooling/sync_batchnorm/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp rename to 
kernels/sync_batch_norm/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp diff --git a/kernels/roi_pooling/sync_batchnorm/sync_batchnorm_stats/sync_batchnorm_stats.cpp b/kernels/sync_batch_norm/sync_batchnorm_stats/sync_batchnorm_stats.cpp similarity index 100% rename from kernels/roi_pooling/sync_batchnorm/sync_batchnorm_stats/sync_batchnorm_stats.cpp rename to kernels/sync_batch_norm/sync_batchnorm_stats/sync_batchnorm_stats.cpp diff --git a/kernels/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp b/kernels/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp deleted file mode 100644 index 80e6e829c..000000000 --- a/kernels/sync_batch_norm_backward_elemt/sync_batch_norm_backward_elemt.cpp +++ /dev/null @@ -1,83 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormBackwardElemt( - mluOpHandle_t handle, const mluOpTensorDescriptor_t diff_y_desc, - const void *diff_y, const mluOpTensorDescriptor_t x_desc, const void *x, - const mluOpTensorDescriptor_t mean_desc, const void *mean, - const mluOpTensorDescriptor_t invstd_desc, const void *invstd, - const mluOpTensorDescriptor_t filter_desc, const void *filter, - const mluOpTensorDescriptor_t mean_dy_desc, const void *mean_dy, - const mluOpTensorDescriptor_t mean_dy_xmu_desc, const void *mean_dy_xmu, - const mluOpTensorDescriptor_t diffcnnl_x_desc, void *diff_x) { - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", handle != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", diff_y_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", x_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", mean_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", invstd_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", mean_dy_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", mean_dy_xmu_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", diffcnnl_x_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", diff_y != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", x != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", mean != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", invstd != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", mean_dy != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", mean_dy_xmu != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemt]", diff_x != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(diff_y_desc, cnnl_diff_y_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(x_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mean_desc, cnnl_mean_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(invstd_desc, cnnl_invstd_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_desc, cnnl_filter_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mean_dy_desc, cnnl_mean_dy_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mean_dy_xmu_desc, - cnnl_mean_dy_xmu_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(diffcnnl_x_desc, - cnnl_diffcnnl_x_desc); - - CHECK_FUNC_RETURN( - cnnlSyncBatchNormBackwardElemt( - cnnl_handle, cnnl_diff_y_desc, diff_y, cnnl_x_desc, x, cnnl_mean_desc, - mean, cnnl_invstd_desc, invstd, cnnl_filter_desc, filter, - cnnl_mean_dy_desc, mean_dy, cnnl_mean_dy_xmu_desc, mean_dy_xmu, - cnnl_diffcnnl_x_desc, diff_x), - CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchNormBackwardElemt] Internal error" - " accured in mluOpSyncBatchNormBackwardElemt.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_diff_y_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mean_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_invstd_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_filter_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mean_dy_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mean_dy_xmu_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_diffcnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp 
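
For reference, the elementwise backward this wrapper forwards to is conventionally, assuming the PyTorch sync-BN formulation (the deleted source only forwards pointers and does not spell the math out): dx = (dy - mean_dy - (x - mean) * invstd^2 * mean_dy_xmu) * invstd * filter, where mean_dy and mean_dy_xmu are the per-channel means of dy and dy * (x - mean). A scalar sketch for one channel; the V2 variant that follows takes sum_dy, sum_dy_xmu, and count instead, i.e. the same means with the division by count left to the kernel:

```cpp
#include <cstddef>

// Hedged scalar model for one channel, assuming the PyTorch-style formula.
void sync_bn_backward_elemt_ref(const float *dy, const float *x, float mean,
                                float invstd, float filter, float mean_dy,
                                float mean_dy_xmu, float *dx, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    dx[i] = (dy[i] - mean_dy -
             (x[i] - mean) * invstd * invstd * mean_dy_xmu) *
            invstd * filter;
  }
}
```
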
b/kernels/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp deleted file mode 100644 index e7ce0d9b6..000000000 --- a/kernels/sync_batchnorm_backward_elemt_v2/sync_batchnorm_backward_elemt_v2.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormBackwardElemtV2( - mluOpHandle_t handle, const mluOpTensorDescriptor_t diff_y_desc, - const void *diff_y, const mluOpTensorDescriptor_t x_desc, const void *x, - const mluOpTensorDescriptor_t mean_desc, const void *mean, - const mluOpTensorDescriptor_t invstd_desc, const void *invstd, - const mluOpTensorDescriptor_t filter_desc, const void *filter, - const mluOpTensorDescriptor_t sum_dy_desc, const void *sum_dy, - const mluOpTensorDescriptor_t sum_dy_xmu_desc, const void *sum_dy_xmu, - const mluOpTensorDescriptor_t count_desc, const void *count, - const mluOpTensorDescriptor_t diffcnnl_x_desc, void *diff_x) { - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", handle != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", diff_y_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", x_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", mean_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", invstd_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", sum_dy_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", sum_dy_xmu_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", count_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", diffcnnl_x_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", diff_y != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", x != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", mean != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", invstd != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", sum_dy != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", sum_dy_xmu != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", count != NULL); - PARAM_CHECK("[mluOpSyncBatchNormBackwardElemtV2]", diff_x != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - 
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(diff_y_desc, cnnl_diff_y_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(x_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mean_desc, cnnl_mean_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(invstd_desc, cnnl_invstd_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_desc, cnnl_filter_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(sum_dy_desc, cnnl_sum_dy_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(sum_dy_xmu_desc, - cnnl_sum_dy_xmu_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(count_desc, cnnl_count_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(diffcnnl_x_desc, - cnnl_diffcnnl_x_desc); - - CHECK_FUNC_RETURN( - cnnlSyncBatchNormBackwardElemtV2( - cnnl_handle, cnnl_diff_y_desc, diff_y, cnnl_x_desc, x, cnnl_mean_desc, - mean, cnnl_invstd_desc, invstd, cnnl_filter_desc, filter, - cnnl_sum_dy_desc, sum_dy, cnnl_sum_dy_xmu_desc, sum_dy_xmu, - cnnl_count_desc, count, cnnl_diffcnnl_x_desc, diff_x), - CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchNormBackwardElemtV2] Internal error" - " accured in mluOpSyncBatchNormBackwardElemtV2.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_diff_y_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mean_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_invstd_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_filter_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_sum_dy_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_sum_dy_xmu_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_count_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_diffcnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp b/kernels/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp deleted file mode 100644 index 54d23d574..000000000 --- a/kernels/sync_batchnorm_elemt/sync_batchnorm_elemt.cpp +++ /dev/null @@ -1,73 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormElemt( - mluOpHandle_t handle, const mluOpTensorDescriptor_t x_desc, const void *x, - const mluOpTensorDescriptor_t mean_desc, const void *mean, - const mluOpTensorDescriptor_t invstd_desc, const void *invstd, - const mluOpTensorDescriptor_t filter_desc, const void *filter, - const mluOpTensorDescriptor_t bias_desc, const void *bias, - const mluOpTensorDescriptor_t y_desc, void *y) { - PARAM_CHECK("[mluOpSyncBatchNormElemt]", handle != NULL); - PARAM_CHECK("[mluOpSyncBatchNormElemt]", x_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormElemt]", mean_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormElemt]", invstd_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormElemt]", - (filter_desc != NULL && bias_desc != NULL) || - (filter_desc == NULL && bias_desc == NULL)); - PARAM_CHECK("[mluOpSyncBatchNormElemt]", y_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormElemt]", x != NULL); - PARAM_CHECK("[mluOpSyncBatchNormElemt]", mean != NULL); - PARAM_CHECK( - "[mluOpSyncBatchNormElemt]", - (filter != NULL && bias != NULL) || (filter == NULL && bias == NULL)); - PARAM_CHECK("[mluOpSyncBatchNormElemt]", invstd != NULL); - PARAM_CHECK("[mluOpSyncBatchNormElemt]", y != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(x_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mean_desc, cnnl_mean_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(invstd_desc, cnnl_invstd_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(filter_desc, cnnl_filter_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(bias_desc, cnnl_bias_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(y_desc, cnnl_y_desc); - - CHECK_FUNC_RETURN( - cnnlSyncBatchNormElemt(cnnl_handle, cnnl_x_desc, x, cnnl_mean_desc, mean, - cnnl_invstd_desc, invstd, cnnl_filter_desc, filter, - cnnl_bias_desc, bias, cnnl_y_desc, y), - CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchNormElemt] Internal error" - " accured in mluOpSyncBatchNormElemt.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mean_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_invstd_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_filter_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_bias_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_y_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp b/kernels/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp deleted file mode 100644 index e892d85b5..000000000 --- a/kernels/sync_batchnorm_gather_stats_with_counts/sync_batchnorm_gather_stats_with_counts.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormGatherStatsWithCounts( - mluOpHandle_t handle, const mluOpTensorDescriptor_t mean_all_desc, - const void *mean_all, const mluOpTensorDescriptor_t invstd_all_desc, - const void *invstd_all, const mluOpTensorDescriptor_t movingcnnl_mean_desc, - void *moving_mean, const mluOpTensorDescriptor_t moving_var_desc, - void *moving_var, float momentum, float eps, - const mluOpTensorDescriptor_t count_all_desc, const void *count_all, - const mluOpTensorDescriptor_t mean_desc, void *mean, - const mluOpTensorDescriptor_t invstd_desc, void *invstd) { - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", handle != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", - mean_all_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", - invstd_all_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", - count_all_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", mean_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", invstd_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", - (movingcnnl_mean_desc != NULL && moving_var_desc != NULL) || - (movingcnnl_mean_desc == NULL && moving_var_desc == NULL)); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", mean_all != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", invstd_all != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", - (moving_mean != NULL && moving_var != NULL) || - (moving_mean == NULL && moving_var == NULL)); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", count_all != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", mean != NULL); - PARAM_CHECK("[mluOpSyncBatchNormGatherStatsWithCounts]", invstd != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mean_all_desc, - cnnl_mean_all_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(invstd_all_desc, - cnnl_invstd_all_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(movingcnnl_mean_desc, - cnnl_movingcnnl_mean_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(moving_var_desc, - cnnl_moving_var_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(count_all_desc, - cnnl_count_all_desc); - 
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mean_desc, cnnl_mean_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(invstd_desc, cnnl_invstd_desc); - - CHECK_FUNC_RETURN( - cnnlSyncBatchNormGatherStatsWithCounts( - cnnl_handle, cnnl_mean_all_desc, mean_all, cnnl_invstd_all_desc, - invstd_all, cnnl_movingcnnl_mean_desc, moving_mean, - cnnl_moving_var_desc, moving_var, momentum, eps, cnnl_count_all_desc, - count_all, cnnl_mean_desc, mean, cnnl_invstd_desc, invstd), - CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchNormGatherStatsWithCounts] Internal error" - " accured in mluOpSyncBatchNormGatherStatsWithCounts.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mean_all_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_invstd_all_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_movingcnnl_mean_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_moving_var_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_count_all_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mean_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_invstd_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} diff --git a/kernels/sync_batchnorm_stats/sync_batchnorm_stats.cpp b/kernels/sync_batchnorm_stats/sync_batchnorm_stats.cpp deleted file mode 100644 index 35d53cf85..000000000 --- a/kernels/sync_batchnorm_stats/sync_batchnorm_stats.cpp +++ /dev/null @@ -1,112 +0,0 @@ -/************************************************************************* - * Copyright (C) [2023] by Cambricon, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- *************************************************************************/ -#include "kernels/utils/cnnl_helper.h" - -mluOpStatus_t MLUOP_WIN_API mluOpGetSyncBatchNormStatsWorkspaceSize( - mluOpHandle_t handle, const mluOpTensorDescriptor_t x_desc, - size_t *workspace_size) { - PARAM_CHECK("mluOpSyncBatchNormStats_v2", handle != NULL); - PARAM_CHECK("mluOpSyncBatchNormStats_v2", x_desc != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(x_desc, cnnl_x_desc); - - CHECK_FUNC_RETURN(cnnlGetSyncBatchNormStatsWorkspaceSize( - cnnl_handle, cnnl_x_desc, workspace_size), - CNNL_STATUS_SUCCESS, - "[mluOpSyncBatchNormStats_v2] Internal error" - " accured in mluOpGetSyncBatchNormStatsWorkspaceSize.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormStats( - mluOpHandle_t handle, const mluOpTensorDescriptor_t x_desc, const void *x, - const float eps, const mluOpTensorDescriptor_t mean_desc, void *mean, - const mluOpTensorDescriptor_t invstd_desc, void *invstd) { - PARAM_CHECK("[mluOpSyncBatchNormStats]", handle != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats]", x_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats]", mean_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats]", invstd_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats]", x != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats]", mean != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats]", invstd != NULL); - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(x_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mean_desc, cnnl_mean_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(invstd_desc, cnnl_invstd_desc); - - CHECK_FUNC_RETURN( - cnnlSyncBatchNormStats(cnnl_handle, cnnl_x_desc, x, eps, cnnl_mean_desc, - mean, cnnl_invstd_desc, invstd), - CNNL_STATUS_SUCCESS, - "[cnnlSyncBatchNormStats] Internal error" - " accured in cnnlSyncBatchNormStats.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mean_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_invstd_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -} - -mluOpStatus_t MLUOP_WIN_API mluOpSyncBatchNormStats_v2( - mluOpHandle_t handle, const mluOpTensorDescriptor_t x_desc, const void *x, - void *workspace, size_t workspace_size, const float eps, - const mluOpTensorDescriptor_t mean_desc, void *mean, - const mluOpTensorDescriptor_t invstd_desc, void *invstd) { - PARAM_CHECK("[mluOpSyncBatchNormStats_v2]", handle != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats_v2]", x_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats_v2]", mean_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats_v2]", invstd_desc != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats_v2]", x != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats_v2]", mean != NULL); - PARAM_CHECK("[mluOpSyncBatchNormStats_v2]", invstd != NULL); - if (workspace_size > 0) { - PARAM_CHECK("mluOpSyncBatchNormStats_v2", workspace != NULL); - } - - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(x_desc, cnnl_x_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(mean_desc, cnnl_mean_desc); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(invstd_desc, cnnl_invstd_desc); - - 
CHECK_FUNC_RETURN(cnnlSyncBatchNormStats_v2( - cnnl_handle, cnnl_x_desc, x, workspace, workspace_size, - eps, cnnl_mean_desc, mean, cnnl_invstd_desc, invstd), - CNNL_STATUS_SUCCESS, - "[cnnlSyncBatchNormStats_v2] Internal error" - " accured in cnnlSyncBatchNormStats_v2.", - MLUOP_STATUS_INTERNAL_ERROR); - - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_mean_desc); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_invstd_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - return MLUOP_STATUS_SUCCESS; -}
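The sync_batchnorm wrapper files removed above all share a single delegation pattern: validate every parameter, mirror the mluOp handle and tensor descriptors as CNNL objects, call the cnnl* counterpart, and destroy the mirrored objects before returning. A minimal sketch of that shape, assuming the helper macros from kernels/utils/cnnl_helper.h; mluOpFoo and cnnlFoo are hypothetical stand-ins, not real entry points:

/* Sketch only: mluOpFoo/cnnlFoo are placeholders illustrating the wrapper
 * shape shared by the deleted files. */
mluOpStatus_t MLUOP_WIN_API mluOpFoo(mluOpHandle_t handle,
                                     const mluOpTensorDescriptor_t x_desc,
                                     const void *x,
                                     const mluOpTensorDescriptor_t y_desc,
                                     void *y) {
  // 1. Null-check everything up front.
  PARAM_CHECK("[mluOpFoo]", handle != NULL);
  PARAM_CHECK("[mluOpFoo]", x_desc != NULL && x != NULL);
  PARAM_CHECK("[mluOpFoo]", y_desc != NULL && y != NULL);

  // 2. Mirror the mluOp handle and descriptors as CNNL objects.
  DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle);
  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(x_desc, cnnl_x_desc);
  DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(y_desc, cnnl_y_desc);

  // 3. Delegate, translating the CNNL status into an mluOp status.
  CHECK_FUNC_RETURN(cnnlFoo(cnnl_handle, cnnl_x_desc, x, cnnl_y_desc, y),
                    CNNL_STATUS_SUCCESS,
                    "[mluOpFoo] Internal error occurred in cnnlFoo.",
                    MLUOP_STATUS_INTERNAL_ERROR);

  // 4. Destroy the mirrored objects and report success.
  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_x_desc);
  DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_y_desc);
  DESTROY_CNNL_HANDLE(cnnl_handle);
  return MLUOP_STATUS_SUCCESS;
}

The remainder of the patch retags the public header so that entry points are grouped by feature family rather than by individual operator.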
diff --git a/mlu_op.h b/mlu_op.h index 3f61b45ed..e7331425e 100644 --- a/mlu_op.h +++ b/mlu_op.h @@ -1235,7 +1235,7 @@ typedef struct mluOpCarafeStruct *mluOpCarafeDescriptor_t; mluOpStatus_t MLUOP_WIN_API mluOpCreateTensorDescriptor(mluOpTensorDescriptor_t *desc); -// Group: GetIndicePairs +// Group: SparseConv /*! * @brief Creates a tensor descriptor pointed by \b desc that holds the dimensions, pad, stride, * dilation, sub_m, transpose, inverse and layout of input filter and output tensor shape. @@ -1274,7 +1274,7 @@ mluOpCreateTensorDescriptor(mluOpTensorDescriptor_t *desc); mluOpStatus_t MLUOP_WIN_API mluOpCreateSparseConvolutionDescriptor(mluOpSparseConvolutionDescriptor_t *desc); -// Group: GetIndicePairs +// Group: SparseConv /*! * @brief Destroys a convolution descriptor \b desc that was previously created with the * ::mluOpCreateSparseConvolutionDescriptor function. @@ -1517,7 +1517,7 @@ mluOpSetTensorDescriptor_v2(mluOpTensorDescriptor_t desc, int dimNb, const int64_t dimSize[]); -// Group: GetIndicePairs +// Group: SparseConv /*! * @brief Initializes the sparse convolution descriptor \b desc that was previously created * with ::mluOpCreateSparseConvolutionDescriptor, and sets the information @@ -1620,7 +1620,7 @@ mluOpSetSparseConvolutionDescriptor(mluOpSparseConvolutionDescriptor_t desc, const int transpose, const int inverse); -// Group: GetIndicePairs +// Group: SparseConv /*! * @brief Obtains the parameter num_act_out from ::mluOpSparseConvolutionDescriptor_t. * @@ -3487,7 +3487,7 @@ mluOpDiv(mluOpHandle_t handle, const mluOpTensorDescriptor_t z_desc, void *z); -// Group: DynamicPointToVoxelBackward +// Group: DynamicPointToVoxel /*! * @brief Gets extra space size for the DynamicPointToVoxelBackward operation. * @@ -3551,7 +3551,7 @@ mluOpGetDynamicPointToVoxelBackwardWorkspaceSize(const mluOpHandle_t handle, const mluOpTensorDescriptor_t voxel_num_desc, size_t *workspace_size); -// Group: DynamicPointToVoxelBackward +// Group: DynamicPointToVoxel /*! * @brief Performs the back-propagation of DynamicPointToVoxelForward * operation to compute the gradient for input \b grad_voxel_feats @@ -4834,7 +4834,7 @@ mluOpPsRoiPoolBackward(mluOpHandle_t handle, const mluOpTensorDescriptor_t bottom_grad_desc, void *bottom_grad); -// Group: RoiAlignForward +// Group: RoiAlign /*! * @brief Creates a descriptor pointed by \b desc for ::mluOpRoiAlignForward_v2, * and allocates memory for holding the information about the function. * @@ -4872,7 +4872,7 @@ mluOpPsRoiPoolBackward(mluOpHandle_t handle, mluOpStatus_t MLUOP_WIN_API mluOpCreateRoiAlignForwardDescriptor(mluOpRoiAlignForwardDescriptor_t *desc); -// Group: RoiAlignForward +// Group: RoiAlign /*! * @brief Initializes the descriptor \b desc that was previously created with * ::mluOpCreateRoiAlignForwardDescriptor function, and sets RoiAlign information * @@ -4932,7 +4932,7 @@ mluOpSetRoiAlignForwardDescriptor_v2(mluOpRoiAlignForwardDescriptor_t roialign_d const int pool_mode, const bool aligned); -// Group: RoiAlignForward +// Group: RoiAlign /*! * @brief Destroys a RoiAlign descriptor \b desc that was previously created * with ::mluOpCreateRoiAlignForwardDescriptor function. @@ -4972,7 +4972,7 @@ mluOpSetRoiAlignForwardDescriptor_v2(mluOpRoiAlignForwardDescriptor_t roialign_d mluOpStatus_t MLUOP_WIN_API mluOpDestroyRoiAlignForwardDescriptor(mluOpRoiAlignForwardDescriptor_t desc); -// Group: RoiAlignForward +// Group: RoiAlign /*! * @brief Computes the output feature map \b output based on the input feature map \b input * and bounding boxes \b boxes to perform this function. This function supports @@ -6500,7 +6500,7 @@ mluOpBboxOverlaps(mluOpHandle_t handle, const mluOpTensorDescriptor_t ious_desc, void *ious); -// Group: ThreeInterpolate +// Group: ThreeInterpolate /*! * @brief Computes weighted linear interpolation on 3 points by using * 3 indices in \b indices to select 3 points in \b features, uses the @@ -6596,7 +6596,7 @@ mluOpThreeInterpolateForward(mluOpHandle_t handle, const mluOpTensorDescriptor_t output_desc, void *output); -// Group: ThreeInterpolate +// Group: ThreeInterpolate /*! * @brief Computes the gradients of feature map \b grad_features based on the * inputs \b grad_output , \b indices , and \b weights to perform the backpropagation @@ -6685,7 +6685,7 @@ mluOpThreeInterpolateBackward(mluOpHandle_t handle, const mluOpTensorDescriptor_t grad_features_desc, void *grad_features); -// Group: Ballquery +// Group: BallQuery /*! * @brief Takes the point's index in the \b new_xyz set as the center of the sphere, * uses \b min_radius and \b max_radius as the radius, and returns the \b idx of @@ -7173,7 +7173,7 @@ mluOpMaskedIm2colForward(mluOpHandle_t handle, const mluOpTensorDescriptor_t data_col_desc, void *data_col); -// Group: MoeDispatchBackwardData +// Group: MoeDispatch /*! * @brief Calculates the inverse gradient of \b input tensor, and returns the results in the output * tensor \b grad_input. @@ -7395,7 +7395,7 @@ mluOpMsDeformAttnBackward(mluOpHandle_t handle, const mluOpTensorDescriptor_t grad_attn_weight_desc, void *grad_attn_weight); -// Group: MutualInformationBackward +// Group: MutualInformation /*! * @brief Returns the size of the MLU memory as an extra workspace * to optimize ::mluOpMutualInformationBackward. @@ -7457,7 +7457,7 @@ mluOpGetMutualInformationBackwardWorkspaceSize(mluOpHandle_t handle, const bool overwrite_ans_grad, size_t *workspace_size); -// Group: MutualInformationBackward +// Group: MutualInformation /*! * @brief Computes the gradients of tensor \b px and tensor \b py. * @@ -7575,7 +7575,7 @@ mluOpMutualInformationBackward(mluOpHandle_t handle, const mluOpTensorDescriptor_t py_grad_desc, void *py_grad); -// Group: MutualInformationForward +// Group: MutualInformation /*! * @brief Returns the size of the MLU memory as an extra workspace * to optimize ::mluOpMutualInformationForward. @@ -7634,7 +7634,7 @@ mluOpGetMutualInformationForwardWorkspaceSize(mluOpHandle_t handle, const mluOpTensorDescriptor_t ans_desc, size_t *workspace_size); -// Group: MutualInformationForward +// Group: MutualInformation /*! * @brief Computes mutual information between tensor \b px and tensor \b py. *
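The Get*WorkspaceSize entry points retagged above all share one calling convention: query the required byte count first, allocate at least that much device memory, then hand both the pointer and the size to the compute API. A hedged usage sketch built on the SyncBatchNormStats pair, whose full signatures appear elsewhere in this patch; cnrtMalloc and cnrtFree are assumed to be the CNRT device-memory calls, and a real caller would synchronize the queue before freeing:

// Usage sketch of the query-allocate-call workspace convention, assuming an
// initialized handle and prepared descriptors.
mluOpStatus_t SyncBatchNormStatsWithWorkspace(
    mluOpHandle_t handle, const mluOpTensorDescriptor_t x_desc, const void *x,
    const float eps, const mluOpTensorDescriptor_t mean_desc, void *mean,
    const mluOpTensorDescriptor_t invstd_desc, void *invstd) {
  size_t workspace_size = 0;
  void *workspace = NULL;

  // Step 1: ask the library how much scratch space this shape needs.
  mluOpStatus_t status =
      mluOpGetSyncBatchNormStatsWorkspaceSize(handle, x_desc, &workspace_size);
  if (status != MLUOP_STATUS_SUCCESS) {
    return status;
  }

  // Step 2: allocate the workspace on the device (the size may be zero).
  if (workspace_size > 0) {
    cnrtMalloc(&workspace, workspace_size);  // assumed CNRT call
  }

  // Step 3: pass both the pointer and the size to the _v2 compute API.
  status = mluOpSyncBatchNormStats_v2(handle, x_desc, x, workspace,
                                      workspace_size, eps, mean_desc, mean,
                                      invstd_desc, invstd);

  // Sketch simplification: synchronize the queue before freeing in real code.
  if (workspace != NULL) {
    cnrtFree(workspace);
  }
  return status;
}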
@@ -8450,7 +8450,7 @@ mluOpPsamaskBackward(mluOpHandle_t handle, const mluOpTensorDescriptor_t dx_desc, void *dx); -// Group: GetIndicePairs +// Group: SparseConv /*! * @brief Computes the get_indice_pairs operation, then returns the results in the output * tensor \b out_indices , \b indice_pairs and \b indice_num. @@ -8549,7 +8549,7 @@ mluOpGetIndicePairs(mluOpHandle_t handle, const mluOpTensorDescriptor_t indice_num_desc, void *indice_num); -// Group: GetIndicePairs +// Group: SparseConv /*! * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra workspace * to optimize the get_indice_pairs operation. @@ -8620,7 +8620,7 @@ mluOpGetIndicePairsWorkspaceSize(mluOpHandle_t handle, const mluOpTensorDescriptor_t indice_num_desc, size_t *workspace_size); -// Group: ActiveRotatedFilterForward +// Group: ActiveRotatedFilter /*! * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra * workspace to optimize ::mluOpActiveRotatedFilterForward. The size of the extra * @@ -8666,7 +8666,7 @@ mluOpGetActiveRotatedFilterForwardWorkspaceSize(const mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, size_t *workspace_size); -// Group: ActiveRotatedFilterForward +// Group: ActiveRotatedFilter /*! * @brief Rotates \b input according to \b indices. This function encodes * the orientation information and generates orientation-sensitive features. @@ -9276,7 +9276,7 @@ mluOpBorderAlignBackward(mluOpHandle_t handle, const mluOpTensorDescriptor_t grad_input_desc, void *grad_input); -// Group: IndiceConvolutionBackwardData +// Group: SparseConv /*! * @brief Returns in \b workspace_size the size of the MLU memory that is used as * an extra workspace to optimize the indice convolution backward data operation. * @@ -9349,7 +9349,7 @@ mluOpGetIndiceConvolutionBackwardDataWorkspaceSize(mluOpHandle_t handle, const int64_t inverse, size_t *workspace_size); -// Group: IndiceConvolutionBackwardData +// Group: SparseConv /*! * @brief Performs the back propagation of an indice convolution operation to * compute the gradient of input \b input_grad based on the gradient of response * @@ -9489,7 +9489,7 @@ mluOpIndiceConvolutionBackwardData(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_grad_desc, void *input_grad); -// Group: IndiceConvolutionBackwardFilter +// Group: SparseConv /*! * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra workspace * to optimize the indice_convolution_backward_filter operation. @@ -9562,7 +9562,7 @@ mluOpGetIndiceConvolutionBackwardFilterWorkspaceSize(mluOpHandle_t handle, const int64_t sub_m, size_t *workspace_size); -// Group: IndiceConvolutionBackwardFilter +// Group: SparseConv /*! * @brief Computes the indice_convolution_backward_filter operation, then returns the results in the output * tensor \b filters_grad. @@ -9843,7 +9843,7 @@ mluOpRoiPointPool3d(mluOpHandle_t handle, const mluOpTensorDescriptor_t pooled_empty_flag_desc, void *pooled_empty_flag); -// Group: ThreeNNForward +// Group: ThreeNN /*! * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra * workspace to optimize ::mluOpThreeNNForward. The size of the extra workspace is * @@ -9889,7 +9889,7 @@ mluOpGetThreeNNForwardWorkspaceSize(const mluOpHandle_t handle, const mluOpTensorDescriptor_t known_desc, size_t *workspace_size);
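The SparseConv retags above cover a descriptor-driven API: a sparse convolution descriptor is created, filled with the convolution geometry, consumed by ::mluOpGetIndicePairsWorkspaceSize and ::mluOpGetIndicePairs, and finally destroyed. A lifecycle sketch follows; because this patch abbreviates the full argument list of ::mluOpSetSparseConvolutionDescriptor, the sketch only marks where that call belongs, and the destroy counterpart is assumed to be named ::mluOpDestroySparseConvolutionDescriptor, matching the create/destroy pairs elsewhere in the header:

// Lifecycle sketch for the SparseConv descriptor API. The set call is left
// as a comment because its full parameter list is elided in this patch;
// mluOpDestroySparseConvolutionDescriptor is an assumed counterpart name.
mluOpStatus_t GetIndicePairsLifecycle(mluOpHandle_t handle) {
  mluOpSparseConvolutionDescriptor_t sparse_conv_desc = NULL;
  mluOpStatus_t status =
      mluOpCreateSparseConvolutionDescriptor(&sparse_conv_desc);
  if (status != MLUOP_STATUS_SUCCESS) {
    return status;
  }

  // mluOpSetSparseConvolutionDescriptor(sparse_conv_desc, ...) goes here,
  // supplying the pad/stride/dilation/sub_m/transpose/inverse geometry
  // described in the hunks above.

  // mluOpGetIndicePairsWorkspaceSize(...) and mluOpGetIndicePairs(...) then
  // consume the descriptor together with the indices, indice_pairs and
  // indice_num tensor descriptors, following the same workspace convention
  // sketched earlier.

  return mluOpDestroySparseConvolutionDescriptor(sparse_conv_desc);
}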
-// Group: ThreeNNForward +// Group: ThreeNN /*! * @brief Finds the closest 3 points of \b unknown among \b known, and outputs \b dist and index * \b idx tensor. This function first computes the dist of each known point to an unknown point, and @@ -9971,7 +9971,7 @@ mluOpThreeNNForward(const mluOpHandle_t handle, const mluOpTensorDescriptor_t idx_desc, void *idx); -// Group: IndiceConvolutionForward +// Group: SparseConv /*! * @brief Returns in \b workspace_size the size of the MLU memory which is used as an extra workspace * to boost up indice_convolution_forward computation. @@ -10047,7 +10047,7 @@ mluOpGetIndiceConvolutionForwardWorkspaceSize(mluOpHandle_t handle, const int64_t sub_m, size_t *workspace_size); -// Group: IndiceConvolutionForward +// Group: SparseConv /*! * @brief Performs convolution on input sparse tensor \b features with kernel \b filters, * then returns the output sparse tensor \b features_out. @@ -10164,7 +10164,7 @@ mluOpIndiceConvolutionForward(mluOpHandle_t handle, const mluOpTensorDescriptor_t features_out_desc, void *features_out); -// Group: MoeDispatchForward +// Group: MoeDispatch /*! * @brief Dispatches the order of \b input tensor, and returns the * results in the output tensor \b dispatch in the MoE algorithm. @@ -10270,7 +10270,7 @@ mluOpMoeDispatchForward(mluOpHandle_t handle, const mluOpTensorDescriptor_t dispatch_desc, void *dispatch); -// Group: MoeDispatchBackwardGate +// Group: MoeDispatch /*! * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra workspace * to optimize the moe_dispatch_backward_gate operation. @@ -10317,7 +10317,7 @@ mluOpGetMoeDispatchBackwardGateWorkspaceSize(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, size_t *workspace_size); -// Group: MoeDispatchBackwardGate +// Group: MoeDispatch /*! * @brief Calculates the inverse gradient of \b gates tensor, and returns the results in the output * tensor \b grad_gates. @@ -10497,7 +10497,7 @@ mluOpPointsInBoxes(mluOpHandle_t handle, const mluOpTensorDescriptor_t points_indices_desc, void *points_indices); -// Group: RoiAlignBackward +// Group: RoiAlign /*! * @brief Computes the gradients of images \b grads_image using the gradients \b grads and * bounding boxes \b boxes to perform the backpropagation of ::mluOpRoiAlignForward_v2 * @@ -10593,7 +10593,7 @@ mluOpRoiAlignBackward(mluOpHandle_t handle, const mluOpTensorDescriptor_t grads_image_desc, void *grads_image); -// Group: RoiAlignBackward +// Group: RoiAlign /*! * @brief Computes the gradients of images \b grads_image based on the gradients \b grads, * bounding boxes \b boxes, the coordinate of x axis \b argmax_x, and the coordinate of y axis * @@ -10731,7 +10731,7 @@ mluOpRoiAlignBackward_v2(mluOpHandle_t handle, const mluOpTensorDescriptor_t grads_image_desc, void *grads_image); -// Group: MsDeformAttnForward +// Group: MsDeformAttn /*! * @brief Implements a multi-scale deformable attention module used in Deformable-Detr. * For detailed information about Deformable-Detr, see "Deformable DETR: Deformable * @@ -10938,7 +10938,7 @@ mluOpTinShiftForward(mluOpHandle_t handle, const mluOpTensorDescriptor_t output_desc, void *output); -// Group: MaskedCol2im +// Group: MaskedIm2Col /*! * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra workspace to * optimize the MaskedCol2imForward operation. @@ -10993,7 +10993,7 @@ mluOpGetMaskedCol2imForwardWorkspaceSize(mluOpHandle_t handle, const mluOpTensorDescriptor_t im_desc, size_t *workspace_size); -// Group: MaskedCol2im +// Group: MaskedIm2Col /*!
* @brief Copies the data of the input tensor \b col to the special coordinates by combining \b mask_h_idx tensor * and \b mask_w_idx tensor of output tensor \b im. @@ -11085,7 +11085,7 @@ mluOpMaskedCol2imForward(mluOpHandle_t handle, const mluOpTensorDescriptor_t im_desc, void *im); -// Group: DiffIouRotatedSortVerticesForward +// Group: DiffIouRotatedSortVertices /*! * @brief Sorts the effective vertices of the polygon formed by the intersection of two boxes, * and outputs the sorted vertex index. @@ -11157,7 +11157,7 @@ mluOpDiffIouRotatedSortVerticesForward(mluOpHandle_t handle, const mluOpTensorDescriptor_t idx_desc, void *idx); -// Group: RoiPoolingForward +// Group: RoiPooling /*! * @brief Generates a fixed size feature map and input feature index * of argmax for each ROI (Regions of Interest) to perform ::mluOpRoiPoolingForward operation. @@ -11304,7 +11304,7 @@ mluOpRoiPoolingForward(mluOpHandle_t handle, void *output, int *argmax); -// Group: RoiPoolingBackward +// Group: RoiPooling /*! * @brief Computes the gradients of image \b grads_image based on the gradients \b grads and * region proposals \b rois to perform the backpropagation of ::mluOpRoiPoolingForward operation. @@ -11406,7 +11406,7 @@ mluOpRoiPoolingBackward(mluOpHandle_t handle, const mluOpTensorDescriptor_t grads_image_desc, void *grads_image); -// Group: SyncBatchNormStats +// Group: SyncBatchNorm /*! * @brief Returns in \b workspace_size the size of the MLU memory that is used as an extra * workspace to optimize ::mluOpSyncBatchNormStats_v2 operation. @@ -11454,7 +11454,7 @@ mluOpGetSyncBatchNormStatsWorkspaceSize(mluOpHandle_t handle, const mluOpTensorDescriptor_t x_desc, size_t *workspace_size); -// Group: SyncBatchNormStats +// Group: SyncBatchNorm /*! * @brief Computes the local mean and the local inverse standard deviation for each channel * across a batch of data in the training scenario. @@ -11549,7 +11549,7 @@ mluOpSyncBatchNormStats_v2(mluOpHandle_t handle, const mluOpTensorDescriptor_t invstd_desc, void *invstd); -// Group: SyncBatchNormStats +// Group: SyncBatchNorm /*! * @brief Computes the local mean and the local inverse standard deviation for each channel * across a batch of data in the training scenario. @@ -11633,7 +11633,7 @@ mluOpSyncBatchNormStats(mluOpHandle_t handle, const mluOpTensorDescriptor_t invstd_desc, void *invstd); -// Group: SyncBatchNormGatherStatsWithCounts +// Group: SyncBatchNorm /*! * @brief Computes the global mean and the global inverse standard deviation across aggregation * of the local mean and local inverse standard deviation of multiple MLU devices. @@ -11762,7 +11762,7 @@ mluOpSyncBatchNormGatherStatsWithCounts(mluOpHandle_t handle, const mluOpTensorDescriptor_t invstd_desc, void *invstd); -// Group: SyncBatchNormElemt +// Group: SyncBatchNorm /*! * @brief Applies Batch Normalization for each channel across a batch of data with the given mean, * inverse variance and scaling factors. @@ -12641,7 +12641,7 @@ mluOpSyncBatchnormBackwardReduce(mluOpHandle_t handle, const bool needs_input_grad1, const bool needs_input_grad2); -// Group: SyncBatchNormBackwardElemt +// Group: SyncBatchNorm /*! * @brief Computes the gradients of input in the training scenario. * @@ -12769,7 +12769,7 @@ mluOpSyncBatchNormBackwardElemt(mluOpHandle_t handle, const mluOpTensorDescriptor_t diff_x_desc, void *diff_x); -// Group: SyncBatchNormBackwardElemt +// Group: SyncBatchNorm /*! * @brief Computes the gradients of input in the training scenario. *