From 5445c9def0090738377d72d87cb310bdc949cd48 Mon Sep 17 00:00:00 2001 From: mahxn0 <1262384588@qq.com> Date: Tue, 3 Dec 2024 18:52:12 +0800 Subject: [PATCH 1/7] [Fix](mlu-ops): modify common func. (#1167) --- kernels/kernel.h | 2 +- .../ms_deform_attn_backward_fast_union1.mlu | 4 ++-- .../ms_deform_attn_forward/ms_deform_attn_utils.h | 2 +- .../ms_deform_attn_forward/msda_forward_fast_union1.mlu | 4 ++-- kernels/sparse_conv/get_indice_pairs/get_indice_pairs_utils.h | 2 +- kernels/utils/common.h | 2 +- .../pb_gtest/src/internal_kernel/fill_llc/fill_llc_device.mlu | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/kernels/kernel.h b/kernels/kernel.h index d1a6e96fb..9378f0839 100644 --- a/kernels/kernel.h +++ b/kernels/kernel.h @@ -31,7 +31,7 @@  * Macros for mluop kernels  ******************************************************************************/ // in future, can be "__BANG_ARCH__ == 592 || __BANG_ARCH__ == xxx || ...)" -#define ARCH_SUPPORT_LARGE_TENSOR (__BANG_ARCH__ == 592) +#define ARCH_SUPPORT_LARGE_TENSOR (__BANG_ARCH__ >= 592) #define MAX_WRAM_SIZE (__MLU_WRAM_SIZE__ * 1024) #define WRAM_LT_STRIDE (__MLU_WRAM_SIZE__ * 1024 / 64) diff --git a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu index 21ee0b40d..b72087481 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu @@ -26,7 +26,7 @@ #include "core/logging.h" -#if (__BANG_ARCH__ == 592) +#if (__BANG_ARCH__ >= 592) #define MAX_MEMCPY_SEGNUM (65536) #define NRAM_REMAIN_SIZE (48 * 1024) @@ -454,7 +454,7 @@ __mlu_global__ void MLUUnion1KernelMsDeformAttnBackwardFastKernel( const int32_t channels, const int32_t num_levels, const int32_t num_query, const int32_t num_points, float* grad_value, float* grad_sampling_loc, float* grad_attn_weight) { -#if (__BANG_ARCH__ == 592) +#if (__BANG_ARCH__ >= 592) using T = float; const int32_t num_keys = spatial_size; const int32_t input_stride_4 = diff --git a/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h b/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h index 122d2d35e..0f4b4dd17 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h +++ b/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h @@ -376,7 +376,7 @@ __mlu_func__ void stageOneLoop( } #endif -#if (__BANG_ARCH__ == 592) +#if (__BANG_ARCH__ >= 592) __mlu_func__ void gatherAsync(void* dst, void* src, unsigned int* offset, void* mask, int transfer_size, mluMemcpyDirection_t dir, int dst_stride, diff --git a/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu index b21af0a0e..2d29981e2 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu @@ -906,7 +906,7 @@ __mlu_func__ void MLUKernelMsDeformAttnForwardFastImpl( } } -#if (__BANG_ARCH__ == 592) +#if (__BANG_ARCH__ >= 592) /* The shape of each tensor on nram: @@ -1260,7 +1260,7 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardFast( } #endif -#if (__BANG_ARCH__ == 592) +#if (__BANG_ARCH__ >= 592) MLUKernelMsDeformAttnForwardFastImpl( data_value_gdram, data_spatial_shapes_gdram, data_level_start_index_gdram, 
data_sampling_loc_gdram, data_attn_weight_gdram, batch_size, num_keys, diff --git a/kernels/sparse_conv/get_indice_pairs/get_indice_pairs_utils.h b/kernels/sparse_conv/get_indice_pairs/get_indice_pairs_utils.h index 52a135c7f..259b67e8b 100644 --- a/kernels/sparse_conv/get_indice_pairs/get_indice_pairs_utils.h +++ b/kernels/sparse_conv/get_indice_pairs/get_indice_pairs_utils.h @@ -76,7 +76,7 @@ func: generate stage index from start_index */ __mlu_func__ void stepIndex(int32_t *dst_nram, int32_t start_index, int32_t length) { -#if (__BANG_ARCH__ == 372 || __BANG_ARCH__ == 322 || __BANG_ARCH__ == 592) +#if __BANG_ARCH__ >= 372 int32_t align_num = 128; int32_t repeat = (int32_t)(logf(length / align_num) / logf(2)); int32_t remain = length / align_num - powf(2, repeat); diff --git a/kernels/utils/common.h b/kernels/utils/common.h index bceb8ccd4..c6bd1aead 100644 --- a/kernels/utils/common.h +++ b/kernels/utils/common.h @@ -419,7 +419,7 @@ __mlu_func__ void __mluop_store_str_3D(T *dst, T *src, int size, int seg_num_in, * dst_nram only support nram. * ****************************************************************************/ __mlu_func__ void __mluop_get_stage_indices_tfuse(int *dst_nram, int length) { -#if (__BANG_ARCH__ == 372 || __BANG_ARCH__ == 592) +#if __BANG_ARCH__ >= 372 int align_num = 128; int repeat = (int)(logf(length / align_num) / logf(2)); int remain = length / align_num - powf(2, repeat); diff --git a/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_llc/fill_llc_device.mlu b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_llc/fill_llc_device.mlu index c5b9077e7..fbb93ddb1 100644 --- a/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_llc/fill_llc_device.mlu +++ b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_llc/fill_llc_device.mlu @@ -24,7 +24,7 @@ #include "kernels/kernel.h" // MAX_NRAM_SIZE __mlu_global__ void flushLLC(void* input, int fill_bytes) { -#if (__BANG_ARCH__ == 592) +#if (__BANG_ARCH__ >= 592) if (coreId != 0) { return; } From 3a674cc184e9ae8fbd010facf8bb8a2aab5557ff Mon Sep 17 00:00:00 2001 From: duzekun Date: Wed, 4 Dec 2024 14:49:07 +0800 Subject: [PATCH 2/7] [Docs](mlu-ops): Update version date. (#1171) Co-authored-by: duzekun --- docs/api_guide/update.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api_guide/update.rst b/docs/api_guide/update.rst index 1d4b32913..3c60f6b86 100755 --- a/docs/api_guide/update.rst +++ b/docs/api_guide/update.rst @@ -5,7 +5,7 @@ This section lists contents that were made for each product release. 
* V1.4.0 - **Date:** October 21, 2024 + **Date:** November 29, 2024 **Changes:** From 59eae84eade1ed8f9928091e8ba5733c59086a84 Mon Sep 17 00:00:00 2001 From: PetrelYy <92866578+PetrelYy@users.noreply.github.com> Date: Wed, 4 Dec 2024 15:12:40 +0800 Subject: [PATCH 3/7] [Feature](mlu-ops): adapt scatter,gather (#1168) --- .../box_iou_rotated/box_iou_rotated_utils.h | 28 +++--- .../generate_proposals_v2_union1_500.mlu | 1 + .../ms_deform_attn_backward_fast_union1.mlu | 28 +++--- ...rm_attn_backward_small_channels_union1.mlu | 38 ++++---- .../ms_deform_attn_utils.h | 14 +-- .../msda_forward_fast_union1.mlu | 50 +++++------ .../roi_align_rotated_forward_vector.mlu | 10 +-- kernels/utils/scatter_gather.h | 90 +++++++++++++++++++ .../voxel_pooling_forward_union1.mlu | 10 ++- kernels/voxelization/voxelization_kernel.mlu | 20 +++-- 10 files changed, 191 insertions(+), 98 deletions(-) create mode 100644 kernels/utils/scatter_gather.h diff --git a/kernels/box_iou_rotated/box_iou_rotated_utils.h b/kernels/box_iou_rotated/box_iou_rotated_utils.h index 7c3e8d270..22aa3e0ec 100644 --- a/kernels/box_iou_rotated/box_iou_rotated_utils.h +++ b/kernels/box_iou_rotated/box_iou_rotated_utils.h @@ -24,6 +24,7 @@ #define KERNELS_BOX_IOU_ROTATED_BOX_IOU_ROTATED_UTILS_H_ #include "kernels/utils/common.h" +#include "kernels/utils/scatter_gather.h" #define FIILED_ONES (int)0xffffffff #define HALF_FILLED_ONES (int16_t)0xffff @@ -590,21 +591,22 @@ __mlu_func__ void convexHullGraham( sizeof(T), actual_compute_box_num); // get the ordered points according to the angle value - __gather(ordered_pts_x + (i + 1) * actual_compute_box_num, intersect_pts_x, - (unsigned int *)temp_offset, sizeof(T), NRAM2NRAM, sizeof(T), - actual_compute_box_num); - __gather(ordered_pts_y + (i + 1) * actual_compute_box_num, intersect_pts_y, - (unsigned int *)temp_offset, sizeof(T), NRAM2NRAM, sizeof(T), - actual_compute_box_num); - __gather(temp_long_1 + (i + 1) * actual_compute_box_num, valid_pts, - (unsigned int *)temp_offset, sizeof(T), NRAM2NRAM, sizeof(T), - actual_compute_box_num); + __mluop_gather(ordered_pts_x + (i + 1) * actual_compute_box_num, + intersect_pts_x, (unsigned int *)temp_offset, NULL, + sizeof(T), NRAM2NRAM, sizeof(T), actual_compute_box_num); + __mluop_gather(ordered_pts_y + (i + 1) * actual_compute_box_num, + intersect_pts_y, (unsigned int *)temp_offset, NULL, + sizeof(T), NRAM2NRAM, sizeof(T), actual_compute_box_num); + __mluop_gather(temp_long_1 + (i + 1) * actual_compute_box_num, valid_pts, + (unsigned int *)temp_offset, NULL, sizeof(T), NRAM2NRAM, + sizeof(T), actual_compute_box_num); // assign a invalid value to the point which has been get ordered - __scatter(temp_long_2, temp1_ram, (unsigned int *)temp_offset, sizeof(T), - NRAM2NRAM, sizeof(T), actual_compute_box_num); - __scatter(valid_pts, temp2_ram, (unsigned int *)temp_offset, sizeof(T), - NRAM2NRAM, sizeof(T), actual_compute_box_num); + __mluop_scatter(temp_long_2, temp1_ram, (unsigned int *)temp_offset, + NULL, sizeof(T), NRAM2NRAM, sizeof(T), + actual_compute_box_num); + __mluop_scatter(valid_pts, temp2_ram, (unsigned int *)temp_offset, NULL, + sizeof(T), NRAM2NRAM, sizeof(T), actual_compute_box_num); } __bang_move(valid_pts, temp_long_1, total_points * sizeof(T)); #else diff --git a/kernels/generate_proposals_v2/generate_proposals_v2_union1_500.mlu b/kernels/generate_proposals_v2/generate_proposals_v2_union1_500.mlu index bf5887b03..59e25153b 100644 --- a/kernels/generate_proposals_v2/generate_proposals_v2_union1_500.mlu +++ 
b/kernels/generate_proposals_v2/generate_proposals_v2_union1_500.mlu @@ -158,6 +158,7 @@ __mlu_func__ void proposalBoxesDecode( // gather offset (byte). __bang_mul_scalar(anchors_index_nram, anchors_index_nram, sizeof(int32_t), deal_num); + // deal_num <= 5163 __gather(temp_nram, anchors, (unsigned int *)anchors_index_nram, sizeof(T) * 4, GDRAM2NRAM, sizeof(T) * 4, deal_num); __bang_transpose(anchors_nram, temp_nram, deal_num, 4); diff --git a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu index b72087481..d94a2f021 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu @@ -233,14 +233,14 @@ __mlu_func__ void backwardStageTwoLoop( for (int j = 0; j < 5; j++) { T* tmp_wp = weight_polation_nram + (j - 1) * nq_nl_np; if (j < 4) { - gatherAsync(v_ping, zeros_nram, (unsigned int*)offset_zero_nram_stg2, - bit_cond_reverse_nram + j * bit_cond_stride, - channels * sizeof(T), NRAM2NRAM, channels * sizeof(T), - nq_nl_np); - gatherAsync(v_ping, data_value_gdram, - (unsigned int*)offset_nram + j * nq_nl_np, - bit_cond_nram + j * bit_cond_stride, channels * sizeof(T), - GDRAM2NRAM, channels * sizeof(T), nq_nl_np); + gatherAsync(v_ping, zeros_nram, (unsigned int*)offset_zero_nram_stg2, + bit_cond_reverse_nram + j * bit_cond_stride, + channels * sizeof(T), NRAM2NRAM, channels * sizeof(T), + nq_nl_np); + gatherAsync( + v_ping, data_value_gdram, (unsigned int*)offset_nram + j * nq_nl_np, + bit_cond_nram + j * bit_cond_stride, channels * sizeof(T), + GDRAM2NRAM, channels * sizeof(T), nq_nl_np); } if (j == 0) { @@ -249,10 +249,10 @@ __mlu_func__ void backwardStageTwoLoop( NRAM2NRAM, channels * sizeof(T), num_levels_points - 1, num_levels_points * channels * sizeof(T), deal_n - 1, 0, num_levels_points - 1, channels * sizeof(T), deal_n - 1); - gatherAsync(buffer, zeros_nram, (unsigned int*)offset_zero_nram_stg2, - bit_cond_reverse_nram + 4 * bit_cond_stride, - channels * sizeof(T), NRAM2NRAM, channels * sizeof(T), - nq_nl_np); + gatherAsync(buffer, zeros_nram, (unsigned int*)offset_zero_nram_stg2, + bit_cond_reverse_nram + 4 * bit_cond_stride, + channels * sizeof(T), NRAM2NRAM, channels * sizeof(T), + nq_nl_np); __bang_write_value(value_wp, nq_nl_np_c, (T)0); // clear value*wp __sync_move(); // (n, nl, np, c) => (c, n, nl, np) @@ -332,7 +332,7 @@ __mlu_func__ void backwardStageTwoLoop( int32_t* dst_offset = (int32_t*)offset_zero_nram_stg2; for (int i = 0; i < 4; i++) { __bang_filter((T*)dst_offset + i * nq_nl_np, - (T*)offset_nram + i * nq_nl_np, cond_all_valid, nq_nl_np); + (T*)offset_nram + i * nq_nl_np, cond_all_valid, nq_nl_np); } int32_t* src_offset = (int32_t*)inter_grad; int32_t* stride_4_2 = dst_offset + 3 * nq_nl_np; @@ -368,7 +368,7 @@ __mlu_func__ void backwardStageTwoLoop( int32_t valid_count = __bang_sum(tmp_cond, nq_nl_np); if (valid_count > 0) { __bang_filter((T*)tmp_dst_offset, (T*)tmp_dst_offset, tmp_cond, - nq_nl_np); + nq_nl_np); __bang_filter((T*)tmp_src_offset, (T*)seq_nram, tmp_cond, nq_nl_np); __bang_mul_scalar(tmp_src_offset, tmp_src_offset, channels * sizeof(T), valid_count); diff --git a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu index 517c00a8c..9ff2a72e8 100644 --- 
a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu @@ -25,6 +25,7 @@ #include "core/logging.h" #include "kernels/kernel.h" #include "kernels/utils/common.h" +#include "kernels/utils/scatter_gather.h" __nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; @@ -313,24 +314,25 @@ void __mlu_func__ loadValue( sizeof(int32_t), 4 * num_deal_grid); __sync_io_move_compute(); - __gather_async((void *)nram_grad_output_tl, (void *)data_value, - (unsigned int *)grad_temp3, deal_num_real * sizeof(float), - GDRAM2NRAM, deal_num_real * sizeof(float), num_deal_grid); - - __gather_async((void *)nram_grad_output_tr, (void *)data_value, - (unsigned int *)(grad_temp3 + num_deal_grid), - deal_num_real * sizeof(float), GDRAM2NRAM, - deal_num_real * sizeof(float), num_deal_grid); - - __gather_async((void *)nram_grad_output_bl, (void *)data_value, - (unsigned int *)(grad_temp3 + 2 * num_deal_grid), - deal_num_real * sizeof(float), GDRAM2NRAM, - deal_num_real * sizeof(float), num_deal_grid); - - __gather_async((void *)nram_grad_output_br, (void *)data_value, - (unsigned int *)(grad_temp3 + 3 * num_deal_grid), - deal_num_real * sizeof(float), GDRAM2NRAM, - deal_num_real * sizeof(float), num_deal_grid); + __mluop_gather((float *)nram_grad_output_tl, (float *)data_value, + (unsigned int *)grad_temp3, NULL, + deal_num_real * sizeof(float), GDRAM2NRAM, + deal_num_real * sizeof(float), num_deal_grid); + + __mluop_gather((float *)nram_grad_output_tr, (float *)data_value, + (unsigned int *)(grad_temp3 + num_deal_grid), NULL, + deal_num_real * sizeof(float), GDRAM2NRAM, + deal_num_real * sizeof(float), num_deal_grid); + + __mluop_gather((float *)nram_grad_output_bl, (float *)data_value, + (unsigned int *)(grad_temp3 + 2 * num_deal_grid), NULL, + deal_num_real * sizeof(float), GDRAM2NRAM, + deal_num_real * sizeof(float), num_deal_grid); + + __mluop_gather((float *)nram_grad_output_br, (float *)data_value, + (unsigned int *)(grad_temp3 + 3 * num_deal_grid), NULL, + deal_num_real * sizeof(float), GDRAM2NRAM, + deal_num_real * sizeof(float), num_deal_grid); __sync_io_move_compute(); #else diff --git a/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h b/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h index 0f4b4dd17..7ecb0b41f 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h +++ b/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h @@ -28,6 +28,7 @@ #include "kernels/kernel.h" #include "kernels/utils/common.h" +#include "kernels/utils/scatter_gather.h" #define BIT_COLLECT_PAD (8) #define BACKWARD_MAX_NQ_NL_NP (1024) @@ -377,19 +378,12 @@ __mlu_func__ void stageOneLoop( #endif #if (__BANG_ARCH__ >= 592) +template __mlu_func__ void gatherAsync(void* dst, void* src, unsigned int* offset, void* mask, int transfer_size, mluMemcpyDirection_t dir, int dst_stride, int transfer_num) { - __gather_async(dst, src, offset, mask, transfer_size, dir, dst_stride, - transfer_num); -} - -__mlu_func__ void gatherSync(void* dst, void* src, unsigned int* offset, - void* mask, int transfer_size, - mluMemcpyDirection_t dir, int dst_stride, - int transfer_num) { - __gather(dst, src, offset, mask, transfer_size, dir, dst_stride, - transfer_num); + __mluop_gather_async((T*)dst, (T*)src, offset, (uint8_t*)mask, + transfer_size, dir, dst_stride, transfer_num); } #endif diff --git 
a/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu index 2d29981e2..a4c61a979 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu @@ -290,7 +290,7 @@ __mlu_func__ void getConditionCoordWeight( } __bang_mul_scalar(buf_nram, weight_attn_nram, (T)1, total_points); __bang_filter((float*)weight_attn_nram, (float*)buf_nram, - cond_point_valid_nram, total_points); + cond_point_valid_nram, total_points); __bang_float2int32((int32_t*)cond_point_polation_nram, cond_point_polation_nram, total_points * 4, 0); __bang_mul_scalar((int32_t*)cond_point_polation_nram, @@ -300,16 +300,16 @@ __mlu_func__ void getConditionCoordWeight( (int8_t*)cond_point_polation_nram, total_points * 4 * sizeof(float)); __bang_filter((float*)weight_polation_nram, (float*)weight_polation_nram_tmp, - cond_point_valid_nram, total_points); + cond_point_valid_nram, total_points); __bang_filter((float*)weight_polation_nram + total_points, - (float*)weight_polation_nram_tmp + total_points, - cond_point_valid_nram, total_points); + (float*)weight_polation_nram_tmp + total_points, + cond_point_valid_nram, total_points); __bang_filter((float*)weight_polation_nram + 2 * total_points, - (float*)weight_polation_nram_tmp + 2 * total_points, - cond_point_valid_nram, total_points); + (float*)weight_polation_nram_tmp + 2 * total_points, + cond_point_valid_nram, total_points); __bang_filter((float*)weight_polation_nram + 3 * total_points, - (float*)weight_polation_nram_tmp + 3 * total_points, - cond_point_valid_nram, total_points); + (float*)weight_polation_nram_tmp + 3 * total_points, + cond_point_valid_nram, total_points); //================================================================================================ // select cond_point_polation_nram if value_contain_infnan if (value_contain_infnan) { @@ -318,17 +318,17 @@ __mlu_func__ void getConditionCoordWeight( (int32_t*)cond_point_polation_nram, (int32_t)1, total_points * 4); __bang_filter((float*)cond_point_polation_nram, - (float*)cond_point_polation_nram_tmp, cond_point_valid_nram, - total_points); + (float*)cond_point_polation_nram_tmp, cond_point_valid_nram, + total_points); __bang_filter((float*)cond_point_polation_nram + total_points, - (float*)cond_point_polation_nram_tmp + total_points, - cond_point_valid_nram, total_points); + (float*)cond_point_polation_nram_tmp + total_points, + cond_point_valid_nram, total_points); __bang_filter((float*)cond_point_polation_nram + 2 * total_points, - (float*)cond_point_polation_nram_tmp + 2 * total_points, - cond_point_valid_nram, total_points); + (float*)cond_point_polation_nram_tmp + 2 * total_points, + cond_point_valid_nram, total_points); __bang_filter((float*)cond_point_polation_nram + 3 * total_points, - (float*)cond_point_polation_nram_tmp + 3 * total_points, - cond_point_valid_nram, total_points); + (float*)cond_point_polation_nram_tmp + 3 * total_points, + cond_point_valid_nram, total_points); } //================================================================================================ // compute and select offset and stride @@ -348,11 +348,11 @@ __mlu_func__ void getConditionCoordWeight( (int32_t*)data_offset_nram_tr_tmp, (int32_t*)data_offset_nram_tl_tmp, total_points); __bang_filter((float*)data_offset_nram_tl, (float*)data_offset_nram_tl_tmp, - cond_point_valid_nram, total_points); + 
cond_point_valid_nram, total_points); __bang_filter((float*)data_offset_nram_bl, (float*)data_offset_nram_bl_tmp, - cond_point_valid_nram, total_points); + cond_point_valid_nram, total_points); __bang_filter((float*)data_offset_nram_tr, (float*)data_offset_nram_tr_tmp, - cond_point_valid_nram, total_points); + cond_point_valid_nram, total_points); } /* @@ -1068,12 +1068,12 @@ __mlu_func__ void forwardStageTwoLoop( __sync_io_move_compute(); if (i < loop_num) { - gatherAsync(v_load, zeros_nram, (unsigned int*)offset_zero_nram_stg2, - cond_nram_stg2_reverse, channels * sizeof(T), NRAM2NRAM, - channels * sizeof(T), load_point_num); - gatherAsync(v_load, data_value_gdram, (unsigned int*)offset_nram_stg2, - cond_nram_stg2, channels * sizeof(T), GDRAM2NRAM, - channels * sizeof(T), load_point_num); + gatherAsync(v_load, zeros_nram, (unsigned int*)offset_zero_nram_stg2, + cond_nram_stg2_reverse, channels * sizeof(T), NRAM2NRAM, + channels * sizeof(T), load_point_num); + gatherAsync(v_load, data_value_gdram, (unsigned int*)offset_nram_stg2, + cond_nram_stg2, channels * sizeof(T), GDRAM2NRAM, + channels * sizeof(T), load_point_num); } if (i > 0) { diff --git a/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu b/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu index d226df82c..e8c545e04 100644 --- a/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu +++ b/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu @@ -38,14 +38,12 @@ __mlu_func__ void mluopDivScalar(T *dst, T *src, T value, uint32_t num) { __asm__ volatile( "div.scalar.nram.f16 [%[dst]], [%[src0]], " "%[src1], %[num];\n\t" ::[dst] "r"(dst), - [ src0 ] "r"(src), [ src1 ] "r"(value), - [ num ] "r"(num)); + [ src0 ] "r"(src), [ src1 ] "r"(value), [ num ] "r"(num)); } else { __asm__ volatile( "div.scalar.nram.f32 [%[dst]], [%[src0]], " "%[src1], %[num];\n\t" ::[dst] "r"(dst), - [ src0 ] "r"(src), [ src1 ] "r"(value), - [ num ] "r"(num)); + [ src0 ] "r"(src), [ src1 ] "r"(value), [ num ] "r"(num)); } } @@ -314,6 +312,7 @@ __mlu_func__ void handleChannels(const T *input, uint32_t deal_channels, } uint32_t hwc_num = deal_channels * vec_num; + // vec_num <= 1024 __gather(val, input, pos, deal_channels * sizeof(T), GDRAM2NRAM, deal_channels * sizeof(T), vec_num); if (deal_channels != 1) { @@ -521,8 +520,7 @@ __mlu_global__ void roiAlignRotatedForward( } } } - mluopDivScalar(output_channels, output_channels, (T)count, - cur_cache_c); + mluopDivScalar(output_channels, output_channels, (T)count, cur_cache_c); __memcpy(output_dram + bin_i * channels + c_cache_i, output_channels, cur_cache_c * sizeof(T), NRAM2GDRAM); } diff --git a/kernels/utils/scatter_gather.h b/kernels/utils/scatter_gather.h new file mode 100644 index 000000000..729fd9c9d --- /dev/null +++ b/kernels/utils/scatter_gather.h @@ -0,0 +1,90 @@ +/************************************************************************* + * Copyright (C) [2024] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ + +#include "kernels/kernel.h" + +#define SCATTER_GATHER_PARAMS \ + T *dst, const T *src, const uint32_t *offset, const uint8_t *mask, \ + const uint32_t transfer_size, const mluMemcpyDirection_t dir, \ + const uint32_t stride, const uint32_t data_num + +#if __BANG_ARCH__ > 592 +#define MLUOP_SCATTER_GATHER(func, is_scatter) \ + template \ + __mlu_func__ void __mluop_##func(SCATTER_GATHER_PARAMS) { \ + if (data_num <= UINT16_MAX) { \ + if (mask) { \ + __##func(dst, src, offset, (const void *)mask, transfer_size, dir, \ + stride, data_num); \ + } else { \ + __##func(dst, src, offset, transfer_size, dir, stride, data_num); \ + } \ + } else { \ + uint16_t data_num_new = PAD_DOWN(UINT16_MAX, 64); \ + uint32_t remain = data_num % data_num_new; \ + uint32_t repeat = data_num / data_num_new + uint32_t(remain > 0); \ + uint32_t dst_offset = is_scatter ? 0 : data_num_new; \ + uint32_t src_offset = is_scatter ? data_num_new : 0; \ + \ + for (uint32_t i = 0; i <= repeat; ++i) { \ + const uint16_t data_num_loop = i < repeat ? 
data_num_new : remain; \ + if (mask) { \ + __##func(dst + i * dst_offset, src + i * src_offset, \ + mask + i * (data_num_new / 8), offset + i * data_num_new, \ + transfer_size, dir, stride, data_num_loop); \ + } else { \ + __##func(dst + i * dst_offset, src + i * src_offset, \ + offset + i * data_num_new, transfer_size, dir, stride, \ + data_num_loop); \ + } \ + } \ + } \ + } + +/* __mlu_op_scatter + * __mlu_op_scatter_async + * __mlu_op_gather + * __mlu_op_gather_async + */ +MLUOP_SCATTER_GATHER(gather_async, false) +MLUOP_SCATTER_GATHER(gather, false) +MLUOP_SCATTER_GATHER(scatter_async, true) +MLUOP_SCATTER_GATHER(scatter, true) + +#elif __BANG_ARCH__ == 592 +#define MLUOP_SCATTER_GATHER(func) \ + template \ + __mlu_func__ void __mluop_##func(SCATTER_GATHER_PARAMS) { \ + if (mask) { \ + __##func(dst, src, offset, mask, transfer_size, dir, stride, data_num); \ + } else { \ + __##func(dst, src, offset, transfer_size, dir, stride, data_num); \ + } \ + } + +MLUOP_SCATTER_GATHER(gather_async) +MLUOP_SCATTER_GATHER(gather) +MLUOP_SCATTER_GATHER(scatter_async) +MLUOP_SCATTER_GATHER(scatter) + +#endif // __BANG_ARCH__ > 592 diff --git a/kernels/voxel_pooling_forward/voxel_pooling_forward_union1.mlu b/kernels/voxel_pooling_forward/voxel_pooling_forward_union1.mlu index 90ecc8363..a7ff5fbb8 100644 --- a/kernels/voxel_pooling_forward/voxel_pooling_forward_union1.mlu +++ b/kernels/voxel_pooling_forward/voxel_pooling_forward_union1.mlu @@ -25,6 +25,7 @@ #include "core/logging.h" #include "kernels/kernel.h" #include "kernels/utils/common.h" +#include "kernels/utils/scatter_gather.h" __nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; @@ -392,10 +393,11 @@ __mlu_func__ void MLUKernelVoxelPoolingStageTwoPerfKernel( __bang_ge_bitindex((float *)gather_mask, (float *)nram_geom + point_idx_offset, (float *)nram_geom_x, align_8_deal_num); - __gather((float *)gather_src, (float *)input_features, - (unsigned int *)gather_offset + point_idx_offset, - (void *)gather_mask, num_channels * sizeof(float), GDRAM2NRAM, - num_channels * sizeof(float), actual_load_num); + __mluop_gather((float *)gather_src, (float *)input_features, + (unsigned int *)gather_offset + point_idx_offset, + (uint8_t *)gather_mask, + num_channels * sizeof(float), GDRAM2NRAM, + num_channels * sizeof(float), actual_load_num); for (int index = 0; index < actual_load_num; index++) { int output_features_pt_offset = nram_geom[point_idx_offset + index]; if (output_features_pt_offset >= 0) { diff --git a/kernels/voxelization/voxelization_kernel.mlu b/kernels/voxelization/voxelization_kernel.mlu index 04f5580e7..9832ab4bf 100644 --- a/kernels/voxelization/voxelization_kernel.mlu +++ b/kernels/voxelization/voxelization_kernel.mlu @@ -28,6 +28,7 @@ #include "core/logging.h" #include "kernels/kernel.h" #include "kernels/utils/common.h" +#include "kernels/utils/scatter_gather.h" __nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; @@ -547,9 +548,10 @@ __mlu_global__ void mluCalcPointsPerVoxel( // compute scatter src: voxel_idx __bang_add_scalar(nram_temp_mask, nram_base_offset, voxel_num_temp, deal_num); - __scatter(nram_scatter_output, nram_temp_mask, - (unsigned int *)nram_scatter_offset, nram_mask_bitindex, - sizeof(int32_t), NRAM2NRAM, sizeof(int32_t), reserve_voxels); + __mluop_scatter(nram_scatter_output, nram_temp_mask, + (unsigned int *)nram_scatter_offset, + (uint8_t *)nram_mask_bitindex, sizeof(int32_t), + NRAM2NRAM, sizeof(int32_t), reserve_voxels); __memcpy(num_points_per_voxel + voxel_num_temp, nram_scatter_mask, reserve_voxels * sizeof(int32_t), NRAM2GDRAM); 
voxel_num_temp += reserve_voxels; @@ -568,8 +570,9 @@ __mlu_global__ void mluCalcPointsPerVoxel( if (count > 0) { __bang_mul_scalar(nram_p2p_idx, nram_p2p_idx, sizeof(int32_t), count); // get repeated point real point_id - __gather(gather_output, coor_to_voxelidx, (unsigned int *)nram_p2p_idx, - sizeof(int32_t), GDRAM2NRAM, sizeof(int32_t), count); + __mluop_gather( + gather_output, coor_to_voxelidx, (unsigned int *)nram_p2p_idx, NULL, + sizeof(int32_t), GDRAM2NRAM, sizeof(int32_t), count); __bang_eq_scalar(nram_scatter_mask, gather_output, -1, count); __bang_not(nram_scatter_mask, nram_scatter_mask, count); __bang_gt_bitindex((float *)nram_mask_bitindex, @@ -582,9 +585,10 @@ __mlu_global__ void mluCalcPointsPerVoxel( gather_mask, deal_num); __bang_mul_scalar(nram_temp_mask, nram_temp_mask, sizeof(int32_t), deal_num); - __scatter(coor_to_voxelidx, gather_output, - (unsigned int *)nram_temp_mask, nram_mask_bitindex, - sizeof(int32_t), NRAM2GDRAM, sizeof(int32_t), count); + __mluop_scatter(coor_to_voxelidx, gather_output, + (unsigned int *)nram_temp_mask, + (uint8_t *)nram_mask_bitindex, sizeof(int32_t), + NRAM2GDRAM, sizeof(int32_t), count); // step4: compute num_points_per_voxel for (int32_t i = 0; i < count; i++) { From 662a162aa448ff1b800f2a109c83a6823ad7b4f8 Mon Sep 17 00:00:00 2001 From: niyuming Date: Fri, 6 Dec 2024 15:37:44 +0800 Subject: [PATCH 4/7] [Fix](mluOpExecFFT): fix core dump, scale factor and one point compute error (#1159) Co-authored-by: niyuming --- kernels/fft/c2c_fft/c2c_fft_host.cpp | 260 ++++++------ kernels/fft/common/fft_basic_ops.cpp | 16 +- kernels/fft/common/fft_common_kernels.mlu | 3 +- kernels/fft/fft.cpp | 114 ++++-- kernels/fft/fft.h | 30 +- .../fft_optm_device/fft_c2c_stockham_nram.h | 373 ------------------ .../fft_two-level_network_c2c_device.mlu | 48 +-- .../fft_two-level_network_c2r_device.mlu | 24 +- .../fft_two-level_network_r2c_device.mlu | 26 +- kernels/fft/irfft/irfft_host.cpp | 245 ++++++------ kernels/fft/rfft/rfft_host.cpp | 268 +++++++------ .../tensor_stride_process_host.cpp | 3 +- 12 files changed, 561 insertions(+), 849 deletions(-) diff --git a/kernels/fft/c2c_fft/c2c_fft_host.cpp b/kernels/fft/c2c_fft/c2c_fft_host.cpp index 29c53d61f..0f7fc3a6f 100644 --- a/kernels/fft/c2c_fft/c2c_fft_host.cpp +++ b/kernels/fft/c2c_fft/c2c_fft_host.cpp @@ -648,13 +648,13 @@ mluOpStatus_t setFFT2dReserveArea(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, } } - int _n0 = fft_plan->n[0]; - int _n1 = fft_plan->n[1]; + int n0_ori = fft_plan->n[0]; + int n1_ori = fft_plan->n[1]; if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) { size_t factors_size = FFT_MAXFACTORS * sizeof(int); // bytes - size_t twiddles_size = CPX_TYPE_SIZE * _n1; - size_t twiddles_size_2d = CPX_TYPE_SIZE * _n0; + size_t twiddles_size = CPX_TYPE_SIZE * n1_ori; + size_t twiddles_size_2d = CPX_TYPE_SIZE * n0_ori; size_t reservespace_offset = 0; @@ -794,19 +794,20 @@ mluOpStatus_t setFFT2dReserveArea(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, size_t reservespace_offset = 0; fft_plan->mlu_addrs.dft_matrix = (uint8_t *)fft_plan->reservespace_addr + reservespace_offset; - reservespace_offset += CPX_TYPE_SIZE * (_n1 / 2 + 1) * _n1; + reservespace_offset += CPX_TYPE_SIZE * (n1_ori / 2 + 1) * n1_ori; fft_plan->mlu_addrs.dft_matrix_2d = (uint8_t *)fft_plan->reservespace_addr + reservespace_offset; - reservespace_offset += CPX_TYPE_SIZE * _n0 * _n0; + reservespace_offset += CPX_TYPE_SIZE * n0_ori * n0_ori; CNRT_CHECK(cnrtMemcpyAsync(fft_plan->mlu_addrs.dft_matrix, fft_plan->dft_matrix, 
- CPX_TYPE_SIZE * (_n1 / 2 + 1) * _n1, + CPX_TYPE_SIZE * (n1_ori / 2 + 1) * n1_ori, + handle->queue, cnrtMemcpyHostToDev)); + CNRT_CHECK(cnrtMemcpyAsync(fft_plan->mlu_addrs.dft_matrix_2d, + fft_plan->dft_matrix_2d, + CPX_TYPE_SIZE * n0_ori * n0_ori, handle->queue, cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpyAsync( - fft_plan->mlu_addrs.dft_matrix_2d, fft_plan->dft_matrix_2d, - CPX_TYPE_SIZE * _n0 * _n0, handle->queue, cnrtMemcpyHostToDev)); } break; case CNFFT_COMPLEX_HALF2COMPLEX_HALF: case CNFFT_COMPLEX_FLOAT2COMPLEX_FLOAT: { @@ -814,34 +815,38 @@ mluOpStatus_t setFFT2dReserveArea(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, size_t reservespace_offset = 0; fft_plan->mlu_addrs.dft_matrix = (uint8_t *)fft_plan->reservespace_addr + reservespace_offset; - reservespace_offset += CPX_TYPE_SIZE * _n1 * _n1; + reservespace_offset += CPX_TYPE_SIZE * n1_ori * n1_ori; fft_plan->mlu_addrs.dft_matrix_2d = (uint8_t *)fft_plan->reservespace_addr + reservespace_offset; - reservespace_offset += CPX_TYPE_SIZE * _n0 * _n0; + reservespace_offset += CPX_TYPE_SIZE * n0_ori * n0_ori; fft_plan->mlu_addrs.idft_matrix = (uint8_t *)fft_plan->reservespace_addr + reservespace_offset; - reservespace_offset += CPX_TYPE_SIZE * _n1 * _n1; + reservespace_offset += CPX_TYPE_SIZE * n1_ori * n1_ori; fft_plan->mlu_addrs.idft_matrix_2d = (uint8_t *)fft_plan->reservespace_addr + reservespace_offset; - reservespace_offset += CPX_TYPE_SIZE * _n0 * _n0; - - CNRT_CHECK(cnrtMemcpyAsync( - fft_plan->mlu_addrs.dft_matrix, fft_plan->dft_matrix, - CPX_TYPE_SIZE * _n1 * _n1, handle->queue, cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpyAsync( - fft_plan->mlu_addrs.dft_matrix_2d, fft_plan->dft_matrix_2d, - - CPX_TYPE_SIZE * _n0 * _n0, handle->queue, cnrtMemcpyHostToDev)); - - CNRT_CHECK(cnrtMemcpyAsync( - fft_plan->mlu_addrs.idft_matrix, fft_plan->idft_matrix, - CPX_TYPE_SIZE * _n1 * _n1, handle->queue, cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpyAsync( - fft_plan->mlu_addrs.idft_matrix_2d, fft_plan->idft_matrix_2d, - CPX_TYPE_SIZE * _n0 * _n0, handle->queue, cnrtMemcpyHostToDev)); + reservespace_offset += CPX_TYPE_SIZE * n0_ori * n0_ori; + + CNRT_CHECK(cnrtMemcpyAsync(fft_plan->mlu_addrs.dft_matrix, + fft_plan->dft_matrix, + CPX_TYPE_SIZE * n1_ori * n1_ori, + handle->queue, cnrtMemcpyHostToDev)); + CNRT_CHECK(cnrtMemcpyAsync(fft_plan->mlu_addrs.dft_matrix_2d, + fft_plan->dft_matrix_2d, + + CPX_TYPE_SIZE * n0_ori * n0_ori, + handle->queue, cnrtMemcpyHostToDev)); + + CNRT_CHECK(cnrtMemcpyAsync(fft_plan->mlu_addrs.idft_matrix, + fft_plan->idft_matrix, + CPX_TYPE_SIZE * n1_ori * n1_ori, + handle->queue, cnrtMemcpyHostToDev)); + CNRT_CHECK(cnrtMemcpyAsync(fft_plan->mlu_addrs.idft_matrix_2d, + fft_plan->idft_matrix_2d, + CPX_TYPE_SIZE * n0_ori * n0_ori, + handle->queue, cnrtMemcpyHostToDev)); }; break; case CNFFT_COMPLEX_HALF2HALF: case CNFFT_COMPLEX_FLOAT2FLOAT: { @@ -849,19 +854,20 @@ mluOpStatus_t setFFT2dReserveArea(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, size_t reservespace_offset = 0; fft_plan->mlu_addrs.dft_matrix = (uint8_t *)fft_plan->reservespace_addr + reservespace_offset; - reservespace_offset += CPX_TYPE_SIZE * (_n1 / 2 + 1) * _n1; + reservespace_offset += CPX_TYPE_SIZE * (n1_ori / 2 + 1) * n1_ori; fft_plan->mlu_addrs.dft_matrix_2d = (uint8_t *)fft_plan->reservespace_addr + reservespace_offset; - reservespace_offset += CPX_TYPE_SIZE * _n0 * _n0; + reservespace_offset += CPX_TYPE_SIZE * n0_ori * n0_ori; CNRT_CHECK(cnrtMemcpyAsync(fft_plan->mlu_addrs.dft_matrix, fft_plan->dft_matrix, - CPX_TYPE_SIZE * (_n1 / 2 + 1) * 
_n1, + CPX_TYPE_SIZE * (n1_ori / 2 + 1) * n1_ori, + handle->queue, cnrtMemcpyHostToDev)); + CNRT_CHECK(cnrtMemcpyAsync(fft_plan->mlu_addrs.dft_matrix_2d, + fft_plan->dft_matrix_2d, + CPX_TYPE_SIZE * n0_ori * n0_ori, handle->queue, cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpyAsync( - fft_plan->mlu_addrs.dft_matrix_2d, fft_plan->dft_matrix_2d, - CPX_TYPE_SIZE * _n0 * _n0, handle->queue, cnrtMemcpyHostToDev)); }; break; default: { LOG(ERROR) << make_plan_api << ": invalid 2d fft type."; @@ -1060,13 +1066,13 @@ static void configureFFT2dWorkspaceAddrs(mluOpHandle_t handle, size_t out_c_dtype_size = mluOpDataTypeBytes(out_c_dtype); int batch = fft_plan->batch; - int _n0 = fft_plan->n[0]; - int _n1 = fft_plan->n[1]; + int n0_ori = fft_plan->n[0]; + int n1_ori = fft_plan->n[1]; size_t offset = 0; if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { // rr ri ir ii - size_t buffer_size = batch * in_c_dtype_size * _n0 * _n1 * 2; + size_t buffer_size = batch * in_c_dtype_size * n0_ori * n1_ori * 2; fft_plan->mlu_addrs.input = input; fft_plan->mlu_addrs.output = output; fft_plan->mlu_addrs.buffer_in = (uint8_t *)workspace + offset; @@ -1077,27 +1083,29 @@ static void configureFFT2dWorkspaceAddrs(mluOpHandle_t handle, if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) { fft_plan->mlu_addrs.buffer_buf = (uint8_t *)workspace + offset; - offset += batch * in_c_dtype_size * _n0 * _n1 * 2; + offset += batch * in_c_dtype_size * n0_ori * n1_ori * 2; - if (fft_plan->is_input_contiguous) { + if ((fft_plan->is_input_contiguous && + fft_plan->inembed[0] <= fft_plan->n[0] && + fft_plan->inembed[1] <= fft_plan->n[1])) { fft_plan->mlu_addrs.input = input; } else { fft_plan->mlu_addrs.input = (uint8_t *)workspace + offset; - offset += batch * in_c_dtype_size * _n0 * _n1; + offset += batch * in_c_dtype_size * n0_ori * n1_ori; } if (fft_plan->is_output_contiguous) { fft_plan->mlu_addrs.output = output; } else { fft_plan->mlu_addrs.output = (uint8_t *)workspace + offset; - offset += batch * in_c_dtype_size * _n0 * _n1; + offset += batch * in_c_dtype_size * n0_ori * n1_ori; } } if (fft_plan->n[0] > fft_plan->inembed[0] || fft_plan->n[1] > fft_plan->inembed[1]) { fft_plan->mlu_addrs.input_pad_addr = - (uint8_t *)workspace + - offset; // batch * in_c_dtype_size * _n0 * _n1 * 2; // buffer_size; + (uint8_t *)workspace + offset; // batch * in_c_dtype_size * n0_ori * + // n1_ori * 2; // buffer_size; } } // input : in input @@ -1115,11 +1123,11 @@ static mluOpStatus_t makeFFT1dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 2; - int64_t dims[in_dim_num] = {fft_plan->batch, fft_plan->inembed[0]}; - int64_t strides[in_dim_num] = {fft_plan->idist, fft_plan->istride}; + const int IN_DIM_NUM = 2; + int64_t dims[IN_DIM_NUM] = {fft_plan->batch, fft_plan->inembed[0]}; + int64_t strides[IN_DIM_NUM] = {fft_plan->idist, fft_plan->istride}; status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->input_dtype, in_dim_num, + fft_plan->input_dtype, IN_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); @@ -1140,12 +1148,12 @@ static mluOpStatus_t makeFFT1dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 2; - int64_t dims[in_dim_num] = { + const int IN_DIM_NUM = 2; + int64_t dims[IN_DIM_NUM] = { fft_plan->batch, std::min(fft_plan->n[0], 
fft_plan->inembed[0])}; - int64_t strides[in_dim_num] = {fft_plan->idist, fft_plan->istride}; + int64_t strides[IN_DIM_NUM] = {fft_plan->idist, fft_plan->istride}; status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->input_dtype, in_dim_num, + fft_plan->input_dtype, IN_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); @@ -1176,15 +1184,17 @@ static mluOpStatus_t makeFFT2dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 3; - int64_t dims[in_dim_num] = {fft_plan->batch, + const int IN_DIM_NUM = 3; + int64_t dims[IN_DIM_NUM] = {fft_plan->batch, std::min(fft_plan->n[0], fft_plan->inembed[0]), std::min(fft_plan->n[1], fft_plan->inembed[1])}; - int64_t strides[in_dim_num] = {fft_plan->idist, - (fft_plan->istride * fft_plan->inembed[1]), - fft_plan->istride}; + + int64_t strides[IN_DIM_NUM]; // IN_DIM_NUM + for (int i = 0; i < IN_DIM_NUM; i++) { + strides[i] = fft_plan->in_stride[i]; + } status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->input_dtype, in_dim_num, + fft_plan->input_dtype, IN_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); @@ -1220,15 +1230,15 @@ static mluOpStatus_t padFFT1dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&padded_input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 2; - int64_t dims[in_dim_num] = {batch, fft_plan->inembed[0] * COMPLEX}; + const int IN_DIM_NUM = 2; + int64_t dims[IN_DIM_NUM] = {batch, fft_plan->inembed[0] * COMPLEX}; status = mluOpSetTensorDescriptor_v2(input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, dims); + in_r_dtype, IN_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - int64_t padded_dims[in_dim_num] = {batch, n * COMPLEX}; + int64_t padded_dims[IN_DIM_NUM] = {batch, n * COMPLEX}; status = mluOpSetTensorDescriptor_v2(padded_input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, padded_dims); + in_r_dtype, IN_DIM_NUM, padded_dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); const int pad_dim_num = 4; @@ -1284,16 +1294,16 @@ static mluOpStatus_t padFFT2dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&padded_input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 3; - int64_t dims[in_dim_num] = {batch, std::min(fft_plan->inembed[0], n0), + const int IN_DIM_NUM = 3; + int64_t dims[IN_DIM_NUM] = {batch, std::min(fft_plan->inembed[0], n0), std::min(fft_plan->inembed[1], n1) * COMPLEX}; status = mluOpSetTensorDescriptor_v2(input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, dims); + in_r_dtype, IN_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - int64_t padded_dims[in_dim_num] = {batch, n0, n1 * COMPLEX}; + int64_t padded_dims[IN_DIM_NUM] = {batch, n0, n1 * COMPLEX}; status = mluOpSetTensorDescriptor_v2(padded_input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, padded_dims); + in_r_dtype, IN_DIM_NUM, padded_dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); const int pad_dim_num = 6; @@ -1756,17 +1766,17 @@ static mluOpStatus_t makeFFT1dContiguousOutput(mluOpHandle_t handle, INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // set up tensor desc - const int out_dim_num = 2; - int64_t dims[out_dim_num] = {fft_plan->batch, (fft_plan->prime) + const int OUT_DIM_NUM = 2; + int64_t dims[OUT_DIM_NUM] = 
{fft_plan->batch, (fft_plan->prime) ? fft_plan->onembed[0] : fft_plan->n[0]}; - int64_t strides[out_dim_num] = {fft_plan->odist, fft_plan->ostride}; + int64_t strides[OUT_DIM_NUM] = {fft_plan->odist, fft_plan->ostride}; status = mluOpSetTensorDescriptor_v2(copy_src_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims); + out_c_dtype, OUT_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorEx_v2(copy_dst_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims, strides); + out_c_dtype, OUT_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); void *copy_src_addr = (fft_plan->prime) @@ -1779,17 +1789,8 @@ static mluOpStatus_t makeFFT1dContiguousOutput(mluOpHandle_t handle, cnnl_copy_src_desc); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(copy_dst_desc, cnnl_copy_dst_desc); - size_t workspace_size = 0; - CALL_CNNL(cnnlGetCopyWorkspaceSize(cnnl_handle, cnnl_copy_src_desc, - cnnl_copy_dst_desc, &workspace_size)); - - void *workspace = nullptr; - if (workspace_size > 0) { - CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); - } CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_copy_src_desc, copy_src_addr, - cnnl_copy_dst_desc, output, workspace, - workspace_size)); + cnnl_copy_dst_desc, output, NULL, 0)); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_src_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_dst_desc); DESTROY_CNNL_HANDLE(cnnl_handle); @@ -1815,18 +1816,19 @@ static mluOpStatus_t makeFFT2dContiguousOutput(mluOpHandle_t handle, INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // set up tensor desc - const int out_dim_num = 3; - int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0], + const int OUT_DIM_NUM = 3; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->n[0], fft_plan->n[1]}; - int64_t strides[out_dim_num] = {fft_plan->odist, - fft_plan->ostride * fft_plan->onembed[1], - fft_plan->ostride}; + int64_t strides[OUT_DIM_NUM]; // OUT_DIM_NUM + for (int i = 0; i < OUT_DIM_NUM; i++) { + strides[i] = fft_plan->out_stride[i]; + } status = mluOpSetTensorDescriptor_v2(copy_src_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims); + out_c_dtype, OUT_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorEx_v2(copy_dst_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims, strides); + out_c_dtype, OUT_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // void *copy_src_addr = fft_plan->matmul_addrs.output_contiguous_addr; @@ -1838,18 +1840,8 @@ static mluOpStatus_t makeFFT2dContiguousOutput(mluOpHandle_t handle, cnnl_copy_src_desc); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(copy_dst_desc, cnnl_copy_dst_desc); - - size_t workspace_size = 0; - CALL_CNNL(cnnlGetCopyWorkspaceSize(cnnl_handle, cnnl_copy_src_desc, - cnnl_copy_dst_desc, &workspace_size)); - - void *workspace = nullptr; - if (workspace_size > 0) { - CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); - } CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_copy_src_desc, copy_src_addr, - cnnl_copy_dst_desc, output, workspace, - workspace_size)); + cnnl_copy_dst_desc, output, NULL, 0)); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_src_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_dst_desc); @@ -2003,12 +1995,16 @@ mluOpStatus_t execFFT1d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, const float beta[2] = {0.0, 0.0}; mluOpTensorDescriptor_t c_desc = nullptr; status = mluOpCreateTensorDescriptor(&c_desc); - const int out_dim_num = 2; - int64_t 
dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0]}; + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + const int OUT_DIM_NUM = 2; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->n[0]}; status = mluOpSetTensorDescriptor_v2(c_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->output_dtype, 2, dims); + fft_plan->output_dtype, OUT_DIM_NUM, + dims); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorOnchipDataType( c_desc, fft_plan->execution_dtype); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); // convert to cnnl_handle @@ -2019,6 +2015,8 @@ mluOpStatus_t execFFT1d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, cnnl_output_desc, fft_plan->mlu_addrs.output, &beta, cnnl_output_desc, fft_plan->mlu_addrs.output)); + status = mluOpDestroyTensorDescriptor(c_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); DESTROY_CNNL_HANDLE(cnnl_handle); } @@ -2053,7 +2051,34 @@ mluOpStatus_t execFFT2d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, fft_plan->mlu_addrs.input = fft_plan->mlu_addrs.input_pad_addr; } - status = execFFTc2c2d(handle, fft_plan, scale_factor, direction); + if (fft_plan->n[0] == 1 && fft_plan->n[1] == 1) { + mluOpTensorDescriptor_t c_desc = nullptr; + status = mluOpCreateTensorDescriptor(&c_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + const int OUT_DIM_NUM = 3; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->n[0], + fft_plan->n[1]}; + status = mluOpSetTensorDescriptor_v2( + c_desc, MLUOP_LAYOUT_ARRAY, fft_plan->output_dtype, OUT_DIM_NUM, dims); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + status = mluOpSetTensorDescriptorOnchipDataType(c_desc, + fft_plan->execution_dtype); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + + DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, + cnnl_handle); // convert to cnnl_handle + + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(c_desc, cnnl_output_desc); + CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_output_desc, + fft_plan->mlu_addrs.input, cnnl_output_desc, + fft_plan->mlu_addrs.output, NULL, 0)); + status = mluOpDestroyTensorDescriptor(c_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); + DESTROY_CNNL_HANDLE(cnnl_handle); + } else { + status = execFFTc2c2d(handle, fft_plan, scale_factor, direction); + } INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); @@ -2062,13 +2087,16 @@ mluOpStatus_t execFFT2d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, const float beta[2] = {0.0, 0.0}; mluOpTensorDescriptor_t c_desc = nullptr; status = mluOpCreateTensorDescriptor(&c_desc); - const int out_dim_num = 3; - int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0], + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + const int OUT_DIM_NUM = 3; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->n[0], fft_plan->n[1]}; - status = mluOpSetTensorDescriptor_v2(c_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->output_dtype, 3, dims); + status = mluOpSetTensorDescriptor_v2( + c_desc, MLUOP_LAYOUT_ARRAY, fft_plan->output_dtype, OUT_DIM_NUM, dims); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorOnchipDataType(c_desc, fft_plan->execution_dtype); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); // convert to cnnl_handle @@ -2079,6 +2107,8 @@ mluOpStatus_t execFFT2d(mluOpHandle_t 
handle, const mluOpFFTPlan_t fft_plan, cnnl_output_desc, fft_plan->mlu_addrs.output, &beta, cnnl_output_desc, fft_plan->mlu_addrs.output)); + status = mluOpDestroyTensorDescriptor(c_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); DESTROY_CNNL_HANDLE(cnnl_handle); } @@ -2296,11 +2326,11 @@ mluOpStatus_t computeFFT2dMatMulRow(mluOpHandle_t handle, int requested_algo_count = 1, return_algo_count = 0; float *workspace; size_t workspace_size; - cnnlGetBatchMatMulAlgoHeuristic( + cnnlGetBatchMatMulExAlgoHeuristic( cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL, requested_algo_count, &heuristic_result, &return_algo_count); - cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size); + cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size); if (workspace_size > 0) { CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); @@ -2308,10 +2338,10 @@ mluOpStatus_t computeFFT2dMatMulRow(mluOpHandle_t handle, CNRT_CHECK(cnrtMalloc((void **)&workspace, m * n * sizeof(float))); } - CALL_CNNL(cnnlBatchMatMulBCast_v2(cnnl_handle, bmm_bcast_desc, algo, &alpha, - cnnl_a_desc, dft_matrix_addr, cnnl_b_desc, - in_addr, &beta, cnnl_c_desc, out_addr, - (void *)workspace, workspace_size)); + CALL_CNNL(cnnlBatchMatMulEx(cnnl_handle, bmm_bcast_desc, algo, &alpha, + cnnl_a_desc, dft_matrix_addr, cnnl_b_desc, + in_addr, &beta, cnnl_c_desc, out_addr, + (void *)workspace, workspace_size)); // destroy cnnl descriptor DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc); diff --git a/kernels/fft/common/fft_basic_ops.cpp b/kernels/fft/common/fft_basic_ops.cpp index b928cfe13..39eae3b71 100644 --- a/kernels/fft/common/fft_basic_ops.cpp +++ b/kernels/fft/common/fft_basic_ops.cpp @@ -495,10 +495,10 @@ mluOpStatus_t fftGetBatchMatMulBcastWorkspaceSize( cnnlMatMulHeuristicResult_t heuristic_result; CALL_CNNL(cnnlCreateMatMulHeuristicResult(&heuristic_result)); int requested_algo_count = 1, return_algo_count = 0; - cnnlGetBatchMatMulAlgoHeuristic( + cnnlGetBatchMatMulExAlgoHeuristic( cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL, requested_algo_count, &heuristic_result, &return_algo_count); - cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size); + cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size); // destroy descriptor // destroy cnnl descriptor DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); @@ -595,20 +595,20 @@ mluOpStatus_t fftBatchMatMulBcast( alpha = 1.0; beta = 0.0; int requested_algo_count = 1, return_algo_count = 0; - cnnlGetBatchMatMulAlgoHeuristic( + cnnlGetBatchMatMulExAlgoHeuristic( cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL, requested_algo_count, &heuristic_result, &return_algo_count); - cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size); + cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size); if (workspace_size > 0) { CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); } else { CNRT_CHECK(cnrtMalloc((void **)&workspace, m * n * sizeof(float))); } - CALL_CNNL(cnnlBatchMatMulBCast_v2(cnnl_handle, bmm_bcast_desc, algo, &alpha, - cnnl_a_desc, a_ptr, cnnl_b_desc, b_ptr, - &beta, cnnl_c_desc, c_ptr, - (void *)workspace, workspace_size)); + CALL_CNNL(cnnlBatchMatMulEx(cnnl_handle, bmm_bcast_desc, algo, &alpha, + cnnl_a_desc, a_ptr, cnnl_b_desc, b_ptr, + &beta, cnnl_c_desc, c_ptr, + (void *)workspace, 
workspace_size)); // destroy descriptor // destroy cnnl descriptor DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); diff --git a/kernels/fft/common/fft_common_kernels.mlu b/kernels/fft/common/fft_common_kernels.mlu index 8cca3a697..d9e48157d 100644 --- a/kernels/fft/common/fft_common_kernels.mlu +++ b/kernels/fft/common/fft_common_kernels.mlu @@ -109,7 +109,8 @@ __mlu_func__ void selectVec(float *src_addr, int32_t *offset_int_addr, __asm__ volatile( "gather.clean.nram.nram.nram.b32.u32 " "[%[dst]], [%[src]], [%[offset]], %[data_num];\n\t" ::[dst] "r"(dst_addr), - [src] "r"(src_addr), [offset] "r"(offset_int_addr), [data_num] "r"(deal_size)); + [ src ] "r"(src_addr), [ offset ] "r"(offset_int_addr), + [ data_num ] "r"(deal_size)); #else for (auto i = 0; i < deal_size; i++) { dst_addr[i] = src_addr[offset_int_addr[i]]; diff --git a/kernels/fft/fft.cpp b/kernels/fft/fft.cpp index 4d4ab9ef1..8886e0453 100644 --- a/kernels/fft/fft.cpp +++ b/kernels/fft/fft.cpp @@ -1657,7 +1657,7 @@ mluOpAllocateC2C1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, fft_plan->is_batch_contiguous) ? 0 : buffer_size; - if (fft_plan->n[0] > fft_plan->inembed[0]) { + if (fft_plan->n[0] != fft_plan->inembed[0]) { workspace_size += buffer_size; } size_t twiddles_size = in_c_dtype_size * nfft * 2; @@ -1701,7 +1701,7 @@ mluOpAllocateR2C1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */ + twiddles_size * 2 + DFT_TABLE_SIZE * 2; /* twiddles */ - if (fft_plan->n[0] > fft_plan->inembed[0]) { + if (fft_plan->n[0] != fft_plan->inembed[0]) { workspace_size += buffer_size; // input_pad_addr } fft_plan->workspace_size = workspace_size; @@ -1721,18 +1721,18 @@ mluOpStatus_t MLUOP_WIN_API mluOpAllocateC2C2D( size_t in_c_dtype_size = mluOpDataTypeBytes(in_c_dtype); int batch = fft_plan->batch; - const int _n0 = fft_plan->n[0]; - const int _n1 = fft_plan->n[1]; + const int n0_ori = fft_plan->n[0]; + const int n1_ori = fft_plan->n[1]; - size_t buffer_size = batch * in_c_dtype_size * _n0 * _n1; + size_t buffer_size = batch * in_c_dtype_size * n0_ori * n1_ori; - size_t twiddles_size = in_c_dtype_size * _n0; - size_t twiddles_size_2d = in_c_dtype_size * _n1; + size_t twiddles_size = in_c_dtype_size * n0_ori; + size_t twiddles_size_2d = in_c_dtype_size * n1_ori; if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { - reservespace_size = - (in_c_dtype_size * _n0 * _n0 + in_c_dtype_size * _n1 * _n1) * - 2; /* DFT matrix */ + reservespace_size = (in_c_dtype_size * n0_ori * n0_ori + + in_c_dtype_size * n1_ori * n1_ori) * + 2; /* DFT matrix */ workspace_size = buffer_size * 6; } else if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) { reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */ @@ -1740,13 +1740,17 @@ mluOpStatus_t MLUOP_WIN_API mluOpAllocateC2C2D( DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 + DFT_TABLE_SIZE * 2; /* twiddles */ workspace_size = buffer_size * 2; - workspace_size += (fft_plan->is_input_contiguous) ? 0 : buffer_size; + workspace_size += (fft_plan->is_input_contiguous && + fft_plan->inembed[0] <= fft_plan->n[0] && + fft_plan->inembed[1] <= fft_plan->n[1]) + ? 0 + : buffer_size; workspace_size += (fft_plan->is_output_contiguous) ? 
0 : buffer_size; } fft_plan->workspace_size = workspace_size; - if (fft_plan->n[0] > fft_plan->inembed[0] || - fft_plan->n[1] > fft_plan->inembed[1]) { + if (fft_plan->n[0] != fft_plan->inembed[0] || + fft_plan->n[1] != fft_plan->inembed[1]) { fft_plan->workspace_size = workspace_size + buffer_size; // input_pad_addr } fft_plan->reservespace_size = reservespace_size; @@ -1783,7 +1787,7 @@ mluOpAllocateC2R1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */ + twiddles_size * 2 + DFT_TABLE_SIZE * 2; /* twiddles */ - if (fft_plan->n[0] > fft_plan->inembed[0]) { + if (fft_plan->n[0] != fft_plan->inembed[0]) { workspace_size += buffer_size; // input_pad_addr } fft_plan->workspace_size = workspace_size; @@ -1791,11 +1795,58 @@ mluOpAllocateC2R1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, return MLUOP_STATUS_SUCCESS; } +mluOpStatus_t MLUOP_WIN_API mluOpAllocateIRFFT2D( + mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, + mluOpTensorDescriptor_t input_desc, mluOpTensorDescriptor_t output_desc, + const int n0_ori, const int n1_ori) { + const std::string make_plan_api = "[mluOpAllocateIRFFT2D]"; + size_t workspace_size = 0, reservespace_size = 0; + + mluOpDataType_t out_c_dtype = fft_plan->output_dtype; + mluOpDataType_t in_c_dtype = fft_plan->input_dtype; + size_t complex_dtype_size = + (mluOpDataTypeBytes(out_c_dtype) > mluOpDataTypeBytes(in_c_dtype)) + ? mluOpDataTypeBytes(out_c_dtype) + : mluOpDataTypeBytes(in_c_dtype); + + int batch = fft_plan->batch; + size_t buffer_size = batch * complex_dtype_size * n0_ori * n1_ori; + + size_t twiddles_size = complex_dtype_size * n0_ori; + size_t twiddles_size_2d = complex_dtype_size * n1_ori; + + if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { + reservespace_size = + complex_dtype_size * n0_ori * n0_ori * 2 + + complex_dtype_size * n1_ori * n1_ori * 2; /* DFT matrix */ + workspace_size = complex_dtype_size * n1_ori * n0_ori * batch * 6; + } else if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) { + reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */ + + sizeof(int) * (FFT_MAXFACTORS) + twiddles_size * 2 + + DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 + + DFT_TABLE_SIZE * 2; /* twiddles */ + workspace_size = buffer_size * 2; + workspace_size += (fft_plan->is_input_contiguous && + fft_plan->inembed[0] <= fft_plan->n[0] && + fft_plan->inembed[1] <= fft_plan->n[1] / 2 + 1) + ? 0 + : buffer_size; + workspace_size += (fft_plan->is_output_contiguous) ? 
0 : buffer_size; + } + + if (fft_plan->n[0] != fft_plan->inembed[0] || + fft_plan->n[1] != fft_plan->inembed[1]) { + workspace_size += buffer_size; + } + fft_plan->workspace_size = workspace_size; + fft_plan->reservespace_size = reservespace_size; + return MLUOP_STATUS_SUCCESS; +} mluOpStatus_t MLUOP_WIN_API mluOpAllocateRFFT2D( mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, mluOpTensorDescriptor_t input_desc, mluOpTensorDescriptor_t output_desc, - const int _n0, const int _n1) { + const int n0_ori, const int n1_ori) { const std::string make_plan_api = "[mluOpAllocateRFFT2D]"; size_t workspace_size = 0, reservespace_size = 0; @@ -1807,27 +1858,32 @@ mluOpStatus_t MLUOP_WIN_API mluOpAllocateRFFT2D( : mluOpDataTypeBytes(in_c_dtype); int batch = fft_plan->batch; - size_t buffer_size = batch * complex_dtype_size * _n0 * _n1; + size_t buffer_size = batch * complex_dtype_size * n0_ori * n1_ori; - size_t twiddles_size = complex_dtype_size * _n0; - size_t twiddles_size_2d = complex_dtype_size * _n1; + size_t twiddles_size = complex_dtype_size * n0_ori; + size_t twiddles_size_2d = complex_dtype_size * n1_ori; if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { - reservespace_size = complex_dtype_size * _n0 * _n0 * 2 + - complex_dtype_size * _n1 * _n1 * 2; /* DFT matrix */ - workspace_size = complex_dtype_size * _n1 * _n0 * batch * 6; + reservespace_size = + complex_dtype_size * n0_ori * n0_ori * 2 + + complex_dtype_size * n1_ori * n1_ori * 2; /* DFT matrix */ + workspace_size = complex_dtype_size * n1_ori * n0_ori * batch * 6; } else if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) { reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */ + sizeof(int) * (FFT_MAXFACTORS) + twiddles_size * 2 + DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 + DFT_TABLE_SIZE * 2; /* twiddles */ workspace_size = buffer_size * 2; - workspace_size += (fft_plan->is_input_contiguous) ? 0 : buffer_size; + workspace_size += (fft_plan->is_input_contiguous && + fft_plan->inembed[0] <= fft_plan->n[0] && + fft_plan->inembed[1] <= fft_plan->n[1]) + ? 0 + : buffer_size; workspace_size += (fft_plan->is_output_contiguous) ? 
0 : buffer_size; } - if (fft_plan->n[0] > fft_plan->inembed[0] || - fft_plan->n[1] > fft_plan->inembed[1]) { + if (fft_plan->n[0] != fft_plan->inembed[0] || + fft_plan->n[1] != fft_plan->inembed[1]) { workspace_size += buffer_size; } fft_plan->workspace_size = workspace_size; @@ -1846,6 +1902,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanC2C1D( const int rank, const int *n) { fft_plan->is_batch_contiguous = (fft_plan->idist == 1 && fft_plan->odist == 1 && + fft_plan->inembed[0] == fft_plan->n[0] && + fft_plan->onembed[0] == fft_plan->n[0] && fft_plan->istride == fft_plan->batch && fft_plan->ostride == fft_plan->batch) && (fft_plan->n[0] == fft_plan->inembed[0]); @@ -2221,7 +2279,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanC2R2D( fft_plan->fft_strategy = CNFFT_FUNC_TWO_LEVEL_STOCKHAM; } - mluOpAllocateRFFT2D(handle, fft_plan, input_desc, output_desc, n[0], n[1]); + mluOpAllocateIRFFT2D(handle, fft_plan, input_desc, output_desc, n[0], n[1]); if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { switch (fft_plan->fft_type) { @@ -2394,6 +2452,12 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanMany( fft_plan->inembed[i] = input_desc->dims[fft_plan->idim - rank + i]; fft_plan->onembed[i] = output_desc->dims[fft_plan->odim - rank + i]; } + for (auto i = 0; i < fft_plan->idim; i++) { + fft_plan->in_stride[i] = input_desc->strides[i]; + } + for (auto i = 0; i < fft_plan->odim; i++) { + fft_plan->out_stride[i] = output_desc->strides[i]; + } if (fft_plan->idim == rank + 1) { fft_plan->idist = input_desc->strides[0]; fft_plan->odist = output_desc->strides[0]; diff --git a/kernels/fft/fft.h b/kernels/fft/fft.h index aa7ac0ba6..6f31a7751 100644 --- a/kernels/fft/fft.h +++ b/kernels/fft/fft.h @@ -180,6 +180,8 @@ struct cnfftButterflyAddrs { int *factors; int *factors_2d; void *input_pad_addr; + void *input_copy_workspace_addr; + void *output_copy_workspace_addr; }; struct mluOpFFTStruct { int rank; // rank of FFT @@ -193,24 +195,26 @@ struct mluOpFFTStruct { int inum; // element num of input tensor int istride; // distance between two successive input elements in the // innermost dimension - int idist; // distance between the first element of two consecutive signals - // in a batch of the input data - int odim; // the dimension size of output tensor + int in_stride[FFT_DIM_MAX + 1]; + int idist; // distance between the first element of two consecutive signals + // in a batch of the input data + int odim; // the dimension size of output tensor int onembed[FFT_DIM_MAX]; // Pointer of size rank that indicates the storage // dimensions of the output data in memory int onum; // element num of output tensor int ostride; // distance between two successive output elements in the // innermost dimension - int odist; // distance between the first element of two consecutive signals - // in a batch of the output data - int batch; // batch size for this transform - int L; // n = L * 2^m, L size for this transform - int m; // n = L * 2^m, m size for this transform - int s; // The size that can be put down on NRAM: L * 2^s, only used by - // Cooley-Tukey algorithm - int L_sub; // The size that can be put down on NRAM: L_sub * 2^m, only used - // by Stockham algorithm - int prime; // wether fft1d'size contains a prime number > 64 + int out_stride[FFT_DIM_MAX + 1]; + int odist; // distance between the first element of two consecutive signals + // in a batch of the output data + int batch; // batch size for this transform + int L; // n = L * 2^m, L size for this transform + int m; // n = L * 2^m, m size for this 
transform
+  int s;       // The size that can be put down on NRAM: L * 2^s, only used by
+               // Cooley-Tukey algorithm
+  int L_sub;   // The size that can be put down on NRAM: L_sub * 2^m, only used
+               // by Stockham algorithm
+  int prime;   // whether fft1d's size contains a prime number > 64
   bool is_input_contiguous;
   bool is_output_contiguous;
   bool is_batch_contiguous;
diff --git a/kernels/fft/fft_optm_device/fft_c2c_stockham_nram.h b/kernels/fft/fft_optm_device/fft_c2c_stockham_nram.h
index 547174631..07d31dea1 100644
--- a/kernels/fft/fft_optm_device/fft_c2c_stockham_nram.h
+++ b/kernels/fft/fft_optm_device/fft_c2c_stockham_nram.h
@@ -305,379 +305,6 @@ __mlu_func__ void computeLargeButterflyFirststageBatchPingpong(
   }
 }
 
-// Compute the large butterfly for the subsequent stages of the FFT
-template <typename DT>
-__mlu_func__ void computeLargeButterflyOtherstages(
-    DT *output, DT *input, const int large_radix, const DT *cur_large_twiddles,
-    const DT *_twiddles, const DT *dft_matrix, const int large_section_num,
-    const int large_butterfly_num, const int large_in_stride, void *nram_buf,
-    const int *small_factors, const int nfft, const int dir,
-    const int last_stage) {
-  const dft_table_entry *dft_table = (const dft_table_entry *)dft_matrix;
-  const int K_num = 64 / sizeof(DT);
-  int align_K = 0;
-  int radix, small_in_stride, small_stage_count, _small_stage_count;
-  int small_section_num, small_butterfly_num, value_mul;
-
-  const int large_out_stride = large_butterfly_num;
-  int tw_offset;
-
-  _small_stage_count = small_factors[0];
-  tw_offset = small_factors[1];
-
-  const DT *small_twiddles = _twiddles + tw_offset * 2;
-
-  const int max_para_ldst_num = (4096 + large_radix - 1) / large_radix;
-
-  int nram_buf_offset = 0;
-  DT *nram_in_r = (DT *)nram_buf + nram_buf_offset;
-  nram_buf_offset += large_radix * max_para_ldst_num;
-
-  DT *nram_in_i = (DT *)nram_buf + nram_buf_offset;
-  nram_buf_offset += large_radix * max_para_ldst_num;
-
-  DT *nram_out_r = (DT *)nram_buf + nram_buf_offset;
-  nram_buf_offset += large_radix * max_para_ldst_num;
-
-  DT *nram_out_i = (DT *)nram_buf + nram_buf_offset;
-  nram_buf_offset += large_radix * max_para_ldst_num;
-
-  FFT_CPX_T<DT>
nram_para_load_in_ping = { - (DT *)nram_buf + nram_buf_offset, - (DT *)nram_buf + nram_buf_offset + large_radix * max_para_ldst_num}; - nram_buf_offset += large_radix * max_para_ldst_num * 2; - - FFT_CPX_T
<DT>
nram_para_load_in_pong = { - (DT *)nram_buf + nram_buf_offset, - (DT *)nram_buf + nram_buf_offset + large_radix * max_para_ldst_num}; - nram_buf_offset += large_radix * max_para_ldst_num * 2; - - FFT_CPX_T
<DT>
nram_para_load_tw_ping = { - (DT *)nram_buf + nram_buf_offset, - (DT *)nram_buf + nram_buf_offset + large_radix * max_para_ldst_num}; - nram_buf_offset += large_radix * max_para_ldst_num * 2; - - FFT_CPX_T
<DT>
nram_para_load_tw_pong = { - (DT *)nram_buf + nram_buf_offset, - (DT *)nram_buf + nram_buf_offset + large_radix * max_para_ldst_num}; - nram_buf_offset += large_radix * max_para_ldst_num * 2; - - FFT_CPX_T
<DT>
nram_para_store_ping = { - (DT *)nram_buf + nram_buf_offset, - (DT *)nram_buf + nram_buf_offset + large_radix * max_para_ldst_num}; - nram_buf_offset += large_radix * max_para_ldst_num * 2; - - FFT_CPX_T
<DT>
nram_para_store_pong = { - (DT *)nram_buf + nram_buf_offset, - (DT *)nram_buf + nram_buf_offset + large_radix * max_para_ldst_num}; - nram_buf_offset += large_radix * max_para_ldst_num * 2; - - FFT_CPX_T
<DT>
nram_transpose_temp; - nram_transpose_temp = { - (DT *)nram_in_r, - (DT *)nram_in_r + large_radix * ((int)last_stage) + - large_radix * (1 - (int)last_stage) * max_para_ldst_num}; - - DT *_nram_tw = (DT *)nram_buf + nram_buf_offset; - nram_buf_offset += large_radix * 2; - - int ld_dft_radix = -1; - const int max_radix = 64; - DT *nram_dftmtx = (DT *)nram_buf + nram_buf_offset; - nram_buf_offset += max_radix * max_radix * 2; - - DT *nram_scratch = (DT *)nram_buf + nram_buf_offset; - - DT *CPX_MUL_RR = nram_scratch; - DT *CPX_MUL_RI = &CPX_MUL_RR[large_radix * max_para_ldst_num]; - DT *CPX_MUL_IR = &CPX_MUL_RI[large_radix * max_para_ldst_num]; - DT *CPX_MUL_II = &CPX_MUL_IR[large_radix * max_para_ldst_num]; - - nram_buf_offset += large_radix * max_para_ldst_num * 4; - - int Fin_stride = 0, Fout_stride = 0; - int sec_count; - int repeat_num = - (large_butterfly_num + max_para_ldst_num - 1) / max_para_ldst_num; - for (sec_count = 0; sec_count < large_section_num; ++sec_count) { - for (int repeat_id = 0; repeat_id < repeat_num + 2; ++repeat_id) { - if (repeat_id < repeat_num) { - int i = max_para_ldst_num * repeat_id; - FFT_CPX_T
<DT>
nram_para_load_in = (repeat_id % 2 == 0) - ? nram_para_load_in_ping - : nram_para_load_in_pong; - - FFT_CPX_T
<DT>
nram_para_load_tw = (repeat_id % 2 == 0) - ? nram_para_load_tw_ping - : nram_para_load_tw_pong; - - int para_load_num = (max_para_ldst_num > (large_butterfly_num - i)) - ? (large_butterfly_num - i) - : max_para_ldst_num; - - __memcpy_async(nram_para_load_in.r, input + Fin_stride + i, - sizeof(DT) * para_load_num, GDRAM2NRAM, - sizeof(DT) * para_load_num, large_in_stride * sizeof(DT), - large_radix - 1); - __memcpy_async(nram_para_load_in.i, input + nfft + Fin_stride + i, - sizeof(DT) * para_load_num, GDRAM2NRAM, - sizeof(DT) * para_load_num, large_in_stride * sizeof(DT), - large_radix - 1); - __memcpy_async(nram_para_load_tw.r, cur_large_twiddles + i, - sizeof(DT) * para_load_num, SRAM2NRAM, - sizeof(DT) * para_load_num, - large_out_stride * sizeof(DT), large_radix - 2); - __memcpy_async( - nram_para_load_tw.i, - cur_large_twiddles + large_butterfly_num * (large_radix - 1) + i, - sizeof(DT) * para_load_num, SRAM2NRAM, sizeof(DT) * para_load_num, - large_out_stride * sizeof(DT), large_radix - 2); - } - - if (repeat_id >= 2) { - int i = max_para_ldst_num * (repeat_id - 2); - - int para_store_num = (max_para_ldst_num > (large_butterfly_num - i)) - ? (large_butterfly_num - i) - : max_para_ldst_num; - - FFT_CPX_T
<DT>
nram_para_store = - (repeat_id % 2 == 0) ? nram_para_store_ping : nram_para_store_pong; - - if (last_stage) { - __memcpy_async(output + (Fout_stride + i) * 2, nram_para_store.r, - sizeof(DT) * 2 * para_store_num, NRAM2GDRAM, - large_out_stride * 2 * sizeof(DT), - sizeof(DT) * 2 * para_store_num, large_radix - 1); - } else { - __memcpy_async(output + Fout_stride + i, nram_para_store.r, - para_store_num * sizeof(DT), NRAM2GDRAM, - large_out_stride * sizeof(DT), - sizeof(DT) * para_store_num, large_radix - 1); - __memcpy_async(output + Fout_stride + i + nfft, nram_para_store.i, - para_store_num * sizeof(DT), NRAM2GDRAM, - large_out_stride * sizeof(DT), - sizeof(DT) * para_store_num, large_radix - 1); - } - } - - if (repeat_id >= 1 && repeat_id < repeat_num + 1) { - int i = max_para_ldst_num * (repeat_id - 1); - - FFT_CPX_T
<DT>
nram_para_load_in = (repeat_id % 2 != 0) - ? nram_para_load_in_ping - : nram_para_load_in_pong; - - FFT_CPX_T
<DT>
nram_para_load_tw = (repeat_id % 2 != 0) - ? nram_para_load_tw_ping - : nram_para_load_tw_pong; - - FFT_CPX_T
<DT>
nram_para_store = - (repeat_id % 2 != 0) ? nram_para_store_ping : nram_para_store_pong; - - int para_ldst_num = (max_para_ldst_num > (large_butterfly_num - i)) - ? (large_butterfly_num - i) - : max_para_ldst_num; - - __bang_mul(CPX_MUL_RR, nram_para_load_in.r + para_ldst_num, - nram_para_load_tw.r, para_ldst_num * (large_radix - 1)); - __bang_mul(CPX_MUL_II, nram_para_load_in.i + para_ldst_num, - nram_para_load_tw.i, para_ldst_num * (large_radix - 1)); - __bang_mul(CPX_MUL_RI, nram_para_load_in.r + para_ldst_num, - nram_para_load_tw.i, para_ldst_num * (large_radix - 1)); - __bang_mul(CPX_MUL_IR, nram_para_load_in.i + para_ldst_num, - nram_para_load_tw.r, para_ldst_num * (large_radix - 1)); - - __bang_sub(nram_para_load_in.r + para_ldst_num, CPX_MUL_RR, CPX_MUL_II, - para_ldst_num * (large_radix - 1)); - __bang_add(nram_para_load_in.i + para_ldst_num, CPX_MUL_RI, CPX_MUL_IR, - para_ldst_num * (large_radix - 1)); - - { - radix = small_factors[4]; - small_section_num = small_factors[5]; - small_in_stride = small_factors[7]; - small_stage_count = _small_stage_count; - - if (ld_dft_radix != radix) { - ld_dft_radix = radix; - for (int entry = 0;; entry++) { - if (dft_table[entry].radix == ld_dft_radix) { - align_K = K_num * ((radix + K_num - 1) / K_num); - __memcpy_async( - nram_dftmtx, &dft_matrix[dft_table[entry].offset * 2], - sizeof(DT) * 2 * ld_dft_radix * align_K, SRAM2NRAM); - __sync_move(); - break; - } - - if (dft_table[entry].radix == -1) { - break; - } - } - } - - computeGenericButterflyFirststageMat( - nram_out_r, nram_out_i, nram_para_load_in.r, nram_para_load_in.i, - nram_scratch, nram_dftmtx, small_section_num * para_ldst_num, - small_section_num * para_ldst_num, 1, dir, radix); - - small_stage_count--; - if (small_stage_count == 0) { - if (last_stage) { - __memcpy_async(nram_transpose_temp.r, nram_out_r, - sizeof(DT) * large_radix, NRAM2NRAM, - sizeof(DT) * large_radix * 2, - sizeof(DT) * large_radix, para_ldst_num - 1); - - __memcpy_async(nram_transpose_temp.i, nram_out_i, - sizeof(DT) * large_radix, NRAM2NRAM, - sizeof(DT) * large_radix * 2, - sizeof(DT) * large_radix, para_ldst_num - 1); - __sync_move(); - - __bang_transpose(nram_para_store.r, nram_transpose_temp.r, - para_ldst_num * 2, large_radix); - } else { - __bang_transpose(nram_para_store.r, nram_out_r, para_ldst_num, - large_radix); - __bang_transpose(nram_para_store.i, nram_out_i, para_ldst_num, - large_radix); - } - - } else { - FFT_SWAP_PTR(nram_out_r, nram_in_r); - FFT_SWAP_PTR(nram_out_i, nram_in_i); - TRANSPOSE_XYZ2YXZ_PAIR(nram_out_r, nram_out_i, nram_in_r, nram_in_i, - small_section_num, para_ldst_num, radix, DT) - DT *nram_tw = _nram_tw; - value_mul = 8; - - for (; small_stage_count > 1; small_stage_count--) { - FFT_SWAP_PTR(nram_out_r, nram_in_r); - FFT_SWAP_PTR(nram_out_i, nram_in_i); - - radix = small_factors[value_mul++]; - small_section_num = small_factors[value_mul++]; - small_butterfly_num = small_factors[value_mul++]; - small_in_stride = small_factors[value_mul++]; - - if (ld_dft_radix != radix) { - ld_dft_radix = radix; - for (int entry = 0;; entry++) { - if (dft_table[entry].radix == ld_dft_radix) { - align_K = K_num * ((radix + K_num - 1) / K_num); - __memcpy_async( - nram_dftmtx, &dft_matrix[dft_table[entry].offset * 2], - sizeof(DT) * 2 * ld_dft_radix * align_K, SRAM2NRAM); - __sync_move(); - break; - } - - if (dft_table[entry].radix == -1) { - break; - } - } - } - - if (sec_count == 0 && repeat_id == 1) { - __memcpy(nram_tw, small_twiddles, - small_butterfly_num * (radix - 1) * sizeof(DT) * 2, - 
SRAM2NRAM); - small_twiddles += small_butterfly_num * (radix - 1) * 2; - } - - computeGenericButterflyOtherstagesMat( - nram_out_r, nram_out_i, nram_in_r, nram_in_i, nram_scratch, - nram_dftmtx, nram_tw, small_section_num, small_butterfly_num, - para_ldst_num, small_in_stride, dir, radix); - - nram_tw += small_butterfly_num * (radix - 1) * 2; - } - - { - FFT_SWAP_PTR(nram_out_r, nram_in_r); - FFT_SWAP_PTR(nram_out_i, nram_in_i); - - radix = small_factors[value_mul++]; - small_section_num = small_factors[value_mul++]; - small_butterfly_num = small_factors[value_mul++]; - small_in_stride = small_factors[value_mul]; - - if (sec_count == 0 && repeat_id == 1) { - __memcpy_async( - nram_tw, small_twiddles, - small_butterfly_num * (radix - 1) * sizeof(DT) * 2, - SRAM2NRAM); - __sync_move(); - } - - if (ld_dft_radix != radix) { - ld_dft_radix = radix; - for (int entry = 0;; entry++) { - if (dft_table[entry].radix == ld_dft_radix) { - align_K = K_num * ((radix + K_num - 1) / K_num); - __memcpy_async( - nram_dftmtx, &dft_matrix[dft_table[entry].offset * 2], - sizeof(DT) * 2 * ld_dft_radix * align_K, SRAM2NRAM); - __sync_move(); - break; - } - - if (dft_table[entry].radix == -1) { - break; - } - } - } - computeGenericButterflyLaststageMat( - nram_out_r, nram_out_i, nram_in_r, nram_in_i, nram_scratch, - nram_dftmtx, nram_tw, small_section_num, small_butterfly_num, - para_ldst_num, small_in_stride, dir, radix); - - if (last_stage) { - __memcpy_async(nram_transpose_temp.r, nram_out_r, - sizeof(DT) * large_radix, NRAM2NRAM, - sizeof(DT) * large_radix * 2, - sizeof(DT) * large_radix, para_ldst_num - 1); - - __memcpy_async(nram_transpose_temp.i, nram_out_i, - sizeof(DT) * large_radix, NRAM2NRAM, - sizeof(DT) * large_radix * 2, - sizeof(DT) * large_radix, para_ldst_num - 1); - __sync_move(); - - __bang_transpose(nram_para_store.r, nram_transpose_temp.r, - para_ldst_num * 2, large_radix); - } else { - __bang_transpose(nram_para_store.r, nram_out_r, para_ldst_num, - large_radix); - __bang_transpose(nram_para_store.i, nram_out_i, para_ldst_num, - large_radix); - } - } - } - } - } - - __sync(); - } - Fin_stride += large_butterfly_num; - Fout_stride += large_radix * large_butterfly_num; - } -} - -template -__mlu_func__ void computeLargeButterflyLaststage( - DT *output, DT *input, const int large_radix, const DT *cur_large_twiddles, - const DT *_twiddles, const DT *dft_matrix, const int large_section_num, - const int large_butterfly_num, const int large_in_stride, void *nram_buf, - const int *small_factors, const int nfft, const int dir) { - computeLargeButterflyOtherstages( - output, input, large_radix, cur_large_twiddles, _twiddles, dft_matrix, - large_section_num, large_butterfly_num, large_in_stride, nram_buf, - small_factors, nfft, dir, 1); -} - // Compute the large butterfly for the last stage of the FFT template __mlu_func__ void computeLargeButterflyOtherstagesBatchPingpong( diff --git a/kernels/fft/fft_optm_device/fft_two-level_network_c2c_device.mlu b/kernels/fft/fft_optm_device/fft_two-level_network_c2c_device.mlu index 55b3a37b6..808c91df1 100644 --- a/kernels/fft/fft_optm_device/fft_two-level_network_c2c_device.mlu +++ b/kernels/fft/fft_optm_device/fft_two-level_network_c2c_device.mlu @@ -35,26 +35,10 @@ __mlu_global__ void MLUKernelFFT1dButterflyRow( void *input, void *output, int *factors, void *twiddles, void *twiddles_end, void *dft_matrix, void *buffer, const int batch, const int fft_flag, const int direction, const int dtype_size) { - switch (dtype_size) { - case (MLUOP_DTYPE_COMPLEX_FLOAT): - 
case (MLUOP_DTYPE_FLOAT): { - computeMutiStageOnchip((float *)input, (float *)output, factors, - (float *)twiddles, (float *)twiddles_end, - (float *)dft_matrix, (float *)buffer, batch, - fft_flag, direction); - }; break; - case (MLUOP_DTYPE_COMPLEX_HALF): - case (MLUOP_DTYPE_HALF): { - computeMutiStageOnchip((half *)input, (half *)output, factors, - (half *)twiddles, (half *)twiddles_end, - (half *)dft_matrix, (half *)buffer, batch, - fft_flag, direction); - }; break; - - default: { - MLULOG("mluOpFFT Not Implemented."); - } - } + computeMutiStageOnchip((float *)input, (float *)output, factors, + (float *)twiddles, (float *)twiddles_end, + (float *)dft_matrix, (float *)buffer, batch, + fft_flag, direction); } // Kernel function for 1D FFT butterfly operations on columns. @@ -62,26 +46,10 @@ __mlu_global__ void MLUKernelFFT1dButterflyColumn( void *input, void *output, int *factors, void *twiddles, void *twiddles_end, void *dft_matrix, void *buffer, const int batch, const int fft_flag, const int direction, const int dtype_size, const int nb) { - switch (dtype_size) { - case (MLUOP_DTYPE_COMPLEX_FLOAT): - case (MLUOP_DTYPE_FLOAT): { - computeMutiStageOnchipColumn( - (float *)input, (float *)output, factors, (float *)twiddles, - (float *)twiddles_end, (float *)dft_matrix, (float *)buffer, batch, - fft_flag, direction, nb); - }; break; - case (MLUOP_DTYPE_COMPLEX_HALF): - case (MLUOP_DTYPE_HALF): { - computeMutiStageOnchipColumn((half *)input, (half *)output, factors, - (half *)twiddles, (half *)twiddles_end, - (half *)dft_matrix, (half *)buffer, - batch, fft_flag, direction, nb); - }; break; - - default: { - MLULOG("mluOpFFT Not Implemented."); - } - } + computeMutiStageOnchipColumn((float *)input, (float *)output, factors, + (float *)twiddles, (float *)twiddles_end, + (float *)dft_matrix, (float *)buffer, + batch, fft_flag, direction, nb); } // Launches a kernel for 2D FFT butterfly operations on columns. 
diff --git a/kernels/fft/fft_optm_device/fft_two-level_network_c2r_device.mlu b/kernels/fft/fft_optm_device/fft_two-level_network_c2r_device.mlu index 31b3c3908..a76078f62 100644 --- a/kernels/fft/fft_optm_device/fft_two-level_network_c2r_device.mlu +++ b/kernels/fft/fft_optm_device/fft_two-level_network_c2r_device.mlu @@ -33,26 +33,10 @@ __mlu_global__ void MLUKernelFFT1dButterflyRowC2R( void *input, void *output, int *factors, void *twiddles, void *twiddles_end, void *dft_matrix, void *buffer, int batch, int fft_flag, int dtype_size) { - switch (dtype_size) { - case (MLUOP_DTYPE_COMPLEX_FLOAT): - case (MLUOP_DTYPE_FLOAT): { - computeMutiStageOnchipC2R((float *)input, (float *)output, factors, - (float *)twiddles, (float *)twiddles_end, - (float *)dft_matrix, (float *)buffer, - batch, fft_flag); - }; break; - case (MLUOP_DTYPE_COMPLEX_HALF): - case (MLUOP_DTYPE_HALF): { - computeMutiStageOnchipC2R((half *)input, (half *)output, factors, - (half *)twiddles, (half *)twiddles_end, - (half *)dft_matrix, (half *)buffer, batch, - fft_flag); - }; break; - - default: { - MLULOG("mluOpFFT Not Implemented."); - } - } + computeMutiStageOnchipC2R((float *)input, (float *)output, factors, + (float *)twiddles, (float *)twiddles_end, + (float *)dft_matrix, (float *)buffer, batch, + fft_flag); } mluOpStatus_t MLUOP_WIN_API kernelFFT1dButterflyRowC2R( diff --git a/kernels/fft/fft_optm_device/fft_two-level_network_r2c_device.mlu b/kernels/fft/fft_optm_device/fft_two-level_network_r2c_device.mlu index 5dd5f9e8d..3e36946b7 100644 --- a/kernels/fft/fft_optm_device/fft_two-level_network_r2c_device.mlu +++ b/kernels/fft/fft_optm_device/fft_two-level_network_r2c_device.mlu @@ -33,28 +33,10 @@ __mlu_global__ void MLUKernelFFT1dButterflyR2C( void *input, void *output, int *factors, void *twiddles, void *twiddles_end, void *dft_matrix, void *buffer, int batch, int fft_flag, int dtype_size) { - switch (dtype_size) { - case (MLUOP_DTYPE_COMPLEX_FLOAT): - case (MLUOP_DTYPE_FLOAT): { - MLULOG("MLUOP_DTYPE_COMPLEX_FLOAT: MLUOP_DTYPE_FLOAT\n"); - computeMutiStageR2COnchip((float *)input, (float *)output, factors, - (float *)twiddles, (float *)twiddles_end, - (float *)dft_matrix, (float *)buffer, - batch, fft_flag); - }; break; - case (MLUOP_DTYPE_COMPLEX_HALF): - case (MLUOP_DTYPE_HALF): { - MLULOG("MLUOP_DTYPE_COMPLEX_HALF: MLUOP_DTYPE_HALF\n"); - computeMutiStageR2COnchip((half *)input, (half *)output, factors, - (half *)twiddles, (half *)twiddles_end, - (half *)dft_matrix, (half *)buffer, batch, - fft_flag); - }; break; - - default: { - MLULOG("mluOpFFT Not Implemented."); - } - } + computeMutiStageR2COnchip((float *)input, (float *)output, factors, + (float *)twiddles, (float *)twiddles_end, + (float *)dft_matrix, (float *)buffer, batch, + fft_flag); } mluOpStatus_t MLUOP_WIN_API kernelFFT1dButterflyR2C(cnrtDim3_t k_dim, diff --git a/kernels/fft/irfft/irfft_host.cpp b/kernels/fft/irfft/irfft_host.cpp index b065028e1..5d2543ad7 100644 --- a/kernels/fft/irfft/irfft_host.cpp +++ b/kernels/fft/irfft/irfft_host.cpp @@ -795,14 +795,14 @@ static mluOpStatus_t makeIRFFT1dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 2; - int64_t dims[in_dim_num] = { + const int IN_DIM_NUM = 2; + int64_t dims[IN_DIM_NUM] = { fft_plan->batch, fft_plan->prime ? 
fft_plan->inembed[0] : std::min(fft_plan->inembed[0], FFT_HALF(fft_plan->n[0]))}; - int64_t strides[in_dim_num] = {fft_plan->idist, fft_plan->istride}; + int64_t strides[IN_DIM_NUM] = {fft_plan->idist, fft_plan->istride}; status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->input_dtype, in_dim_num, + fft_plan->input_dtype, IN_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); @@ -840,15 +840,15 @@ static mluOpStatus_t padIRFFT1dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&padded_input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 2; - int64_t dims[in_dim_num] = {batch, fft_plan->inembed[0] * COMPLEX}; + const int IN_DIM_NUM = 2; + int64_t dims[IN_DIM_NUM] = {batch, fft_plan->inembed[0] * COMPLEX}; status = mluOpSetTensorDescriptor_v2(input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, dims); + in_r_dtype, IN_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - int64_t padded_dims[in_dim_num] = {batch, FFT_HALF(n) * COMPLEX}; + int64_t padded_dims[IN_DIM_NUM] = {batch, FFT_HALF(n) * COMPLEX}; status = mluOpSetTensorDescriptor_v2(padded_input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, padded_dims); + in_r_dtype, IN_DIM_NUM, padded_dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); const int pad_dim_num = 4; @@ -908,17 +908,17 @@ static mluOpStatus_t padIRFFT2dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&padded_input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 3; - int64_t dims[in_dim_num] = { + const int IN_DIM_NUM = 3; + int64_t dims[IN_DIM_NUM] = { batch, std::min(fft_plan->inembed[0], n0), std::min(fft_plan->inembed[1], FFT_HALF(n1)) * COMPLEX}; status = mluOpSetTensorDescriptor_v2(input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, dims); + in_r_dtype, IN_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - int64_t padded_dims[in_dim_num] = {batch, n0, FFT_HALF(n1) * COMPLEX}; + int64_t padded_dims[IN_DIM_NUM] = {batch, n0, FFT_HALF(n1) * COMPLEX}; status = mluOpSetTensorDescriptor_v2(padded_input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, padded_dims); + in_r_dtype, IN_DIM_NUM, padded_dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); const int pad_dim_num = 6; @@ -1461,17 +1461,17 @@ static mluOpStatus_t makeIRFFT1dContiguousOutput(mluOpHandle_t handle, INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // set up tensor desc - const int out_dim_num = 2; - int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->prime + const int OUT_DIM_NUM = 2; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->prime ? 
fft_plan->onembed[0] : fft_plan->n[0]}; - int64_t strides[out_dim_num] = {fft_plan->odist, fft_plan->ostride}; + int64_t strides[OUT_DIM_NUM] = {fft_plan->odist, fft_plan->ostride}; status = mluOpSetTensorDescriptor_v2(copy_src_desc, MLUOP_LAYOUT_ARRAY, - out_r_dtype, out_dim_num, dims); + out_r_dtype, OUT_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorEx_v2(copy_dst_desc, MLUOP_LAYOUT_ARRAY, - out_r_dtype, out_dim_num, dims, strides); + out_r_dtype, OUT_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // copy @@ -1486,17 +1486,8 @@ static mluOpStatus_t makeIRFFT1dContiguousOutput(mluOpHandle_t handle, DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(copy_dst_desc, cnnl_copy_dst_desc); - size_t workspace_size = 0; - CALL_CNNL(cnnlGetCopyWorkspaceSize(cnnl_handle, cnnl_copy_src_desc, - cnnl_copy_dst_desc, &workspace_size)); - - void *workspace = nullptr; - if (workspace_size > 0) { - CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); - } CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_copy_src_desc, copy_src_addr, - cnnl_copy_dst_desc, output, workspace, - workspace_size)); + cnnl_copy_dst_desc, output, NULL, 0)); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_src_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_dst_desc); @@ -1567,12 +1558,16 @@ mluOpStatus_t execIRFFT1d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, const float beta = 0.0; mluOpTensorDescriptor_t c_desc = nullptr; status = mluOpCreateTensorDescriptor(&c_desc); - const int out_dim_num = 2; - int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0]}; + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + const int OUT_DIM_NUM = 2; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->n[0]}; status = mluOpSetTensorDescriptor_v2(c_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->output_dtype, 2, dims); + fft_plan->output_dtype, OUT_DIM_NUM, + dims); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorOnchipDataType( c_desc, fft_plan->execution_dtype); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); // convert to cnnl_handle @@ -1583,6 +1578,8 @@ mluOpStatus_t execIRFFT1d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, cnnl_output_desc, fft_plan->mlu_addrs.output, &beta, cnnl_output_desc, fft_plan->mlu_addrs.output)); + status = mluOpDestroyTensorDescriptor(c_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); DESTROY_CNNL_HANDLE(cnnl_handle); } @@ -1606,13 +1603,13 @@ static void configureIRFFT2dWorkspaceAddrs(mluOpHandle_t handle, size_t out_c_dtype_size = mluOpDataTypeBytes(out_c_dtype); int batch = fft_plan->batch; - int _n0 = fft_plan->n[0]; - int _n1 = fft_plan->n[1]; + int n0_ori = fft_plan->n[0]; + int n1_ori = fft_plan->n[1]; size_t offset = 0; if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { // rr ri ir ii - size_t buffer_size = batch * in_c_dtype_size * _n0 * _n1 * 2; + size_t buffer_size = batch * in_c_dtype_size * n0_ori * n1_ori * 2; offset = 0; fft_plan->mlu_addrs.input = input; fft_plan->mlu_addrs.output = output; @@ -1625,25 +1622,28 @@ static void configureIRFFT2dWorkspaceAddrs(mluOpHandle_t handle, if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) { offset = 0; fft_plan->mlu_addrs.buffer_buf = (uint8_t *)workspace + offset; - offset += batch * in_c_dtype_size * _n0 * _n1 * 2; + offset += batch * in_c_dtype_size * n0_ori * n1_ori * 2; - if 
(fft_plan->is_input_contiguous) { + if (fft_plan->is_input_contiguous && + fft_plan->inembed[0] <= fft_plan->n[0] && + fft_plan->inembed[1] <= fft_plan->n[1] / 2 + 1 || + fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { fft_plan->mlu_addrs.input = input; } else { fft_plan->mlu_addrs.input = (uint8_t *)workspace + offset; - offset += batch * in_c_dtype_size * _n0 * _n1; + offset += batch * in_c_dtype_size * n0_ori * n1_ori; } if (fft_plan->is_output_contiguous) { fft_plan->mlu_addrs.output = output; } else { fft_plan->mlu_addrs.output = (uint8_t *)workspace + offset; - offset += batch * in_c_dtype_size * _n0 * _n1; + offset += batch * in_c_dtype_size * n0_ori * n1_ori; } } if (fft_plan->n[0] > fft_plan->inembed[0] || - fft_plan->n[1] > fft_plan->inembed[1]) { + fft_plan->n[1] / 2 + 1 > fft_plan->inembed[1]) { fft_plan->mlu_addrs.input_pad_addr = (uint8_t *)workspace + offset; } } @@ -1828,11 +1828,11 @@ mluOpStatus_t computeFFT2dMatMulRowC2R(mluOpHandle_t handle, int requested_algo_count = 1, return_algo_count = 0; float *workspace; size_t workspace_size; - cnnlGetBatchMatMulAlgoHeuristic( + cnnlGetBatchMatMulExAlgoHeuristic( cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL, requested_algo_count, &heuristic_result, &return_algo_count); - cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size); + cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size); if (workspace_size > 0) { CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); @@ -1840,10 +1840,10 @@ mluOpStatus_t computeFFT2dMatMulRowC2R(mluOpHandle_t handle, CNRT_CHECK(cnrtMalloc((void **)&workspace, m * n * sizeof(float))); } - CALL_CNNL(cnnlBatchMatMulBCast_v2(cnnl_handle, bmm_bcast_desc, algo, &alpha, - cnnl_a_desc, dft_matrix_addr, cnnl_b_desc, - in_addr, &beta, cnnl_c_desc, out_addr, - (void *)workspace, workspace_size)); + CALL_CNNL(cnnlBatchMatMulEx(cnnl_handle, bmm_bcast_desc, algo, &alpha, + cnnl_a_desc, dft_matrix_addr, cnnl_b_desc, + in_addr, &beta, cnnl_c_desc, out_addr, + (void *)workspace, workspace_size)); // destroy cnnl descriptor DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc); @@ -1866,23 +1866,23 @@ static mluOpStatus_t makeIRFFT2dContiguousInput(mluOpHandle_t handle, auto status = MLUOP_STATUS_SUCCESS; if ((!fft_plan->is_input_contiguous || (fft_plan->inembed[0] > fft_plan->n[0] || - fft_plan->inembed[1] > fft_plan->n[1] / 2 + 1) && - !fft_plan->prime) && + fft_plan->inembed[1] > fft_plan->n[1] / 2 + 1)) && fft_plan->fft_strategy != CNFFT_FUNC_MANY_DIST1_2D) { VLOG(5) << "launch mluOpContiguous for irfft2d input"; mluOpTensorDescriptor_t input_desc; status = mluOpCreateTensorDescriptor(&input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 3; - int64_t dims[in_dim_num] = { + const int IN_DIM_NUM = 3; + int64_t dims[IN_DIM_NUM] = { fft_plan->batch, std::min(fft_plan->inembed[0], fft_plan->n[0]), std::min(FFT_HALF(fft_plan->n[1]), fft_plan->inembed[1])}; - int64_t strides[in_dim_num] = {fft_plan->idist, - (fft_plan->istride * fft_plan->inembed[1]), - fft_plan->istride}; + int64_t strides[IN_DIM_NUM]; // IN_DIM_NUM + for (int i = 0; i < IN_DIM_NUM; i++) { + strides[i] = fft_plan->in_stride[i]; + } status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->input_dtype, in_dim_num, + fft_plan->input_dtype, IN_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); @@ -1913,18 +1913,19 @@ static mluOpStatus_t 
makeIRFFT2dContiguousOutput(mluOpHandle_t handle, INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // set up tensor desc - const int out_dim_num = 3; - int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0], + const int OUT_DIM_NUM = 3; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->n[0], fft_plan->n[1]}; - int64_t strides[out_dim_num] = {fft_plan->odist, - fft_plan->ostride * fft_plan->onembed[1], - fft_plan->ostride}; + int64_t strides[OUT_DIM_NUM]; // OUT_DIM_NUM + for (int i = 0; i < OUT_DIM_NUM; i++) { + strides[i] = fft_plan->out_stride[i]; + } status = mluOpSetTensorDescriptor_v2(copy_src_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims); + out_c_dtype, OUT_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorEx_v2(copy_dst_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims, strides); + out_c_dtype, OUT_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // void *copy_src_addr = fft_plan->matmul_addrs.output_contiguous_addr; @@ -1937,17 +1938,8 @@ static mluOpStatus_t makeIRFFT2dContiguousOutput(mluOpHandle_t handle, DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(copy_dst_desc, cnnl_copy_dst_desc); - size_t workspace_size = 0; - CALL_CNNL(cnnlGetCopyWorkspaceSize(cnnl_handle, cnnl_copy_src_desc, - cnnl_copy_dst_desc, &workspace_size)); - - void *workspace = nullptr; - if (workspace_size > 0) { - CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); - } CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_copy_src_desc, copy_src_addr, - cnnl_copy_dst_desc, output, workspace, - workspace_size)); + cnnl_copy_dst_desc, output, NULL, 0)); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_src_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_dst_desc); @@ -1987,63 +1979,92 @@ mluOpStatus_t execIRFFT2d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, fft_plan->mlu_addrs.input = fft_plan->mlu_addrs.input_pad_addr; } - for (int batch_id = 0; batch_id < fft_plan->batch; batch_id++) { - status = kernelIRFFT2dButterflyColumn(k_dim, k_type, handle->queue, - fft_plan, FFT_IFFT); + if (fft_plan->n[0] == 1 && fft_plan->n[1] == 1) { + mluOpTensorDescriptor_t input_desc; + status = mluOpCreateTensorDescriptor(&input_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + const int IN_DIM_NUM = 2; + int64_t dims[IN_DIM_NUM] = { + fft_plan->batch * fft_plan->n[0] * fft_plan->n[1], 1}; + int64_t strides[IN_DIM_NUM] = {2, 1}; + status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_FLOAT, IN_DIM_NUM, + dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - status = kernelIRFFT2dButterflyRow(k_dim, k_type, handle->queue, fft_plan, - FFT_IFFT); + + status = mluOpContiguous(handle, input_desc, fft_plan->mlu_addrs.input, + fft_plan->mlu_addrs.output); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + status = mluOpDestroyTensorDescriptor(input_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + } else { + for (int batch_id = 0; batch_id < fft_plan->batch; batch_id++) { + status = kernelIRFFT2dButterflyColumn(k_dim, k_type, handle->queue, + fft_plan, FFT_IFFT); + + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + status = kernelIRFFT2dButterflyRow(k_dim, k_type, handle->queue, + fft_plan, FFT_IFFT); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + + fft_plan->mlu_addrs.input = + (void *)((uint64_t)(fft_plan->mlu_addrs.input) + idist); + fft_plan->mlu_addrs.output = + (void *)((uint64_t)(fft_plan->mlu_addrs.output) + odist); + } 
fft_plan->mlu_addrs.input = - (void *)((uint64_t)(fft_plan->mlu_addrs.input) + idist); + (void *)((uint64_t)(fft_plan->mlu_addrs.input) - + fft_plan->batch * idist); fft_plan->mlu_addrs.output = - (void *)((uint64_t)(fft_plan->mlu_addrs.output) + odist); + (void *)((uint64_t)(fft_plan->mlu_addrs.output) - + fft_plan->batch * odist); } - fft_plan->mlu_addrs.input = (void *)((uint64_t)(fft_plan->mlu_addrs.input) - - fft_plan->batch * idist); - fft_plan->mlu_addrs.output = - (void *)((uint64_t)(fft_plan->mlu_addrs.output) - - fft_plan->batch * odist); + } else if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { + status = computeFFT2dMatMulColumnC2R(handle, fft_plan, scale_factor); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - if (scale_factor != 1.0) { - const float alpha[2] = {scale_factor, 0.0}; - const float beta[2] = {0.0, 0.0}; - mluOpTensorDescriptor_t c_desc = nullptr; - status = mluOpCreateTensorDescriptor(&c_desc); - const int out_dim_num = 3; - int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0], - fft_plan->n[1]}; - status = mluOpSetTensorDescriptor_v2(c_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->output_dtype, 3, dims); - status = mluOpSetTensorDescriptorOnchipDataType( - c_desc, fft_plan->execution_dtype); + status = computeFFT2dMatMulRowC2R(handle, fft_plan, scale_factor); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + } - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, - cnnl_handle); // convert to cnnl_handle + if (scale_factor != 1.0) { + const float alpha[2] = {scale_factor, 0.0}; + const float beta[2] = {0.0, 0.0}; + mluOpTensorDescriptor_t c_desc = nullptr; + status = mluOpCreateTensorDescriptor(&c_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + const int OUT_DIM_NUM = 3; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->n[0], + fft_plan->n[1]}; + status = mluOpSetTensorDescriptor_v2( + c_desc, MLUOP_LAYOUT_ARRAY, fft_plan->output_dtype, OUT_DIM_NUM, dims); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + status = mluOpSetTensorDescriptorOnchipDataType(c_desc, + fft_plan->execution_dtype); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(c_desc, cnnl_output_desc); + DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, + cnnl_handle); // convert to cnnl_handle - CALL_CNNL(cnnlTransform_v2(cnnl_handle, CNNL_POINTER_MODE_HOST, &alpha, - cnnl_output_desc, fft_plan->mlu_addrs.output, - &beta, cnnl_output_desc, - fft_plan->mlu_addrs.output)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(c_desc, cnnl_output_desc); + + CALL_CNNL(cnnlTransform_v2(cnnl_handle, CNNL_POINTER_MODE_HOST, &alpha, + cnnl_output_desc, fft_plan->mlu_addrs.output, + &beta, cnnl_output_desc, + fft_plan->mlu_addrs.output)); + status = mluOpDestroyTensorDescriptor(c_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); + DESTROY_CNNL_HANDLE(cnnl_handle); + } + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) { status = makeIRFFT2dContiguousOutput(handle, fft_plan, output, fft_plan->mlu_addrs.output); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - - } else if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { - status = computeFFT2dMatMulColumnC2R(handle, fft_plan, scale_factor); - INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - - status = computeFFT2dMatMulRowC2R(handle, fft_plan, 
scale_factor); - INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); } return status; } diff --git a/kernels/fft/rfft/rfft_host.cpp b/kernels/fft/rfft/rfft_host.cpp index d0755e8be..9f9c37030 100644 --- a/kernels/fft/rfft/rfft_host.cpp +++ b/kernels/fft/rfft/rfft_host.cpp @@ -434,13 +434,13 @@ static void configureRFFT2dWorkspaceAddrs(mluOpHandle_t handle, size_t out_c_dtype_size = mluOpDataTypeBytes(out_c_dtype); int batch = fft_plan->batch; - int _n0 = fft_plan->n[0]; - int _n1 = fft_plan->n[1]; + int n0_ori = fft_plan->n[0]; + int n1_ori = fft_plan->n[1]; size_t offset = 0; if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { // rr ri ir ii - size_t buffer_size = batch * out_c_dtype_size * _n0 * _n1 * 2; + size_t buffer_size = batch * out_c_dtype_size * n0_ori * n1_ori * 2; fft_plan->mlu_addrs.input = input; fft_plan->mlu_addrs.output = output; fft_plan->mlu_addrs.buffer_in = (uint8_t *)workspace + offset; @@ -451,20 +451,22 @@ static void configureRFFT2dWorkspaceAddrs(mluOpHandle_t handle, if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) { fft_plan->mlu_addrs.buffer_buf = (uint8_t *)workspace + offset; - offset += batch * out_c_dtype_size * _n0 * _n1 * 2; + offset += batch * out_c_dtype_size * n0_ori * n1_ori * 2; - if (fft_plan->is_input_contiguous) { + if ((fft_plan->is_input_contiguous && + fft_plan->inembed[0] <= fft_plan->n[0] && + fft_plan->inembed[1] <= fft_plan->n[1])) { fft_plan->mlu_addrs.input = input; } else { fft_plan->mlu_addrs.input = (uint8_t *)workspace + offset; - offset += batch * out_c_dtype_size * _n0 * _n1; + offset += batch * out_c_dtype_size * n0_ori * n1_ori; } if (fft_plan->is_output_contiguous) { fft_plan->mlu_addrs.output = output; } else { fft_plan->mlu_addrs.output = (uint8_t *)workspace + offset; - offset += batch * out_c_dtype_size * _n0 * _n1; + offset += batch * out_c_dtype_size * n0_ori * n1_ori; } } @@ -707,12 +709,12 @@ static mluOpStatus_t makeRFFT1dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 2; + const int IN_DIM_NUM = 2; if (fft_plan->prime) { - int64_t dims[in_dim_num] = {fft_plan->batch, fft_plan->inembed[0]}; - int64_t strides[in_dim_num] = {fft_plan->idist, fft_plan->istride}; + int64_t dims[IN_DIM_NUM] = {fft_plan->batch, fft_plan->inembed[0]}; + int64_t strides[IN_DIM_NUM] = {fft_plan->idist, fft_plan->istride}; status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->input_dtype, in_dim_num, + fft_plan->input_dtype, IN_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); @@ -724,11 +726,11 @@ static mluOpStatus_t makeRFFT1dContiguousInput(mluOpHandle_t handle, INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); } else { - int64_t dims[in_dim_num] = { + int64_t dims[IN_DIM_NUM] = { fft_plan->batch, std::min(fft_plan->inembed[0], fft_plan->n[0])}; - int64_t strides[in_dim_num] = {fft_plan->idist, fft_plan->istride}; + int64_t strides[IN_DIM_NUM] = {fft_plan->idist, fft_plan->istride}; status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->input_dtype, in_dim_num, + fft_plan->input_dtype, IN_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); @@ -762,15 +764,15 @@ static mluOpStatus_t padRFFT1dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&padded_input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 2; - int64_t dims[in_dim_num] = 
{batch, fft_plan->inembed[0]}; + const int IN_DIM_NUM = 2; + int64_t dims[IN_DIM_NUM] = {batch, fft_plan->inembed[0]}; status = mluOpSetTensorDescriptor_v2(input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, dims); + in_r_dtype, IN_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - int64_t padded_dims[in_dim_num] = {batch, n}; + int64_t padded_dims[IN_DIM_NUM] = {batch, n}; status = mluOpSetTensorDescriptor_v2(padded_input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, padded_dims); + in_r_dtype, IN_DIM_NUM, padded_dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); const int pad_dim_num = 4; @@ -818,16 +820,16 @@ static mluOpStatus_t padRFFT2dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&padded_input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 3; - int64_t dims[in_dim_num] = {batch, std::min(n0, fft_plan->inembed[0]), + const int IN_DIM_NUM = 3; + int64_t dims[IN_DIM_NUM] = {batch, std::min(n0, fft_plan->inembed[0]), std::min(n1, fft_plan->inembed[1])}; status = mluOpSetTensorDescriptor_v2(input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, dims); + in_r_dtype, IN_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - int64_t padded_dims[in_dim_num] = {batch, n0, n1}; + int64_t padded_dims[IN_DIM_NUM] = {batch, n0, n1}; status = mluOpSetTensorDescriptor_v2(padded_input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, padded_dims); + in_r_dtype, IN_DIM_NUM, padded_dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); const int pad_dim_num = 6; @@ -1083,17 +1085,17 @@ static mluOpStatus_t makeRFFT1dContiguousOutput(mluOpHandle_t handle, INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // set up tensor desc - const int out_dim_num = 2; - int64_t dims[out_dim_num] = { + const int OUT_DIM_NUM = 2; + int64_t dims[OUT_DIM_NUM] = { fft_plan->batch, (fft_plan->prime) ? 
fft_plan->onembed[0] : (fft_plan->n[0] / 2 + 1)}; - int64_t strides[out_dim_num] = {fft_plan->odist, fft_plan->ostride}; + int64_t strides[OUT_DIM_NUM] = {fft_plan->odist, fft_plan->ostride}; status = mluOpSetTensorDescriptor_v2(copy_src_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims); + out_c_dtype, OUT_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorEx_v2(copy_dst_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims, strides); + out_c_dtype, OUT_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // copy @@ -1109,17 +1111,8 @@ static mluOpStatus_t makeRFFT1dContiguousOutput(mluOpHandle_t handle, DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(copy_dst_desc, cnnl_copy_dst_desc); - size_t workspace_size = 0; - CALL_CNNL(cnnlGetCopyWorkspaceSize(cnnl_handle, cnnl_copy_src_desc, - cnnl_copy_dst_desc, &workspace_size)); - - void *workspace = nullptr; - if (workspace_size > 0) { - CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); - } CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_copy_src_desc, copy_src_addr, - cnnl_copy_dst_desc, output, workspace, - workspace_size)); + cnnl_copy_dst_desc, output, NULL, 0)); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_src_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_dst_desc); @@ -1138,26 +1131,26 @@ static mluOpStatus_t makeRFFT2dContiguousInput(mluOpHandle_t handle, auto status = MLUOP_STATUS_SUCCESS; if ((!fft_plan->is_input_contiguous || (fft_plan->inembed[0] > fft_plan->n[0] || - fft_plan->inembed[1] > fft_plan->n[1]) && - !fft_plan->prime) && + fft_plan->inembed[1] > fft_plan->n[1])) && fft_plan->fft_strategy != CNFFT_FUNC_MANY_DIST1_2D) { VLOG(5) << "launch mluOpContiguous for rfft2d input"; mluOpTensorDescriptor_t input_desc; status = mluOpCreateTensorDescriptor(&input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 3; - int64_t dims[in_dim_num] = { + const int IN_DIM_NUM = 3; + int64_t dims[IN_DIM_NUM] = { fft_plan->batch, fft_plan->n[0] > fft_plan->inembed[0] ? fft_plan->inembed[0] : fft_plan->n[0], fft_plan->n[1] > fft_plan->inembed[1] ? 
fft_plan->inembed[1] : fft_plan->n[1]}; - int64_t strides[in_dim_num] = {fft_plan->idist, - (fft_plan->istride * fft_plan->inembed[1]), - fft_plan->istride}; + int64_t strides[IN_DIM_NUM]; // IN_DIM_NUM + for (int i = 0; i < IN_DIM_NUM; i++) { + strides[i] = fft_plan->in_stride[i]; + } status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->input_dtype, in_dim_num, + fft_plan->input_dtype, IN_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); @@ -1188,18 +1181,19 @@ static mluOpStatus_t makeRFFT2dContiguousOutput(mluOpHandle_t handle, INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // set up tensor desc - const int out_dim_num = 3; - int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0], + const int OUT_DIM_NUM = 3; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->n[0], fft_plan->n[1] / 2 + 1}; - int64_t strides[out_dim_num] = {fft_plan->odist, - fft_plan->ostride * fft_plan->onembed[1], - fft_plan->ostride}; + int64_t strides[OUT_DIM_NUM]; // OUT_DIM_NUM + for (int i = 0; i < OUT_DIM_NUM; i++) { + strides[i] = fft_plan->out_stride[i]; + } status = mluOpSetTensorDescriptor_v2(copy_src_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims); + out_c_dtype, OUT_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorEx_v2(copy_dst_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims, strides); + out_c_dtype, OUT_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); void *copy_src_addr = fft_plan->mlu_addrs.output; @@ -1211,17 +1205,8 @@ static mluOpStatus_t makeRFFT2dContiguousOutput(mluOpHandle_t handle, DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(copy_dst_desc, cnnl_copy_dst_desc); - size_t workspace_size = 0; - CALL_CNNL(cnnlGetCopyWorkspaceSize(cnnl_handle, cnnl_copy_src_desc, - cnnl_copy_dst_desc, &workspace_size)); - - void *workspace = nullptr; - if (workspace_size > 0) { - CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); - } CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_copy_src_desc, copy_src_addr, - cnnl_copy_dst_desc, output, workspace, - workspace_size)); + cnnl_copy_dst_desc, output, NULL, 0)); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_src_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_dst_desc); @@ -1287,12 +1272,16 @@ mluOpStatus_t execRFFT1d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, const float beta[2] = {0.0, 0.0}; mluOpTensorDescriptor_t c_desc = nullptr; status = mluOpCreateTensorDescriptor(&c_desc); - const int out_dim_num = 2; - int64_t dims[out_dim_num] = {fft_plan->batch, (fft_plan->n[0] / 2 + 1)}; + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + const int OUT_DIM_NUM = 2; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, (fft_plan->n[0] / 2 + 1)}; status = mluOpSetTensorDescriptor_v2(c_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->output_dtype, 2, dims); + fft_plan->output_dtype, OUT_DIM_NUM, + dims); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorOnchipDataType( c_desc, fft_plan->execution_dtype); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); // convert to cnnl_handle @@ -1303,6 +1292,8 @@ mluOpStatus_t execRFFT1d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, cnnl_output_desc, fft_plan->mlu_addrs.output, &beta, cnnl_output_desc, fft_plan->mlu_addrs.output)); + status = mluOpDestroyTensorDescriptor(c_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); 
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); DESTROY_CNNL_HANDLE(cnnl_handle); } @@ -1511,21 +1502,21 @@ mluOpStatus_t computeFFT2dMatMulRowR2C(mluOpHandle_t handle, int requested_algo_count = 1, return_algo_count = 0; float *workspace; size_t workspace_size; - cnnlGetBatchMatMulAlgoHeuristic( + cnnlGetBatchMatMulExAlgoHeuristic( cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL, requested_algo_count, &heuristic_result, &return_algo_count); - cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size); + cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size); if (workspace_size > 0) { CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); } else { CNRT_CHECK(cnrtMalloc((void **)&workspace, m * n * sizeof(float))); } - CALL_CNNL(cnnlBatchMatMulBCast_v2(cnnl_handle, bmm_bcast_desc, algo, &alpha, - cnnl_a_desc, dft_matrix_addr, cnnl_b_desc, - in_addr, &beta, cnnl_c_desc, out_addr, - (void *)workspace, workspace_size)); + CALL_CNNL(cnnlBatchMatMulEx(cnnl_handle, bmm_bcast_desc, algo, &alpha, + cnnl_a_desc, dft_matrix_addr, cnnl_b_desc, + in_addr, &beta, cnnl_c_desc, out_addr, + (void *)workspace, workspace_size)); // destroy cnnl descriptor DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc); @@ -1561,70 +1552,109 @@ mluOpStatus_t execRFFT2d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, status = makeRFFT2dContiguousInput(handle, fft_plan, input); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - if (fft_plan->n[0] > fft_plan->inembed[0] || - fft_plan->n[1] > fft_plan->inembed[1]) { - status = padRFFT2dContiguousInput(handle, fft_plan); + if (fft_plan->n[0] == 1 && fft_plan->n[1] == 1) { + mluOpTensorDescriptor_t input_desc, padded_output_desc; + status = mluOpCreateTensorDescriptor(&input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - - fft_plan->mlu_addrs.input = fft_plan->mlu_addrs.input_pad_addr; - } - - for (int batch_id = 0; batch_id < fft_plan->batch; batch_id++) { - status = kernelRFFT2dButterflyRow(k_dim, k_type, handle->queue, fft_plan, - RFFT); - + status = mluOpCreateTensorDescriptor(&padded_output_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - status = kernelRFFT2dButterflyColumn(k_dim, k_type, handle->queue, - fft_plan, FFT_IFFT); + const int IN_DIM_NUM = 2; + int64_t dims[IN_DIM_NUM] = {fft_plan->batch, + fft_plan->n[0] * fft_plan->n[1]}; + status = mluOpSetTensorDescriptor_v2(input_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_FLOAT, IN_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - fft_plan->mlu_addrs.input = - (void *)((uint64_t)(fft_plan->mlu_addrs.input) + idist); - fft_plan->mlu_addrs.output = - (void *)((uint64_t)(fft_plan->mlu_addrs.output) + odist); - } - fft_plan->mlu_addrs.input = (void *)((uint64_t)(fft_plan->mlu_addrs.input) - - fft_plan->batch * idist); - fft_plan->mlu_addrs.output = - (void *)((uint64_t)(fft_plan->mlu_addrs.output) - - fft_plan->batch * odist); - - if (scale_factor != 1.0) { - const float alpha[2] = {scale_factor, 0.0}; - const float beta[2] = {0.0, 0.0}; - mluOpTensorDescriptor_t c_desc = nullptr; - status = mluOpCreateTensorDescriptor(&c_desc); - const int out_dim_num = 3; - int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0], - fft_plan->n[1] / 2 + 1}; - status = mluOpSetTensorDescriptor_v2(c_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->output_dtype, 3, dims); - status = mluOpSetTensorDescriptorOnchipDataType( - c_desc, fft_plan->execution_dtype); + int64_t 
padded_dims[IN_DIM_NUM] = {fft_plan->batch, + fft_plan->n[0] * fft_plan->n[1] * 2}; + status = mluOpSetTensorDescriptor_v2( + padded_output_desc, MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_FLOAT, IN_DIM_NUM, + padded_dims); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + const int pad_dim_num = 4; + int paddings[pad_dim_num] = {0, 0, 0, 1}; + uint64_t padding_value = 0x00000000; DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); // convert to cnnl_handle - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(c_desc, cnnl_output_desc); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(padded_output_desc, + cnnl_padded_output_desc); + CALL_CNNL(cnnlPad(cnnl_handle, cnnl_input_desc, fft_plan->mlu_addrs.input, + paddings, &padding_value, cnnl_padded_output_desc, + fft_plan->mlu_addrs.output)); + + DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); + DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_padded_output_desc); - CALL_CNNL(cnnlTransform_v2(cnnl_handle, CNNL_POINTER_MODE_HOST, &alpha, - cnnl_output_desc, fft_plan->mlu_addrs.output, - &beta, cnnl_output_desc, - fft_plan->mlu_addrs.output)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); DESTROY_CNNL_HANDLE(cnnl_handle); - } - INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + } else { + for (int batch_id = 0; batch_id < fft_plan->batch; batch_id++) { + status = kernelRFFT2dButterflyRow(k_dim, k_type, handle->queue, + fft_plan, RFFT); - status = makeRFFT2dContiguousOutput(handle, fft_plan, output); - INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + status = kernelRFFT2dButterflyColumn(k_dim, k_type, handle->queue, + fft_plan, FFT_IFFT); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + + fft_plan->mlu_addrs.input = + (void *)((uint64_t)(fft_plan->mlu_addrs.input) + idist); + fft_plan->mlu_addrs.output = + (void *)((uint64_t)(fft_plan->mlu_addrs.output) + odist); + } + fft_plan->mlu_addrs.input = + (void *)((uint64_t)(fft_plan->mlu_addrs.input) - + fft_plan->batch * idist); + fft_plan->mlu_addrs.output = + (void *)((uint64_t)(fft_plan->mlu_addrs.output) - + fft_plan->batch * odist); + } } else if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { status = computeFFT2dMatMulRowR2C(handle, fft_plan, scale_factor); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = computeFFT2dMatMulColumnR2C(handle, fft_plan, scale_factor); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); } + + if (scale_factor != 1.0) { + const float alpha[2] = {scale_factor, 0.0}; + const float beta[2] = {0.0, 0.0}; + mluOpTensorDescriptor_t c_desc = nullptr; + status = mluOpCreateTensorDescriptor(&c_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + const int OUT_DIM_NUM = 3; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->n[0], + fft_plan->n[1] / 2 + 1}; + status = mluOpSetTensorDescriptor_v2( + c_desc, MLUOP_LAYOUT_ARRAY, fft_plan->output_dtype, OUT_DIM_NUM, dims); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + status = mluOpSetTensorDescriptorOnchipDataType(c_desc, + fft_plan->execution_dtype); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + + DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, + cnnl_handle); // convert to cnnl_handle + + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(c_desc, cnnl_output_desc); + + CALL_CNNL(cnnlTransform_v2(cnnl_handle, CNNL_POINTER_MODE_HOST, &alpha, + cnnl_output_desc, fft_plan->mlu_addrs.output, + &beta, cnnl_output_desc, + fft_plan->mlu_addrs.output)); + 
status = mluOpDestroyTensorDescriptor(c_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); + DESTROY_CNNL_HANDLE(cnnl_handle); + } + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + + if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) { + status = makeRFFT2dContiguousOutput(handle, fft_plan, output); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + } return status; } diff --git a/kernels/tensor_stride_process/tensor_stride_process_host.cpp b/kernels/tensor_stride_process/tensor_stride_process_host.cpp index 410112258..bcb9685a6 100644 --- a/kernels/tensor_stride_process/tensor_stride_process_host.cpp +++ b/kernels/tensor_stride_process/tensor_stride_process_host.cpp @@ -484,7 +484,8 @@ mluOpContiguous(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(temp_desc, cnnl_temp_desc); CALL_CNNL( - cnnlCopy(cnnl_handle, cnnl_input_desc, input, cnnl_temp_desc, output)); + cnnlCopy_v2(cnnl_handle, cnnl_input_desc, input, cnnl_temp_desc, output, + NULL, 0)); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_temp_desc); DESTROY_CNNL_HANDLE(cnnl_handle); From 6413b159af36cfed811f223bf0207a46d917d25c Mon Sep 17 00:00:00 2001 From: duzekun Date: Mon, 9 Dec 2024 10:32:19 +0800 Subject: [PATCH 5/7] [Docs](mlu-ops): Update docs for v1.4.1 (#1175) Co-authored-by: duzekun --- .github/workflows/daily.yaml | 4 ++-- .github/workflows/mluops_ci.yaml | 2 +- README.md | 5 +++-- build.property | 4 ++-- docs/api_guide/update.rst | 8 ++++++++ docs/release_notes/mlu_ops.rst | 18 ++++++++++++++++++ docs/user_guide/2_update_history/index.rst | 7 +++++++ .../centos7.5/SPECS/mluops-independent.spec | 4 +++- installer/independent/debian/changelog | 8 +++++++- mlu_op.h | 2 +- 10 files changed, 52 insertions(+), 10 deletions(-) diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index f5b22c50e..c74c11930 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -12,9 +12,9 @@ jobs: strategy: matrix: runner: [mlu370-m8] - mlu_ops_version : [1.4.0] + mlu_ops_version : [1.4.1] cntoolkit_version : [3.15.2] - cnnl_version: [1.27.4] + cnnl_version: [1.28.0] runs-on: ${{matrix.runner}} steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/mluops_ci.yaml b/.github/workflows/mluops_ci.yaml index 5d2a7b5a1..395e074e8 100644 --- a/.github/workflows/mluops_ci.yaml +++ b/.github/workflows/mluops_ci.yaml @@ -39,7 +39,7 @@ jobs: strategy: matrix: runner: [mlu370-m8] - mlu_ops_version : [v1.4.0] + mlu_ops_version : [v1.4.1] runs-on: [yellow] steps: - uses: actions/checkout@v3 diff --git a/README.md b/README.md index a1d8e75e5..9e4cf450c 100644 --- a/README.md +++ b/README.md @@ -49,8 +49,9 @@ MLU-OPS™提供了以下功能: ## 依赖条件 - 操作系统: - - 支持 x86_64 架构下的 Ubuntu20.04、Centos7.6、Centos8.5、Kylin10 - - MLU-OPS™ v1.0.0版本后将不再支持 Ubuntu18.04。Ubuntu22.04系统将在后续的版本提供支持。 + - 支持 x86_64 架构下的 Ubuntu22.04、Centos7.6、Centos8.5、Kylin10 + - MLU-OPS™ v1.0.0版本后将不再支持 Ubuntu18.04。 + - MLU-OPS™ v1.4.1版本后将不再支持 Ubuntu20.04。 - 寒武纪 MLU SDK: - 编译和运行时依赖 CNToolkit v3.15.2 或更高版本,CNNL v1.27.4 或者更高版本 - 寒武纪 MLU 驱动: diff --git a/build.property b/build.property index 3351c140a..faaf3ea93 100644 --- a/build.property +++ b/build.property @@ -1,8 +1,8 @@ { - "version": "1.4.0-1", + "version": "1.4.1-1", "python": "3.6.0", "build_requires": {"cntoolkit": ["release","3.15.2-1"], - 
"cnnl":["release","1.27.4-1"], + "cnnl":["release","1.28.0-1"], "driver": "6.0.3", "eigen3": "3.4.0", "libxml2": "2.9.0", diff --git a/docs/api_guide/update.rst b/docs/api_guide/update.rst index 3c60f6b86..f6afd881c 100755 --- a/docs/api_guide/update.rst +++ b/docs/api_guide/update.rst @@ -3,6 +3,14 @@ Update History This section lists contents that were made for each product release. +* V1.4.1 + + **Date:** December 5, 2024 + + **Changes:** + + - None. + * V1.4.0 **Date:** November 29, 2024 diff --git a/docs/release_notes/mlu_ops.rst b/docs/release_notes/mlu_ops.rst index 90cca642c..9766f0526 100644 --- a/docs/release_notes/mlu_ops.rst +++ b/docs/release_notes/mlu_ops.rst @@ -64,6 +64,24 @@ Cambricon MLU-OPS具有以下特点: +-----------------------------+------------------------+--------------------------------+ +v1.4.1 +----------------- + +特性变更 +~~~~~~~~~~~~~~~~~~~~~ + +- 无。 + +已修复问题 +~~~~~~~~~~~~~~~~~~~~~ + +- 无。 + +已知遗留问题 +~~~~~~~~~~~~~~~~~~~~~ + +- 无。 + v1.4.0 ----------------- diff --git a/docs/user_guide/2_update_history/index.rst b/docs/user_guide/2_update_history/index.rst index 127da7bac..158f05b06 100644 --- a/docs/user_guide/2_update_history/index.rst +++ b/docs/user_guide/2_update_history/index.rst @@ -1,6 +1,13 @@ 更新历史 ======== +* **V1.4.1** + **更新时间**:2024年12月5日 + + **更新内容**: + + - 无算子更新。 + * **V1.4.0** **更新时间**:2024年11月29日 diff --git a/installer/centos7.5/SPECS/mluops-independent.spec b/installer/centos7.5/SPECS/mluops-independent.spec index 92f8a6761..d9360e083 100644 --- a/installer/centos7.5/SPECS/mluops-independent.spec +++ b/installer/centos7.5/SPECS/mluops-independent.spec @@ -5,7 +5,7 @@ Name: mluops Summary: The Machine Lerning Unit OPerators -Version: 1.4.0 +Version: 1.4.1 Release: 1%{?dist} License: Cambricon Release License Vendor: Cambricon Inc. @@ -64,6 +64,8 @@ cp $RPM_SOURCE_DIR/neuware-env.conf $RPM_BUILD_ROOT/etc/ld.so.conf.d/ %postun -p /sbin/ldconfig %changelog +* Thu Dec 5 2024 Cambricon Software Team +- release mluops v1.4.1 * Thu Nov 29 2024 Cambricon Software Team - release mluops v1.4.0 * Mon Oct 21 2024 Cambricon Software Team diff --git a/installer/independent/debian/changelog b/installer/independent/debian/changelog index 67ffb475d..82380f29f 100644 --- a/installer/independent/debian/changelog +++ b/installer/independent/debian/changelog @@ -1,8 +1,14 @@ +mluops (1.4.1-1.ubuntu16.04) xenial; urgency=medium + + * Release mluops v1.4.1 + + -- Cambricon Thu, 5 Dec 2024 00:00:00 +0100 + mluops (1.4.0-1.ubuntu16.04) xenial; urgency=medium * Release mluops v1.4.0 - -- Cambricon Thu, 29 Nov 2024 00:00:00 +0100 + -- Cambricon Fri, 29 Nov 2024 00:00:00 +0100 mluops (1.3.2-1.ubuntu16.04) xenial; urgency=medium diff --git a/mlu_op.h b/mlu_op.h index 5d8115c19..0dae7b1a4 100644 --- a/mlu_op.h +++ b/mlu_op.h @@ -29,7 +29,7 @@ #define MLUOP_MAJOR 1 #define MLUOP_MINOR 4 -#define MLUOP_PATCHLEVEL 0 +#define MLUOP_PATCHLEVEL 1 /********************************************************************************* * MLUOP_VERSION is deprecated and not recommended. To get the version of MLUOP, use * MLUOP_MAJOR, MLUOP_MINOR and MLUOP_PATCHLEVEL. 
From 639cc78773269e3e6e126e71602921b9d1ecf727 Mon Sep 17 00:00:00 2001 From: duzekun Date: Thu, 12 Dec 2024 16:02:54 +0800 Subject: [PATCH 6/7] [Feature](mlu-ops): Support mtp_613 (#1176) --- core/context.cpp | 47 ++++++++++---------- core/context.h | 1 + independent_build.sh | 4 ++ test/mlu_op_gtest/pb_gtest/mlu_op_test_proto | 2 +- 4 files changed, 30 insertions(+), 24 deletions(-) diff --git a/core/context.cpp b/core/context.cpp index dc1cea168..61ea8ba32 100644 --- a/core/context.cpp +++ b/core/context.cpp @@ -29,9 +29,9 @@ #include "core/tool.h" #include "kernels/kernel.h" -#define DEP_CHECK_LOG(level) \ +#define DEP_CHECK_LOG(level) \ mluop::logging::LogMessage(__FILE__, __LINE__, 4, level, "MLU-OPS", true, \ - true, true, true) \ + true, true, true) \ .stream() namespace mluop { @@ -46,27 +46,23 @@ static struct deviceName name_list_table[] = { // case. }; -// once cnrtGetDeviceProperties() update and not use -// device_ordinal, update this funciton. -mluOpDevType_t convertDeviceName(char *name) { - struct deviceName *pName = NULL; - int num = sizeof(name_list_table) / sizeof(struct deviceName); - if (CONTEXT_DEVICENAME_LEAST_SIZE > strlen(name)) { - LOG(ERROR) - << "get device name failed. device name too short. device name = " - << name << "\n"; - return MLUOP_UNKNOWN_DEVICE; - } - for (int i = 0; i < num; i++) { - pName = &name_list_table[i]; - if (0 == strncmp(pName->name, name, strlen(pName->name)) || - (i == num - 1 && - 0 >= strncmp(pName->name, name, CONTEXT_DEVICENAME_LEAST_SIZE))) { - return pName->type; +mluOpDevType_t convertDeviceNameFromInt(int device_code) { + switch (device_code) { + case 372: { + return MLUOP_MLU370; + break; + } + case 592: { + return MLUOP_MLU590; + break; + } + case 613: { + return MLUOP_MTP613; + break; } + default: + break; } - LOG(ERROR) << "get device name failed. return unknown device. device name = " - << name << "\n"; return MLUOP_UNKNOWN_DEVICE; } } // namespace mluop @@ -179,6 +175,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpCreate(mluOpHandle_t *handle) { int32_t persisting_l2cache_maxsize = 0; double memory_band_width = 0; char device_name[CONTEXT_DEVICENAME_BUFFER_SIZE] = ""; + int device_code = 0; mluOpContext *ctx = new (std::nothrow) mluOpContext(); CNcontext drv_ctx; CNctxConfigParam ctx_conf_param; @@ -246,6 +243,11 @@ mluOpStatus_t MLUOP_WIN_API mluOpCreate(mluOpHandle_t *handle) { cnDeviceGetAttribute(&persisting_l2cache_maxsize, CN_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE, mlu_dev)); + INTERNAL_CHECK( + "[mluOpCreate]", + CN_SUCCESS == cnDeviceGetAttribute(&device_code, + CN_DEVICE_ATTRIBUTE_MLU_ISA_VERSION, + mlu_dev)); INTERNAL_CHECK( "[mluOpCreate]", CN_SUCCESS == cnDeviceGetName(device_name, CONTEXT_DEVICENAME_BUFFER_SIZE, @@ -266,8 +268,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpCreate(mluOpHandle_t *handle) { } ctx->capability_job_limit = (int32_t)ctx_conf_param.unionLimit; - ctx->arch = mluop::convertDeviceName( - device_name); // warning: possible return unknown. 
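// ---- editor's note: illustrative sketch, not part of the patch -------------
// The string-prefix lookup removed above is replaced below by an integer
// switch on CN_DEVICE_ATTRIBUTE_MLU_ISA_VERSION, which cannot fail on short
// or renamed device-name strings. A standalone mirror of the new mapping
// (enum values match the constants declared in core/context.h):
enum ArchSketch {
  ARCH_UNKNOWN = 0,
  ARCH_MLU370 = 372,
  ARCH_MLU590 = 592,
  ARCH_MTP613 = 613,
};

static ArchSketch archFromIsaCode(int device_code) {
  switch (device_code) {
    case 372: return ARCH_MLU370;
    case 592: return ARCH_MLU590;
    case 613: return ARCH_MTP613;   // newly supported by this patch
    default:  return ARCH_UNKNOWN;  // keep an explicit unknown fallback
  }
}
// ----------------------------------------------------------------------------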
+ ctx->arch = mluop::convertDeviceNameFromInt(device_code); ctx->sram_size = sram_size - REM_FOR_STACK; strncpy(ctx->device_name, device_name, sizeof(device_name)); diff --git a/core/context.h b/core/context.h index ab9fa9cae..e30fa94d9 100644 --- a/core/context.h +++ b/core/context.h @@ -57,6 +57,7 @@ typedef enum { MLUOP_MLU270 = 270, MLUOP_MLU370 = 372, MLUOP_MLU590 = 592, + MLUOP_MTP613 = 613, MLUOP_MLU290 = 290, } mluOpDevType_t; diff --git a/independent_build.sh b/independent_build.sh index 3676e8213..a601c5577 100755 --- a/independent_build.sh +++ b/independent_build.sh @@ -51,6 +51,7 @@ long_args=( help mlu370 # mlu arch mlu590 + mtp613 no_prepare perf prepare @@ -68,6 +69,9 @@ add_mlu_arch_support () { --mlu590) bang_arch="mtp_592;" ;; + --mtp613) + bang_arch="mtp_613;" + ;; *) ;; esac diff --git a/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto b/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto index 55d028c4b..ce9149b87 160000 --- a/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto +++ b/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto @@ -1 +1 @@ -Subproject commit 55d028c4b2c79d594c1b6cfb04e60ec646c93bd8 +Subproject commit ce9149b87135a21eeac1df2f2d34219af3a0f41b From acbe8c2043389126e577303167374f7b494d3566 Mon Sep 17 00:00:00 2001 From: chqy99 <141810829+chqy99@users.noreply.github.com> Date: Thu, 12 Dec 2024 16:51:51 +0800 Subject: [PATCH 7/7] [Fix](mluOpRoiAlignRotatedForward): fix race_mem error (#1178) --- .../roi_align_rotated_forward_vector.md | 17 +++++++++-------- .../roi_align_rotated_forward_vector.mlu | 2 ++ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/docs/design_docs/roi_align_rotated/roi_align_rotated_forward_vector.md b/docs/design_docs/roi_align_rotated/roi_align_rotated_forward_vector.md index 233aefd2f..b7283cc83 100644 --- a/docs/design_docs/roi_align_rotated/roi_align_rotated_forward_vector.md +++ b/docs/design_docs/roi_align_rotated/roi_align_rotated_forward_vector.md @@ -333,6 +333,8 @@ __mlu_func__ void bilinearInterpolatePosWeight( w3[i] += w3[j]; w4[i] += w4[j]; w1[j] = -1; + } else { + break; } } if (unique_num != i) { @@ -386,14 +388,13 @@ bin_hw_order_num = bin_order_num ^ 2。
| pos4 | sizeof(uint) * bin_hw_order_num | pos4 坐标 | -剩余空间对齐均分为三份 vi, vi_t, val,记空间大小为 max_v_size。
-其中 vi 复用多次,最终的 val_sum 也存储于 vi 中。
-此时 max_once_c = max_v_size / unique_num / sizeof(T)。
+剩余空间对齐均分为两份 val, v_t,记空间大小为 max_v_size。
+此时 max_once_c = max_v_size / 4 / unique_num / sizeof(T)。
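+Editor's note: a tiny host-side check of the revised formula against the float
+figures quoted just below; the factor of 4 beyond sizeof(float) presumably
+corresponds to the four bilinear neighbor points gathered per position (see
+item 5 in 3.4). All constants are taken from this doc:
+
+  #include <cstdio>
+  int main() {
+    auto max_once_c = [](long max_v_size, long unique_num) {
+      // matches the doc's max_v_size / 4 / unique_num / sizeof(float)
+      return max_v_size / 4 / unique_num / 4;
+    };
+    std::printf("%ld\n", max_once_c(169920, 1024));  // -> 10
+    std::printf("%ld\n", max_once_c(192960, 64));    // -> 188
+    return 0;
+  }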
以float 类型为例: -- 若 bin_order_num 为 32,固定的 size 为 53376, max_vi_size 为 113280。 -unique_num 最大可到 bin_hw_order_num(1024),此时 max_once_c = 27。 -- 若 bin_order_num 为 8,固定的 size 为 7296, max_vi_size 为 128640。 -unique_num 最大可到 bin_hw_order_num(64),此时 max_once_c = 502。 +- 若 bin_order_num 为 32,固定的 size 为 53376, max_vi_size 为 169920。 +unique_num 最大可到 bin_hw_order_num(1024),此时 max_once_c = 10。 +- 若 bin_order_num 为 8,固定的 size 为 7296, max_vi_size 为 192960。 +unique_num 最大可到 bin_hw_order_num(64),此时 max_once_c = 188。 ### 3.4 性能优化设计 @@ -401,7 +402,7 @@ unique_num 最大可到 bin_hw_order_num(64),此时 max_once_c = 502。 2.减少重复计算,例如:roi_info 计算,bin_h、bin_w 二维序列构建等。 3.使用 fuse.nram 融合三条以上的乘加法。 4.双线性插值坐标进行查重,减少 IO 的数量。 - +5.将周围四个点坐标搬运成连续向量,gather时一次性处理,在有效点较少时能提升 IO 效率。 ### 3.5 可维护性设计 diff --git a/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu b/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu index e8c545e04..5cc589956 100644 --- a/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu +++ b/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu @@ -390,6 +390,7 @@ __mlu_global__ void roiAlignRotatedForward( if (params.sample_ratio < bin_order_num) { construct_order = false; // construct bin_w_idx in bin_loop + __sync(); __memcpy_async(bin_w_order, order, params.sample_ratio * sizeof(T), NRAM2NRAM, params.sample_ratio * sizeof(T), 0, params.sample_ratio - 1); @@ -449,6 +450,7 @@ __mlu_global__ void roiAlignRotatedForward( if (construct_order) { // construct bin_w_idx in bin_loop + __sync(); __memcpy_async(bin_w_order, order, deal_bin_w * sizeof(T), NRAM2NRAM, deal_bin_w * sizeof(T), 0, deal_bin_h - 1);
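Editor's closing note on the race fix in PATCH 7/7: __memcpy_async launches an
asynchronous on-chip copy, so without a barrier it can begin overwriting
bin_w_order while earlier pipeline stages still reference that NRAM buffer —
the reported race_mem error. The added __sync() drains outstanding work before
the copy is issued. An illustrative fragment of the before/after pattern, using
only the intrinsics that appear in the hunks above (buffer names as in the
kernel, everything else hypothetical):

  // racy: a previous stage may still be consuming bin_w_order when the
  // asynchronous copy starts rewriting it
  // __memcpy_async(bin_w_order, order, bytes, NRAM2NRAM, bytes, 0, repeat);

  // fixed: serialize the pipeline first, then issue the async copy
  __sync();  // wait for all outstanding compute/IO on this core
  __memcpy_async(bin_w_order, order, bytes, NRAM2NRAM, bytes, 0, repeat);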