From f0e390002237c78df2f533f1c8f1310b6f69df59 Mon Sep 17 00:00:00 2001 From: niyuming Date: Mon, 21 Oct 2024 17:40:14 +0800 Subject: [PATCH] [Fix](mluOpSetFFTReserveArea): fix fft bug --- kernels/fft/common/fft_common_kernels.mlu | 22 +- kernels/utils/common.h | 384 +++++++++++++++++++++- mlu_op.h | 190 ++++++----- 3 files changed, 499 insertions(+), 97 deletions(-) diff --git a/kernels/fft/common/fft_common_kernels.mlu b/kernels/fft/common/fft_common_kernels.mlu index 07bc98f96..8cca3a697 100644 --- a/kernels/fft/common/fft_common_kernels.mlu +++ b/kernels/fft/common/fft_common_kernels.mlu @@ -91,9 +91,8 @@ __mlu_func__ void genSinCosVec(float *src_addr, float *sin_addr, */ __mlu_func__ void genSelectOffsetVec(float *offset_addr, int32_t *offset_int_addr, int deal_size) { - for (int i = 0; i < deal_size; i++) { - offset_int_addr[i] = (int)(offset_addr[i]); - } + __bang_mul_scalar(offset_addr, offset_addr, (float)sizeof(float), deal_size); + __bang_float2int32((int32_t *)offset_int_addr, offset_addr, deal_size, 0); } /* @@ -106,9 +105,16 @@ __mlu_func__ void genSelectOffsetVec(float *offset_addr, */ __mlu_func__ void selectVec(float *src_addr, int32_t *offset_int_addr, float *dst_addr, int deal_size) { +#if __BANG_ARCH__ >= 372 && __BANG_ARCH__ != 520 + __asm__ volatile( + "gather.clean.nram.nram.nram.b32.u32 " + "[%[dst]], [%[src]], [%[offset]], %[data_num];\n\t" ::[dst] "r"(dst_addr), + [src] "r"(src_addr), [offset] "r"(offset_int_addr), [data_num] "r"(deal_size)); +#else for (auto i = 0; i < deal_size; i++) { dst_addr[i] = src_addr[offset_int_addr[i]]; } +#endif } /* @@ -143,7 +149,7 @@ __mlu_func__ void generateRFFTHalfDFTMatrixImpl(int n, void *output) { float *row_addr = temp_addr; // generate 0 to n indices - __mluop_get_indices(inc_addr, (float)0.0, deal_size); + __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f); // generate sin and cos vectors const float scale = -2.0 * M_PI / n; @@ -227,7 +233,7 @@ __mlu_func__ void 
generateRFFTFullDFTMatrixImpl(int row, int n, void *output) { float *row_addr = temp_addr; // generate 0 to n indices - __mluop_get_indices(inc_addr, (float)0.0, deal_size); + __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f); // generate sin and cos vectors const float scale = -2.0 * M_PI / n; @@ -316,7 +322,7 @@ __mlu_func__ void generateIRFFTHalfDFTMatrixImpl(int n, void *output) { float *row_addr = temp_addr; // generate 0 to n indices - __mluop_get_indices(inc_addr, (float)0.0, deal_size); + __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f); // generate sin and cos coefficient vectors __bang_write_value((float *)cos_coeff_addr, deal_size, (float)2.0); @@ -411,7 +417,7 @@ __mlu_func__ void generateIRFFTFullDFTMatrixImpl(int n, void *output) { float *row_addr = temp_addr; // generate 0 to n indices - __mluop_get_indices(inc_addr, (float)0.0, deal_size); + __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f); // generate sin and cos vectors const float scale = 2.0 * M_PI / n; @@ -507,7 +513,7 @@ __mlu_func__ void generateC2CFFTDFTMatrixImpl(int n, void *output) { float *row_addr = temp_addr; // generate 0 to n indices - __mluop_get_indices(inc_addr, (float)0.0, deal_size); + __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f); // generate sin and cos vectors const float forward_scale = -2.0 * M_PI / n; diff --git a/kernels/utils/common.h b/kernels/utils/common.h index b6fe72bbf..06fe84e6a 100644 --- a/kernels/utils/common.h +++ b/kernels/utils/common.h @@ -25,6 +25,7 @@ #ifndef KERNELS_UTILS_COMMON_H_ #define KERNELS_UTILS_COMMON_H_ +#include #include #include "float.h" @@ -74,9 +75,7 @@ __mlu_func__ void __mluop_float2half(half *dst, float *src, int src_count) { __bang_float2half_rn(dst, src, src_count); } -__mlu_func__ half __mluop_float2half(float a) { - return __float2half_rn(a); -} +__mlu_func__ half __mluop_float2half(float a) { return __float2half_rn(a); } 
/******************************************************************************  * MLUOP FUNC: __mluop_div @@ -300,6 +299,176 @@ __mlu_func__ void __mluop_recursive_sum_pool(T *dst, int low_dim, int high_dim, return; } +/***************************************************************************** + * MLUOPS FUNC: __mluop_int322float + * param 'dst' is the destination pointer in NRAM, same memory space as src + * required in NRAM + * param 'dst_addition' is the addition workspace of dst, requiring the same + * amount of space as dst in NRAM + * param 'src' is the source pointer in NRAM + * param 'src_addition' is the addition workspace of src, requiring only 128B + * space in NRAM + * param 'src_count' is the src element count + * Notes: + * the sapces pointed by dst and src can not overlap + * src_count*sizeof(float) should be divisible by 128 + * src input must be in range of [-2^23, 2^23-1] for MLU270 and MLU290 + *****************************************************************************/ +__mlu_func__ void __mluop_int322float(float *dst, float *dst_addition, + int32_t *src, float *src_addition, + int32_t src_count) { +#if __BANG_ARCH__ >= 300 + __bang_int322float((float *)dst, (int32_t *)src, src_count, 0); +#else + // get sign bit + int32_t seg_elem_count = 32; // 128/sizeof(float) = 32 + int32_t float_size = 4; // sizeof(float) = 4 + int32_t align_128 = 128; + float move_23bit = 8388608.0; + // 0x80000000 = 1,000000000,0000000000000000000000000000 + __bang_write_value((unsigned *)src_addition, seg_elem_count, + (unsigned)0x80000000); + __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition, + src_count * float_size, align_128); + // get 1 or 0 from sign bit + // judge is Odd + __bang_write_value((unsigned *)src_addition, seg_elem_count, + (unsigned)0x00000001); + __bang_cycle_bor((char *)dst_addition, (char *)dst_addition, + (char *)src_addition, src_count * float_size, align_128); + __bang_write_value((unsigned *)src_addition, 
seg_elem_count, + (unsigned)0x80000001); + __bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count, + seg_elem_count); + // minus xor, positive num invariant + __bang_write_value((unsigned *)src_addition, seg_elem_count, + (unsigned)0xffffffff); + __bang_cycle_mul(dst, dst_addition, src_addition, src_count, seg_elem_count); + __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * float_size); + // convert int32 to float32 + __bang_write_value((unsigned *)src_addition, seg_elem_count, + (unsigned)0x7fffff); + __bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition, + src_count * float_size, align_128); + __bang_write_value((unsigned *)src_addition, seg_elem_count, + (unsigned)0x4b000000); + __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition, + src_count * float_size, align_128); + __bang_sub_scalar(dst, dst, move_23bit, src_count); + // add one + __bang_add(dst, dst, dst_addition, src_count); + // set sign for float32 + __bang_write_value((unsigned *)src_addition, seg_elem_count, + (unsigned)0xffffffff); + __bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count, + seg_elem_count); + + // fix on MLU300 + __bang_write_value((unsigned *)src_addition, seg_elem_count, + (unsigned)0x00000001); + __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count, + seg_elem_count); + // end fix + + __bang_write_value((unsigned *)src_addition, seg_elem_count, + (unsigned)0x80000000); + __bang_cycle_band((char *)dst_addition, (char *)dst_addition, + (char *)src_addition, src_count * float_size, align_128); + __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, + src_count * float_size); +#endif +} + +/***************************************************************************** + * MLUOPS FUNC: __mluop_float2int32 + * param 'dst' is the destination pointer in NRAM, same memory space as src + * required in NRAM + * param 'dst_addition' is the addition workspace of dst, requiring the same + * amount of space as dst 
in NRAM + * param 'src' is the source pointer in NRAM + * param 'src_addition' is the addition workspace of src, requiring only 128B + * space in NRAM + * param 'src_count' is the src element count + * Notes: + * the sapces pointed by dst and src can not overlap + * src_count*sizeof(float) should be divisible by 128 + * src input must be in range of [-2^23, 2^23-1] for MLU270 and MLU290 + *****************************************************************************/ +__mlu_func__ void __mluop_float2int32(int32_t *dst, float *dst_addition, + float *src, float *src_addition, + int32_t src_count) { +#if __BANG_ARCH__ >= 372 + __bang_float2int32_tz((int32_t *)dst, (float *)src, src_count, 0); +#else + // sign ===> src_addition + // dst=-1.0 : when src[i] is a negative number + // dst=+1.0 : when src[i] is a positive number + int32_t floatDchar = sizeof(float) / sizeof(char); + __bang_active_sign((float *)dst, src, src_count); + // dst_addition = abs(src) + __bang_mul(dst_addition, src, (float *)dst, src_count); + // if dst_addition < 1.0, then src_addition + 1. 
to fix add error + __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 1.0f); + __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xbf800000); + // set negative flag -1.0 = 0xbf80000 + __bang_cycle_eq( + (float *)dst, (float *)dst, (float *)src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); // to mask all src in [x < -1.0] + __bang_active_abs(dst_addition, src, src_count); + __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 8388608.0f); + // mask shift move 23 + __bang_cycle_add_tz( + dst_addition, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); // right shift move 23bit + // dst=1.0, when src < -1.0 + // dst=0.0, when src >=-1.0 + __bang_sub(dst_addition, dst_addition, (float *)dst, src_count); + // to fix max value + __bang_mul_scalar((float *)dst, (float *)dst, 16777215.0, src_count); + __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst, + src_count * floatDchar); + // get log 23bit + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + (unsigned)0x007fffff); + // mask low 23bit is 1 + __bang_cycle_band((char *)dst_addition, (char *)dst_addition, + (char *)src_addition, src_count * floatDchar, + NFU_ALIGN_SIZE / sizeof(char)); + + __bang_write_value(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000); + __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + // src or dst_addition + __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition, + src_count * floatDchar); + __bang_mul_scalar((float *)dst, (float *)dst, -2.0, src_count); + __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, + src_count * floatDchar); +#endif +} + +__mlu_func__ void 
pvLock() { +#if __BANG_ARCH__ == 270 + if (__is_ipu()) { + __bang_lock(0, 0); + } +#endif +} + +__mlu_func__ void pvUnlock() { +#if __BANG_ARCH__ == 270 + if (__is_ipu()) { + __bang_unlock(0, 0); + } +#endif +} + /****************************************************************************** * MLUOPS FUNC: __mluop_load_str_2D * param 'size' is the getC size. @@ -488,4 +657,213 @@ __mlu_vector__ void __mluop_get_indices(float *dst, float start_index, } } +template +__mlu_func__ void __mlu_op_arange_base_(T *dst_nram, uint32_t numel, + T start_index, T step) { + for (uint32_t i = 0; i < numel; i++) { + dst_nram[i] = start_index + i * step; + } +} + +#define MLUOP_ARANGE_VV_IMPL(VVType, vv_num, dst_nram, start_index, step) \ + do { \ + VVType vv_index[8]; \ + __vv_index(vv_index[0], start_index, step); \ + __vv_add(vv_index[1], vv_index[0], 1 * vv_num * step); \ + __vv_add(vv_index[2], vv_index[0], 2 * vv_num * step); \ + __vv_add(vv_index[3], vv_index[0], 3 * vv_num * step); \ + __vv_add(vv_index[4], vv_index[0], 4 * vv_num * step); \ + __vv_add(vv_index[5], vv_index[0], 5 * vv_num * step); \ + __vv_add(vv_index[6], vv_index[0], 6 * vv_num * step); \ + __vv_add(vv_index[7], vv_index[0], 7 * vv_num * step); \ + __vv_store(dst_nram, vv_index[0], vv_num); \ + __vv_store(dst_nram + vv_num, vv_index[1], vv_num); \ + __vv_store(dst_nram + 2 * vv_num, vv_index[2], vv_num); \ + __vv_store(dst_nram + 3 * vv_num, vv_index[3], vv_num); \ + __vv_store(dst_nram + 4 * vv_num, vv_index[4], vv_num); \ + __vv_store(dst_nram + 5 * vv_num, vv_index[5], vv_num); \ + __vv_store(dst_nram + 6 * vv_num, vv_index[6], vv_num); \ + __vv_store(dst_nram + 7 * vv_num, vv_index[7], vv_num); \ + } while (false) + +template +__mlu_vector__ void __mlu_op_arange_vv_(T *dst_nram, T start_index, T step) { +#if 592 < _BANG_ARCH_ + static_assert( + (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value), + "__mlu_op_arange_vv type error!"); +#else // #if 592 < 
_BANG_ARCH_ + static_assert( + (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value), + "__mlu_op_arange_vv type error!"); +#endif + + const uint32_t vv_num = __vv_get_length() / sizeof(T); + +#if _BANG_ARCH_ <= 592 + if (std::is_same::value) { + MLUOP_ARANGE_VV_IMPL(vv_uint16, vv_num, dst_nram, start_index, step); + } else if (std::is_same::value) { + MLUOP_ARANGE_VV_IMPL(vv_int16, vv_num, dst_nram, start_index, step); + } else if (std::is_same::value) { + MLUOP_ARANGE_VV_IMPL(vv_uint32, vv_num, dst_nram, start_index, step); + } else if (std::is_same::value) { + MLUOP_ARANGE_VV_IMPL(vv_int32, vv_num, dst_nram, start_index, step); + } +#endif // if _BANG_ARCH_ <= 592 + if (std::is_same::value) { + MLUOP_ARANGE_VV_IMPL(vv_uint16, vv_num, dst_nram, start_index, step); + } else if (std::is_same::value) { + MLUOP_ARANGE_VV_IMPL(vv_int16, vv_num, dst_nram, start_index, step); + } else if (std::is_same::value) { + MLUOP_ARANGE_VV_IMPL(vv_float, vv_num, dst_nram, start_index, step); + } else if (std::is_same::value) { + MLUOP_ARANGE_VV_IMPL(vv_half, vv_num, dst_nram, start_index, step); + } + return; +} + +#if 592 < _BANG_ARCH_ +template +__mlu_func__ void __mlu_op_gen_integer_incr_seq_(T *dst_nram, + uint32_t elem_count, + T start = 0, T step = 1) { + static_assert( + (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same), + "__mlu_op_gen_integer_incr_seq type error!"); + if (std::is_same::value) { + __bang_incseq(reinterpret_cast(dst_nram), elem_count); + } else if (std::is_same::value) { + __bang_incseq(reinterpret_cast(dst_nram), elem_count); + } else { + __bang_incseq(dst_nram, elem_count); + } + + if (start != 0) { + if (std::is_same::value || std::is_same::value) { + if (step != 1) { + __bang_mul_scalar(dst_nram, dst_nram, step, elem_count); + } + __bang_add_scalar(dst_nram, dst_nram, start, elem_count); + } else { + __bang_fusion(FUSION_FMA, 
dst_nram, dst_nram, step, start, elem_count); + } + } +} +#endif // if 592 < _BANG_ARCH_ + +#define u32_sizeof(T) ((uint32_t)sizeof(T)) + +template +__mlu_func__ void __mlu_op_arange_by_expand_(T *dst_nram, uint32_t numel, + T start_index = 0, T step = 1) { +#if 592 < _BANG_ARCH_ + static_assert( + (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value), + "__mlu_op_arange_by_expand type error!"); +#else // if 592 < _BANG_ARCH_ + static_assert( + (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value), + "__mlu_op_arange_by_expand type error!"); +#endif // if 592 < _BANG_ARCH_ + + // using AluGenSize = std::integral_constant; + using GuGenSize = std::integral_constant; + uint32_t gu_gen_num = GuGenSize::value / u32_sizeof(T); + uint32_t alu_gen_num = NFU_ALIGN_SIZE / u32_sizeof(T); + uint32_t base_num = alu_gen_num; +#if _BANG_ARCH_ <= 592 + if (std::is_same::value || std::is_same::value) { + const uint32_t prologue_num = std::min(numel, base_num); + __mlu_op_arange_base_(dst_nram, prologue_num, start_index, step); + + if (numel <= base_num) { + return; + } + } else { + if (numel <= gu_gen_num) { + const uint32_t prologue_num = std::min(numel, base_num); + __mlu_op_arange_base_(dst_nram, prologue_num, start_index, step); + + if (numel <= base_num) { + return; + } + } else { + __mlu_op_arange_vv_(dst_nram, start_index, step); + base_num = gu_gen_num; + } + } +#else + if (numel <= gu_gen_num) { + const uint32_t prologue_num = std::min(numel, base_num); + __mlu_op_arange_base_(dst_nram, prologue_num, start_index, step); + + if (numel <= base_num) { + return; + } + } else { + __mlu_op_arange_vv_(dst_nram, start_index, step); + base_num = gu_gen_num; + } +#endif + // base_num = 2^exp + uint32_t exp = 0; + asm volatile("findlast1.gpr.b32 %[dst], %[src];\n\t" + : [ dst ] "+&r"(exp) + : [ src ] 
"r"(base_num)); + // numel = count * base_num + remain + const uint32_t segnum = numel >> exp; + // count = 2^repeat + uint32_t repeat = 0; + asm volatile("findlast1.gpr.b32 %[dst], %[src];\n\t" + : [ dst ] "+&r"(repeat) + : [ src ] "r"(segnum)); + uint32_t count = 1; + for (uint32_t i = 0; i < repeat; ++i) { + __bang_add_scalar(dst_nram + count * base_num, dst_nram, + count * base_num * step, count * base_num); + count *= 2; + } + + const uint32_t remain = numel - count * base_num; + if (0 < remain) { + __bang_add_scalar(dst_nram + count * base_num, dst_nram, + count * base_num * step, remain); + } +} +/*************************************************************************** + + CNNL FUNC: __mlu_op_gen_stage_index. + param "dst_nram" is a nram pointer to the generated result. + param "numel" is the element number of to be generated. + param "start_index" is the starting value for the set of points. Default: 0. + param "step" is the gap between each pair of adjacent points points. + Default: 1. dst_addition. remarks: Detailed introduction for reference + http://wiki.cambricon.com/pages/viewpage.action?pageId=119467501. + int64_t and uint64_t types are under-optimized and can be improved with GU. 
+ *************************************************************************/ + +template +__mlu_func__ void __mlu_op_gen_stage_index(T *dst_nram, uint32_t numel, + T start_index = 0, T step = 1) { +#if 592 < _BANG_ARCH_ + if (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value) { + __mlu_op_gen_integer_incr_seq_(dst_nram, numel, start_index, step); + } else { + __mlu_op_arange_by_expand_(dst_nram, numel, start_index, step); + } +#else + __mlu_op_arange_by_expand_(dst_nram, numel, start_index, step); +#endif +} + #endif // KERNELS_UTILS_COMMON_H_ diff --git a/mlu_op.h b/mlu_op.h index 26d61ce62..b6345ce70 100644 --- a/mlu_op.h +++ b/mlu_op.h @@ -362,7 +362,7 @@ typedef enum { typedef enum { MLUOP_REDUCE_DSUM = 0, /*!< Computes the sum value. */ MLUOP_REDUCE_DMEAN = 1, /*!< Computes the mean value. */ - MLUOP_REDUCE_DMAX = 2, /*!< Computes the maximun value. */ + MLUOP_REDUCE_DMAX = 2, /*!< Computes the maximum value. */ } mluOpReduceMode_t; /*! @@ -802,7 +802,7 @@ mluOpGetQuantizeRoundMode(mluOpHandle_t handle, mluOpQuantizeRoundMode_t *round_ /*! * @brief Updates the specific atomics mode of MLU-OPS context information that is held by the * \b handle. This function should be called if you want to change the atomics mode that is - * used to cumulate the results.For detailed information, see "Cambricon CNDrv Developer Guide". + * used to cumulate the results. For detailed information, see "Cambricon CNDrv Developer Guide". * * @param[in] handle * Pointer to a Cambricon MLU-OPS context that is used to manage MLU devices and queues. For detailed @@ -1415,7 +1415,7 @@ mluOpSetTensorDescriptor( * The descriptor of the tensor. For detailed information, * see ::mluOpTensorDescriptor_t. * @param[in] pointer_mode - * The pointer mode of the input tensor. For detailed information, seee ::mluOpPointerMode_t. + * The pointer mode of the input tensor. For detailed information, see ::mluOpPointerMode_t. 
* @par Return * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM * @@ -1569,13 +1569,13 @@ mluOpSetTensorDescriptor_v2(mluOpTensorDescriptor_t desc, * convolution operation. If \b dimNb is set to 4, the output_space should be set in height and width * dimension. If \b dimNb is set to 5, the output_space should be set in depth, height and width dimension. * @param[in] sub_m - * An value that determine the algorithms for sparse convolution. If \b sub_m is set to 0, the + * A value that determine the algorithms for sparse convolution. If \b sub_m is set to 0, the * algorithms will be the default sparse convolution. If \b sub_m is set to 0, the algorithms will be the * submanifold sparse convolution. * @param[in] transpose - * An value that determines transpose. + * A value that determines transpose. * @param[in] inverse - * An value that determines inverse. + * A value that determines inverse. * * @par Return * - ::MLUOP_STATUS_SUCCESS, ::MLUOP_STATUS_BAD_PARAM, ::MLUOP_STATUS_EXECUTION_FAILED @@ -1977,7 +1977,7 @@ mluOpSetTensorDescriptorDim_v2(mluOpTensorDescriptor_t desc, int dimNb, const in * @param[in] desc * The descriptor of the tensor desc. For detailed information, see ::mluOpTensorDescriptor_t. * @param[in] onchip_dtype - * The on-chip data type of the tensor is used in the functon that supports fixed-point + * The on-chip data type of the tensor is used in the function that supports fixed-point * computing. * * @par Return @@ -2829,9 +2829,9 @@ mluOpInitTensorSetMemberDescriptor(mluOpTensorSetDescriptor_t tensorSetDesc, * - None. * * @par Note - * - If the member tensor is in floating-point data type, and you need to call + * - If the member tensor is in floating-point data type, and you need to call * this function. - * - If the member tensor is in fixed-point data type, and you need to call + * - If the member tensor is in fixed-point data type, and you need to call * this function. 
* - Before calling this function, * You need to call ::mluOpCreateTensorSetDescriptor to create @@ -3075,7 +3075,7 @@ mluOpLog(mluOpHandle_t handle, // Group: Carafe /*! * @brief Creates a descriptor pointed by \b carafe_desc for CARAFE upsampling forward and backward operations, - * and allocates memory holding the configuration parameters.The information is defined in ::mluOpCarafeDescriptor_t. + * and allocates memory holding the configuration parameters. The information is defined in ::mluOpCarafeDescriptor_t. * For more information about descriptor, see "Cambricon MLU-OPS User Guide". * * @param[in] carafe_desc @@ -3793,7 +3793,7 @@ mluOpGetDynamicPointToVoxelForwardWorkspaceSize(mluOpHandle_t handle, * - The first dimension of \b voxel_num tensor must be equal to \b voxel_feats_desc[0]. * * @par API Dependency - * - Before calling this function to perform unique operater, you need to get + * - Before calling this function to perform unique operator, you need to get * the size of workspace by ::mluOpGetDynamicPointToVoxelForwardWorkspaceSize. * * @par Note @@ -4047,7 +4047,7 @@ mluOpGetGenerateProposalsV2WorkspaceSize_v2(mluOpHandle_t handle, * - None. * * @par Note - * - The operater does not support adaptive NMS. + * - The operator does not support adaptive NMS. * - The attribute `eta` should not be less than 1. * - ``nms_thresh`` should be more than 0. * - On MLU300 series and above: @@ -4377,7 +4377,7 @@ mluOpSetNmsDescriptor(mluOpNmsDescriptor_t nms_desc, * @param[in] nms_desc * The descriptor of the Nms function. For detailed information, see ::mluOpNmsDescriptor_t. * @param[in] boxes_desc - * The descriptor of the tensor \b boxes , including the information of dimension, data type, and + * The descriptor of the tensor \b boxes, including the information of dimension, data type, and * layout of input boxes. For detailed information, see ::mluOpTensorDescriptor_t. * @param[in] boxes * Pointer to the MLU memory that stores the input boxes tensor. 
@@ -4479,7 +4479,7 @@ mluOpNms(mluOpHandle_t handle, * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and * queues in the Nms operation. For detailed information, see ::mluOpHandle_t. * @param[in] boxes_desc - * The descriptor of the tensor \b boxes , which contains dimension, data type, and + * The descriptor of the tensor \b boxes, which contains dimension, data type, and * data layout of input \b boxes. For detailed information, see ::mluOpTensorDescriptor_t. * @param[in] confidence_desc * The descriptor of the tensor \b confidence , which contains dimension, data type, and @@ -4568,7 +4568,7 @@ mluOpGetNmsWorkspaceSize(mluOpHandle_t handle, * @param[in] offset * The prior box center offset. * @param[in] clip - * A bool value whether to clip out-of-boundary boxes. + * A Boolean value whether to clip out-of-boundary boxes. * @param[in] min_max_aspect_ratios_order * If the value is set as True, the \b output prior box is in * the order of [min, max, aspect_ratios]; otherwise the order is @@ -4787,7 +4787,7 @@ mluOpPsRoiPoolForward(mluOpHandle_t handle, // Group: PsRoiPool /*! * @brief Computes the gradients of feature map \b bottom_grad based on the - * inputs \b top_grad , \b rois , and \b mapping_channel to perform the backpropagation + * inputs \b top_grad, \b rois, and \b mapping_channel to perform the backpropagation * of ::mluOpPsRoiPoolForward. * * @param[in] handle @@ -4945,7 +4945,7 @@ mluOpCreateRoiAlignForwardDescriptor(mluOpRoiAlignForwardDescriptor_t *desc); * If \b pool_mode is 1, the average pooling mode is used. * If \b pool_mode is 0, the maximum pooling mode is used. * @param[in] aligned - * A boolean value which determines whether to shift the boxes by 0.5 pixel. If \b aligned + * A Boolean value which determines whether to shift the boxes by 0.5 pixel. If \b aligned * is true, the boxes is shifted by 0.5. If \b aligned is false, the boxes is not shifted. 
* * @par Return @@ -5174,11 +5174,11 @@ mluOpRoiAlignForward_v2(mluOpHandle_t handle, * @param[in] spatial_scale * The spatial scale of each ROI in the output. * @param[in] aligned - * A boolean value which determines whether to shift the ROI by 0.5 pixel. If the + * A Boolean value which determines whether to shift the ROI by 0.5 pixel. If the * value of \b aligned is set to true, the ROI is shifted by 0.5. If the value of \b aligned * is set to false, the ROI is not shifted. * @param[in] clockwise - * A boolean value which determines whether the rotation of ROI is clockwise. + * A Boolean value which determines whether the rotation of ROI is clockwise. * @param[out] output_desc * The descriptor of output, which contains dimension and the layout of output. * @param[out] output @@ -5284,11 +5284,11 @@ mluOpRoiAlignRotatedForward(mluOpHandle_t handle, * @param[in] spatial_scale * The spatial scale of each ROI in the output. * @param[in] aligned - * A boolean value which determines whether to shift the ROI by 0.5 pixel. + * A Boolean value which determines whether to shift the ROI by 0.5 pixel. * If the value of \b aligned is set to true, the ROI is shifted by 0.5. If the value * of \b aligned is set to false, the ROI is not shifted. * @param[in] clockwise - * A boolean value which determines whether the rotation of ROI is clockwise. + * A Boolean value which determines whether the rotation of ROI is clockwise. * @param[in] bottom_grad_desc * The descriptor of the tensor \b bottom_grad. * @param[out] bottom_grad @@ -5843,7 +5843,7 @@ mluOpSqrtBackward(mluOpHandle_t handle, * @param[in] NDim * An integer value which is the second dimension of coors. * @param[in] deterministic - * A bool value whether to invoke the non-deterministic + * A Boolean value whether to invoke the non-deterministic * version of hard-voxelization implementations. Currently, * non-deterministic mode is not supported. 
* @param[in] voxels_desc @@ -5933,11 +5933,11 @@ mluOpGetVoxelizationWorkspaceSize(mluOpHandle_t handle, * in a voxel. * @param[in] max_voxels * An integer value which is the maximum number of voxels this - * function create. + * function creates. * @param[in] NDim * An integer value which is the second dimension of coors. * @param[in] deterministic - * A bool value whether to invoke the non-deterministic + * A Boolean value whether to invoke the non-deterministic * version of hard-voxelization implementations. Currently, * non-deterministic mode is not supported. * @param[in] workspace @@ -6086,7 +6086,7 @@ mluOpVoxelization(mluOpHandle_t handle, * @par Scale Limitation * - The first dimension of x tensor, img_size tensor, boxes tensor and scores * tensor must be the same size. - * - The second dimension (the channel dimension) of x tensor , C should be equal to S * (5 + + * - The second dimension (the channel dimension) of x tensor, C should be equal to S * (5 + * class_num) if \b iou_aware is false, otherwise C should be equal to S * (6 + class_num), * the value S is equal to the anchors tensor size divided by 2. * - The first dimension of anchors tensor should be larger than 0. @@ -6246,7 +6246,7 @@ mluOpVoxelPoolingForward(mluOpHandle_t handle, * IOU (Intersection Over Union) or IOF (Intersection Over Foreground). * The integer 0 represents IOU and 1 represents IOF. * @param[in] aligned - * A boolean value. If it is false, then calculate the IOU[i][j] + * A Boolean value. If it is false, then calculate the IOU[i][j] * or IOF[i][j] between the row i of \b bbox1 and the row j of \b bbox2, * otherwise calculate the IOU[i] or IOFs[i] between the row i of \b bbox1 * and the row i of \b bbox2. 
Significantly, the numbers of rows of \b bbox1 @@ -6398,7 +6398,7 @@ mluOpGetNmsRotatedWorkspaceSize(mluOpHandle_t handle, const mluOpTensorDescripto * * @par Data Type * - By the order of \b boxes - \b scores - \b output, the supported data types of - * \b boxes , \b scores , and \b output tensors are as follows: + * \b boxes, \b scores, and \b output tensors are as follows: * - float - float - int32 * * @par Scale Limitation @@ -6448,9 +6448,9 @@ mluOpNmsRotated(mluOpHandle_t handle, * An integer value which decides to return a result IOU or IOF. * The integer 0 represents IOU and 1 represents IOF. * @param[in] aligned - * A boolean value. If it is false, this operation calculates the IOUs[i][j] or IOFs[i][j] between + * A Boolean value. If it is false, this operation calculates the IOUs[i][j] or IOFs[i][j] between * the row i of \b bbox1 and the row j of \b bbox2, otherwise the IOU[i] or IOF[i] between - * the row i of \b bbox1 and the row i of \b bbox2 are calculated. The number of row of \b bbox1 + * the row i of \b bbox1 and the row i of \b bbox2 are calculated. The number of rows of \b bbox1 * and \b bbox2 must be equal if \b aligned is true. * @param[in] offset * An integer value determines whether to increase the length and the width of the bounding-box by 0 or 1 @@ -6606,12 +6606,12 @@ mluOpBboxOverlaps(mluOpHandle_t handle, * - output tensor: \p MLUOP_LAYOUT_ARRAY * * @par Scale Limitation - * - The dimension of \b features , \b indices , \b weights , and \b output + * - The dimension of \b features, \b indices, \b weights, and \b output * should be equal to 3. - * - The shape[0] of \b features , \b indices , \b weights , and \b output + * - The shape[0] of \b features, \b indices, \b weights, and \b output * should be the same. * - The shape[1] of \b features and \b output should be the same. 
- * - The shape[1] of \b indices , \b weights , and the shape[2] of \b output + * - The shape[1] of \b indices, \b weights, and the shape[2] of \b output * should be the same. * - The shape[2] of \b indices and \b weights should be equal to 3. * @@ -6642,7 +6642,7 @@ mluOpThreeInterpolateForward(mluOpHandle_t handle, // Group: ThreeInterpolate /*! * @brief Computes the gradients of feature map \b grad_features based on the - * inputs \b grad_output , \b indices , and \b weights to perform the backpropagation + * inputs \b grad_output, \b indices, and \b weights to perform the backpropagation * of ::mluOpThreeInterpolateForward. * * @param[in] handle @@ -6865,7 +6865,7 @@ mluOpBallQuery(mluOpHandle_t handle, * ::MLUOP_STATUS_EXECUTION_FAILED * * @par Data Type - * - The supported data types of input tensors \b input, \b target, \b weight , and output + * - The supported data types of input tensors \b input, \b target, \b weight, and output * tensor \b output are as follows: * - input: half, float * - target: int32 @@ -6979,7 +6979,7 @@ mluOpFocalLossSigmoidForward(mluOpHandle_t handle, * ::MLUOP_STATUS_EXECUTION_FAILED * * @par Data Type - * - The supported data types of input tensor \b input, \b target, \b weight , and output + * - The supported data types of input tensor \b input, \b target, \b weight, and output * tensor \b output are as follows: * - input: float, half * - target: int32 @@ -7155,7 +7155,7 @@ mluOpGetMaskedIm2colForwardWorkspaceSize(mluOpHandle_t handle, * - data_col tensor: half, float. * * @par Data Layout - * - The supported data layouts of \b feature , \b mask_h_idx , \b mask_w_idx , and \b data_col are as follows: + * - The supported data layouts of \b feature, \b mask_h_idx, \b mask_w_idx, and \b data_col are as follows: * - feature tensor: \p MLUOP_LAYOUT_NCHW. * - mask_h_idx tensor: \p MLUOP_LAYOUT_ARRAY. * - mask_w_idx tensor: \p MLUOP_LAYOUT_ARRAY. 
@@ -7276,8 +7276,8 @@ mluOpMaskedIm2colForward(mluOpHandle_t handle, * tensor must be the same size and equal to \b samples . * - The second dimension of \b grad_input tensor and \b dispatch tensor must be equal to \b hidden . * - The first dimension of \b dispatch tensor must be equal to the multiplication result of - * the \b capacity and \b num_experts . - * - The value of the input parameters \b samples , \b capacity , \b hidden , and \b num_experts + * the \b capacity and \b num_experts. + * - The value of the input parameters \b samples, \b capacity , \b hidden , and \b num_experts * must be greater than or equal to 0. * - The value range of the input parameter \b indices tensor must be greater than or equal to 0 and less than * \b num_experts. @@ -7289,7 +7289,7 @@ mluOpMaskedIm2colForward(mluOpHandle_t handle, * * @par Note * - This function is only supported on MLU300 series or above platforms. - * - The parameter \b samples , \b capacity , \b hidden , and \b num_experts should not be negative. + * - The parameter \b samples, \b capacity , \b hidden , and \b num_experts should not be negative. * * @par Example * - The example of the function is as follows: @@ -7459,7 +7459,7 @@ mluOpMsDeformAttnBackward(mluOpHandle_t handle, * The descriptor of the tensor \b ans_grad containing dimension, data type, and data layout. * For detailed information, see ::mluOpTensorDescriptor_t. * @param[in] overwrite_ans_grad - * A boolean value indicating whether to overwrite \b ans_grad. + * A Boolean value indicating whether to overwrite \b ans_grad. * @param[out] workspace_size * Pointer to the MLU memory that stores the returned size of the extra workspace in bytes. * @@ -7530,7 +7530,7 @@ mluOpGetMutualInformationBackwardWorkspaceSize(mluOpHandle_t handle, * @param[in] ans_grad * Pointer to the MLU memory that stores the tensor \b ans_grad. * @param[in] overwrite_ans_grad - * A boolean value indicating whether to overwrite \b ans_grad. 
+ * A Boolean value indicating whether to overwrite \b ans_grad. * @param[in] workspace * Pointer to the MLU memory as an extra workspace for the mutual_information_backward operation. * For more information about the workspace, see "Cambricon MLU-OPS User Guide". @@ -7881,7 +7881,7 @@ mluOpGetRoiAwarePool3dForwardWorkspaceSize(mluOpHandle_t handle, * @brief Returns \b argmax, \b pts_idx_of_voxels and \b pooled_features calculated by * this operator. * - * The operator determine the points in each box based on input coordinates. The collection + * The operator determines the points in each box based on input coordinates. The collection * of points in boxes are named as voxels and recorded as \b pts_idx_of_voxels. The operator * also performs max pooling or average pooling on the voxels and results in \b argmax * and \b pooled_features. @@ -8015,7 +8015,7 @@ mluOpRoiawarePool3dForward(mluOpHandle_t handle, * @brief Returns \b argmax, \b pts_idx_of_voxels and \b pooled_features calculated by * this operator. * - * The operator determine the points in each box based on input coordinates. The collection + * The operator determines the points in each box based on input coordinates. The collection * of points in boxes are named as voxels and recorded as \b pts_idx_of_voxels. The operator * also performs max pooling or average pooling on the voxels and results in \b argmax * and \b pooled_features. @@ -8332,7 +8332,7 @@ mluOpRoiAwarePool3dBackward(mluOpHandle_t handle, // Group: Psamask /*! - * @brief Moves the \b x tensor to \b y tensor according to \b h_mask , \b w_mask , and \b psa_type. + * @brief Moves the \b x tensor to \b y tensor according to \b h_mask, \b w_mask, and \b psa_type. * * * @param[in] handle @@ -8407,7 +8407,7 @@ mluOpPsamaskForward(mluOpHandle_t handle, // Group: Psamask /*! * @brief Computes the gradients of input tensor \b dx with the gradients of output tensor \b dy - * according to \b h_mask , \b w_mask , and \b psa_type. 
+ * according to \b h_mask, \b w_mask, and \b psa_type. * * @param[in] handle * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and * queues in the @@ -8481,7 +8481,7 @@ mluOpPsamaskBackward(mluOpHandle_t handle, // Group: SparseConv /*! * @brief Computes the get_indice_pairs operation, then returns the results in the output - * tensor \b out_indices, \b indice_pairs and \b indice_num. * * @param[in] handle * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and queues in the @@ -8525,7 +8525,7 @@ mluOpPsamaskBackward(mluOpHandle_t handle, * @par Data Type * - This function supports the combinations of the following data types for * input tensor \b indices and output tensor \b out_indices, \b indice_pairs and \b indice_num. - * - \b indices , \b out_indices , \b indice_pairs , and \b indice_num data type: int32, int32, int32, int32 + * - \b indices, \b out_indices, \b indice_pairs, and \b indice_num data type: int32, int32, int32, int32 * * @par Data Layout * - None. @@ -8583,8 +8583,8 @@ mluOpGetIndicePairs(mluOpHandle_t handle, * to optimize the get_indice_pairs operation. * * The size of extra workspace is based on the given information of the get_indice_pairs - * operation, including the input tensor descriptor \b sparse_conv_desc , and \b indices_desc, output - * tensor descriptor \b out_indices_desc , \b indice_pairs_desc , and \b indice_num_desc. + * operation, including the input tensor descriptor \b sparse_conv_desc, and \b indices_desc, output + * tensor descriptor \b out_indices_desc, \b indice_pairs_desc, and \b indice_num_desc. 
* * @param[in] handle * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and queues in the @@ -8622,7 +8622,7 @@ mluOpGetIndicePairs(mluOpHandle_t handle, * * @par API Dependency * - You need to call ::mluOpCreateTensorDescriptor and ::mluOpSetTensorDescriptor to create and set - * tensor descriptors \b indices_desc , \b out_indices_desc , \b indice_pairs_desc , and \b indice_num_desc before + * tensor descriptors \b indices_desc, \b out_indices_desc, \b indice_pairs_desc, and \b indice_num_desc before * calling this function. * - You need to call ::mluOpCreateSparseConvolutionDescriptor to create a descriptor, * and call ::mluOpSetSparseConvolutionDescriptor to set the tensor information for @@ -8935,7 +8935,7 @@ mluOpCreateAdamWDescriptor(mluOpAdamWDescriptor_t *adamw_desc); /*! * @brief Initializes the descriptor \b adamw_desc that was previously created with * ::mluOpCreateAdamWDescriptor function, and sets AdamW information - * to the descriptor \b adamw_desc. The information includes \b weight_decay , \b grad_scale + * to the descriptor \b adamw_desc. The information includes \b weight_decay, \b grad_scale * and \b use_nesterov for AdamW operation. * * @param[in] adamw_desc @@ -9119,7 +9119,7 @@ mluOpDeformRoiPoolForward(const mluOpHandle_t handle, // Group: DeformRoiPool /*! * @brief Computes the gradient of input \b grad_input and the gradient of offset \b grad_offset - * based on the gradient of output \b grad_output , input \b input , ROI \b rois , and offset \b offset. + * based on the gradient of output \b grad_output, input \b input, ROI \b rois, and offset \b offset. 
* * @param[in] handle * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and queues in @@ -9286,11 +9286,11 @@ mluOpDeformRoiPoolBackward(const mluOpHandle_t handle, * - boxes tensor: half, float * - output tensor: half, float * - argmax_idx tensor: int32_t - * Note that the data type of \b input , \b boxes , and \b output + * Note that the data type of \b input, \b boxes, and \b output * must be the same. * * @par Data Layout - * - The supported data layout of \b input , \b boxes , \b output , and + * - The supported data layout of \b input, \b boxes, \b output, and * \b argmax_idx are as follows: * - input tensor: \p MLUOP_LAYOUT_NHWC * - boxes tensor: \p MLUOP_LAYOUT_ARRAY @@ -9386,7 +9386,7 @@ mluOpBorderAlignForward(mluOpHandle_t handle, // Group: BorderAlign /*! * @brief Computes the gradient of the input tensor of ::mluOpBorderAlignForward - * according to the output gradient \b grad_output , the maximum pooling index \b + * according to the output gradient \b grad_output, the maximum pooling index \b * argmax_idx and bounding boxes \b boxes . * * @param[in] handle @@ -9407,14 +9407,14 @@ mluOpBorderAlignForward(mluOpHandle_t handle, * Pointer to the MLU memory that stores \b boxes tensors. The shape of \b boxes is * [N, H * W, 4]. * @param[in] argmax_idx_desc - * Descriptor of \b argmax_idx , containing dimension and the layout of \b argmax_idx . + * Descriptor of \b argmax_idx, containing dimension and the layout of \b argmax_idx . * @param[in] argmax_idx * Pointer to the MLU memory that stores the \b argmax_idx tensor, which is the result * of max pooling index. The shape of argmax_idx is [N, K, 4, C]. * @param[in] pool_size * Number of positions sampled over the boxes borders. * @param[in] grad_input_desc - * Descriptor of \b grad_input , containing dimension and the layout of output. + * Descriptor of \b grad_input, containing dimension and the layout of output. 
* @param[out] grad_input * Pointer to the MLU memory that stores the gradient of the input * tensor of ::mluOpBorderAlignForward. The shape of \b grad_input is [N, H, W, 4C], @@ -9431,11 +9431,11 @@ mluOpBorderAlignForward(mluOpHandle_t handle, * - boxes tensor: half, float * - argmax_idx tensor: int32_t * - grad_input tensor: half, float - * Note that the data type of \b grad_output , \b boxes , and \b grad_input + * Note that the data type of \b grad_output, \b boxes, and \b grad_input * must be the same. * * @par Data Layout - * - The supported data layout of \b grad_output , \b boxes , \b argmax_idx and, + * - The supported data layout of \b grad_output, \b boxes, \b argmax_idx and, * \b grad_input are as follows: * - grad_output tensor: \p MLUOP_LAYOUT_NHWC * - boxes tensor: \p MLUOP_LAYOUT_ARRAY @@ -9443,7 +9443,7 @@ mluOpBorderAlignForward(mluOpHandle_t handle, * - grad_input tensor: \p MLUOP_LAYOUT_NHWC * * @par Scale Limitation - * - The \b grad_output , \b argmax_idx and \b grad_input are 4D tensor. + * - The \b grad_output, \b argmax_idx and \b grad_input are 4D tensor. * - The \b boxes is 3D tensor. * - The dims[3] of \b boxes should be equal to 4. * - The shape of \b grad_output and \b argmax_idx must be the same. @@ -9533,9 +9533,9 @@ mluOpBorderAlignBackward(mluOpHandle_t handle, * * The size of extra workspace is based on the given information of the indice * convolution backward data operation, including the input descriptor - * \b input_grad_desc, the filter descriptor \b filter_desc , the indice pairs - * descriptor \b indice_pairs_desc , the output descriptor \b indice_pairs_desc , - * the array \b indice_num , and the scaler \b inverse. For more information + * \b output_grad_desc, the filter descriptor \b filters_desc, the indice pairs + * descriptor \b indice_pairs_desc, the output descriptor \b input_grad_desc, + * the array \b indice_num, and the scalar \b inverse. 
For more information * about the workspace, see "Cambricon MLU-OPS User Guide". * * @param[in] handle @@ -9577,7 +9577,7 @@ mluOpBorderAlignBackward(mluOpHandle_t handle, * @par API Dependency * - This function must be called before ::mluOpIndiceConvolutionBackwardData. * - ::mluOpCreateTensorDescriptor and ::mluOpSetTensorDescriptor - * create and set the tensor descriptor \b output_grad_desc , \b filters_desc , + * create and set the tensor descriptor \b output_grad_desc, \b filters_desc, * \b indice_pairs_desc and \b input_grad_desc before this function is called. * * @par Note @@ -9603,7 +9603,7 @@ mluOpGetIndiceConvolutionBackwardDataWorkspaceSize(mluOpHandle_t handle, /*! * @brief Performs the back propagation of an indice convolution operation to * compute the gradient of input \b input_grad based on the gradient of response - * \b output_grad , the filter tensor \b filter , the indice tensor \b indice_pairs , + * \b output_grad, the filter tensor \b filter, the indice tensor \b indice_pairs, * and helper parameters: array \b indice_num, scaler \b inverse and \b sub_m. * * The tensors \b input_grad and \b output_grad are reordered from origin input @@ -9745,8 +9745,8 @@ mluOpIndiceConvolutionBackwardData(mluOpHandle_t handle, * to optimize the indice_convolution_backward_filter operation. * * The size of extra workspace is based on the given information of the indice_convolution_backward_filter - * operation, including the input tensor descriptor \b features_desc , \b output_grad_desc , and \b indice_pairs_desc , - * output tensor descriptor \b filters_grad_desc , and the array \b indice_num[]. + * operation, including the input tensor descriptor \b features_desc, \b output_grad_desc, and \b indice_pairs_desc, + * output tensor descriptor \b filters_grad_desc, and the array \b indice_num[]. 
* * @param[in] handle * Handle to a Cambricon MLU-OPS context that is used to manage MLU devices and queues in the @@ -9787,7 +9787,7 @@ mluOpIndiceConvolutionBackwardData(mluOpHandle_t handle, * * @par API Dependency * - You need to call ::mluOpCreateTensorDescriptor and ::mluOpSetTensorDescriptor to create and set - * tensor descriptors \b features_desc , \b output_grad_desc , \b indice_pairs_desc , and \b filters_grad_desc before + * tensor descriptors \b features_desc, \b output_grad_desc, \b indice_pairs_desc, and \b filters_grad_desc before * calling this function. * - The allocated extra workspace should be passed to ::mluOpIndiceConvolutionBackwardFilter to * perform the indice_convolution_backward_filter operation. @@ -9859,9 +9859,9 @@ mluOpGetIndiceConvolutionBackwardFilterWorkspaceSize(mluOpHandle_t handle, * * @par Data Type * - This function supports the combinations of the following data types for - * input tensor \b features , \b output_grad , \b indice_pairs_num , and output tensor \b filters_grad. - * - \b features , \b output_grad , \b indice_pairs , \b filters_grad data type: half, half, int32, half - * - \b features , \b output_grad , \b indice_pairs , \b filters_grad data type: float, float, int32, float + * input tensor \b features, \b output_grad, \b indice_pairs_num, and output tensor \b filters_grad. + * - \b features, \b output_grad, \b indice_pairs, \b filters_grad data type: half, half, int32, half + * - \b features, \b output_grad, \b indice_pairs, \b filters_grad data type: float, float, int32, float * * @par Data Layout * - None. @@ -10049,7 +10049,7 @@ mluOpGetRoiPointPool3dWorkspaceSize(mluOpHandle_t handle, * * @par Data Type * - The supported data types for input and output are as follows: - * Note that the data type of \b points , \b point_features , \b boxes3d , and + * Note that the data type of \b points, \b point_features, \b boxes3d , and * \b pooled_features must be the same. 
* - points: half, float * - point_features: half, float @@ -10190,9 +10190,9 @@ mluOpGetThreeNNForwardWorkspaceSize(const mluOpHandle_t handle, * @par Scale Limitation * - The shape of \b unknown, \b dist2 and \b idx should be [b, n, 3]. * - The shape of \b known should be [b, m, 3]. - * - The shape of \b unknown , \b dist2 , \b idx , and \b known dims[0](b) should be equal. - * - The shape of \b unknown , \b dist2 , \b idx , and \b known dims[2](3) should be equal to 3. - * - The shape of \b unknown , \b dist2 , \b idx , and \b known dims[1](n) should be equal and larger + * - The shape of \b unknown , \b dist2 , \b idx, and \b known dims[0](b) should be equal. + * - The shape of \b unknown , \b dist2 , \b idx, and \b known dims[2](3) should be equal to 3. + * - The shape of \b unknown , \b dist2 , \b idx, and \b known dims[1](n) should be equal and larger * than 0. * * @par API Dependency @@ -10346,9 +10346,9 @@ mluOpGetIndiceConvolutionForwardWorkspaceSize(mluOpHandle_t handle, * * @par Data Type * - This function supports the combination of the following data types: - * - input tensor \b features , \b filters , \b indice_pairs , and output tensor \b features_out: half, half, int32, + * - input tensor \b features, \b filters, \b indice_pairs, and output tensor \b features_out: half, half, int32, * half - * - input tensor \b features , \b filters , \b indice_pairs , and output tensor \b features_out: float, float, int32, + * - input tensor \b features, \b filters, \b indice_pairs, and output tensor \b features_out: float, float, int32, * float * - The supported data type of array \b indice_num , scalar \b inverse , and \b sub_m is int64. * @@ -10461,8 +10461,8 @@ mluOpIndiceConvolutionForward(mluOpHandle_t handle, * ::MLUOP_STATUS_NOT_SUPPORTED, ::MLUOP_STATUS_EXECUTION_FAILED * * @par Data Type - * - This function supports the following data types for input tensors \b gates , \b indices , - * \b locations , \b input , and \b dispatch. 
+ * - This function supports the following data types for input tensors \b gates, \b indices, + * \b locations, \b input , and \b dispatch. * - gates tensor: float * - indices tensor: int32 * - locations tensor: int32 @@ -10477,7 +10477,7 @@ mluOpIndiceConvolutionForward(mluOpHandle_t handle, * tensor must be the same size and equal to \b samples. * - The second dimension of \b input tensor and \b dispatch tensor must be equal to \b hidden . * - The first dimension of \b dispatch tensor must be equal to the multiplication result of - * the \b capacity and \b num_experts . + * the \b capacity and \b num_experts. * - The samples must be less than or equal to the multiplication result of the \b capacity and \b * num_experts. * - The values of indices must be between 0 and (num_experts-1) . @@ -10488,7 +10488,7 @@ mluOpIndiceConvolutionForward(mluOpHandle_t handle, * * @par Note * - This function is only supported on MLU300 series or above platforms. - * - The parameters \b samples , \b capacity , \b hidden , and \b num_experts should not be negative. + * - The parameters \b samples, \b capacity , \b hidden , and \b num_experts should not be negative. * * @par Example * - The example of the function is as follows: @@ -10609,7 +10609,7 @@ mluOpGetMoeDispatchBackwardGateWorkspaceSize(mluOpHandle_t handle, * @param[in] workspace_size * The size of the extra workspace in bytes. * @param[in] grad_gates_desc - * The descriptor of the tensor \b grad_gates , which contains dimension, data type, and data layout. + * The descriptor of the tensor \b grad_gates, which contains dimension, data type, and data layout. * For detailed information, see ::mluOpTensorDescriptor_t. * @param[out] grad_gates * Pointer to the MLU memory that stores the \b grad_gates tensor. @@ -10635,7 +10635,7 @@ mluOpGetMoeDispatchBackwardGateWorkspaceSize(mluOpHandle_t handle, * - The second dimension of \b input tensor and \b dispatch tensor must be equal to \b hidden. 
* - The first dimension of \b dispatch tensor must be equal to the multiplication result of * the \b capacity and \b num_experts. - * - The value of the input parameters \b samples , \b capacity , \b hidden , and \b num_experts + * - The value of the input parameters \b samples, \b capacity , \b hidden , and \b num_experts * must be greater than or equal to 0. * - The value range of the input parameter \b indices tensor must be greater than or equal to 0 and less than * \b num_experts. @@ -10648,7 +10648,7 @@ mluOpGetMoeDispatchBackwardGateWorkspaceSize(mluOpHandle_t handle, * * @par Note * - This function is only supported on MLU300 series or above platforms. - * - The parameters \b samples , \b capacity , \b hidden , and \b num_experts should not be negative. + * - The parameters \b samples, \b capacity , \b hidden , and \b num_experts should not be negative. * * @par Example * - The example of the operation is as follows: @@ -10772,7 +10772,7 @@ mluOpPointsInBoxes(mluOpHandle_t handle, * @param[in] sampling_ratio * The number of sampling points in the grid used to compute the output. * @param[in] aligned - * A boolean value which determines whether to shift the boxes by 0.5 pixel. + * A Boolean value which determines whether to shift the boxes by 0.5 pixel. * @param[in] grads_image_desc * The descriptor of the tensor \b grads_image of the original images. * @param[out] grads_image @@ -10884,7 +10884,7 @@ mluOpRoiAlignBackward(mluOpHandle_t handle, * @param[in] sampling_ratio * The number of sampling points in the grid used to compute the output. * @param[in] aligned - * A boolean value which determines whether to shift the boxes by 0.5 pixel. If the value + * A Boolean value which determines whether to shift the boxes by 0.5 pixel. If the value * of \b aligned is set to true, the boxes are shifted by 0.5. If the value of \b aligned is set * to false, the boxes are not shifted. 
* @param[in] pool_mode @@ -14284,6 +14284,24 @@ mluOpSetFFTReserveArea(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, void *rese * output is n/2 + 1 non-redundant complex numbers. This requires a padding of input array. * - For in-place N-D real-to-complex FFTs, extra padding of the real-data array on the innermost * dimension is necessary to accommodate the size of the complex-data output. + * - For 2D FFTs, cases with strides that meet the following conditions have + * better performance: + * - real-to-complex: + * - n[0] < 200, n[0] == inembed[0], onembed[0] == n[0] + * - n[1] < 200, n[1] == inembed[1], onembed[1] == n[1]/2+1 + * - input: dims[batch, n0, n1], strides[1, batch*n1, batch] + * - output: dims[batch, n0, n1/2+1], strides[1, batch*(n1/2+1), batch] + * - complex-to-complex: + * - n[0] < 200, n[0] == inembed[0], onembed[0] == n[0] + * - n[1] < 200, n[1] == inembed[1], onembed[1] == n[1] + * - input: dims[batch, n0, n1], strides[1, batch*n1, batch] + * - output: dims[batch, n0, n1], strides[1, batch*n1, batch] + * - complex-to-real: + * - n[0] < 200, n[0] == inembed[0], onembed[0] == n[0] + * - n[1] < 200, n[1]/2+1 == inembed[1], onembed[1] == n[1] + * - input: dims[batch, n0, n1/2+1], strides[1, batch*(n1/2+1), batch] + * - output: dims[batch, n0, n1], strides[1, batch*n1, batch] + * * - When \p input contains NaN or infinity and the input onchip data type of FFT is not quantized * data type, the output is computed through the FFT formula with computation rules of NaN or * infinity based on IEEE 754.