[Fix](mluOpSetFFTReserveArea): fix fft bug (#1120)
Co-authored-by: Chengwei Dong <[email protected]>
Co-authored-by: dongchengwei <[email protected]>
Co-authored-by: niyuming <[email protected]>
4 people authored Oct 22, 2024
1 parent 22df809 commit c205e44
Showing 3 changed files with 330 additions and 98 deletions.
22 changes: 14 additions & 8 deletions kernels/fft/common/fft_common_kernels.mlu
@@ -91,9 +91,8 @@ __mlu_func__ void genSinCosVec(float *src_addr, float *sin_addr,
 */
__mlu_func__ void genSelectOffsetVec(float *offset_addr,
                                     int32_t *offset_int_addr, int deal_size) {
  for (int i = 0; i < deal_size; i++) {
    offset_int_addr[i] = (int)(offset_addr[i]);
  }
  __bang_mul_scalar(offset_addr, offset_addr, (float)sizeof(float), deal_size);
  __bang_float2int32((int32_t *)offset_int_addr, offset_addr, deal_size, 0);
}

/*
@@ -106,9 +105,16 @@ __mlu_func__ void genSelectOffsetVec(float *offset_addr,
 */
__mlu_func__ void selectVec(float *src_addr, int32_t *offset_int_addr,
                            float *dst_addr, int deal_size) {
#if __BANG_ARCH__ >= 372 && __BANG_ARCH__ != 520
  __asm__ volatile(
      "gather.clean.nram.nram.nram.b32.u32 "
      "[%[dst]], [%[src]], [%[offset]], %[data_num];\n\t" ::[dst] "r"(dst_addr),
      [src] "r"(src_addr), [offset] "r"(offset_int_addr), [data_num] "r"(deal_size));
#else
  for (auto i = 0; i < deal_size; i++) {
    dst_addr[i] = src_addr[offset_int_addr[i]];
  }
#endif
}

/*
@@ -143,7 +149,7 @@ __mlu_func__ void generateRFFTHalfDFTMatrixImpl(int n, void *output) {
  float *row_addr = temp_addr;

  // generate 0 to n indices
  __mluop_get_indices(inc_addr, (float)0.0, deal_size);
  __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f);

  // generate sin and cos vectors
  const float scale = -2.0 * M_PI / n;
@@ -227,7 +233,7 @@ __mlu_func__ void generateRFFTFullDFTMatrixImpl(int row, int n, void *output) {
  float *row_addr = temp_addr;

  // generate 0 to n indices
  __mluop_get_indices(inc_addr, (float)0.0, deal_size);
  __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f);

  // generate sin and cos vectors
  const float scale = -2.0 * M_PI / n;
@@ -316,7 +322,7 @@ __mlu_func__ void generateIRFFTHalfDFTMatrixImpl(int n, void *output) {
  float *row_addr = temp_addr;

  // generate 0 to n indices
  __mluop_get_indices(inc_addr, (float)0.0, deal_size);
  __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f);

  // generate sin and cos coefficient vectors
  __bang_write_value((float *)cos_coeff_addr, deal_size, (float)2.0);
@@ -411,7 +417,7 @@ __mlu_func__ void generateIRFFTFullDFTMatrixImpl(int n, void *output) {
  float *row_addr = temp_addr;

  // generate 0 to n indices
  __mluop_get_indices(inc_addr, (float)0.0, deal_size);
  __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f);

  // generate sin and cos vectors
  const float scale = 2.0 * M_PI / n;
@@ -507,7 +513,7 @@ __mlu_func__ void generateC2CFFTDFTMatrixImpl(int n, void *output) {
  float *row_addr = temp_addr;

  // generate 0 to n indices
  __mluop_get_indices(inc_addr, (float)0.0, deal_size);
  __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f);

  // generate sin and cos vectors
  const float forward_scale = -2.0 * M_PI / n;
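For context on the genSelectOffsetVec and selectVec changes above: genSelectOffsetVec now scales each float index by sizeof(float) with __bang_mul_scalar and converts the result to int32 with __bang_float2int32, so the offsets it produces are byte offsets for the gather-based path that selectVec takes when the gather instruction is available; the original element-wise loop is kept as the fallback. Below is a host-side sketch of that offset preparation in standard C++, purely illustrative and not part of the commit (the helper name is made up here).

#include <cstddef>
#include <cstdint>
#include <vector>

// Reference for the reworked genSelectOffsetVec: each float element index is
// scaled by sizeof(float) (yielding a byte offset) and converted to int32,
// mirroring __bang_mul_scalar followed by __bang_float2int32 on the device.
inline std::vector<int32_t> gen_select_offset_reference(
    const std::vector<float> &index) {
  std::vector<int32_t> offset(index.size());
  for (std::size_t i = 0; i < index.size(); ++i) {
    offset[i] = static_cast<int32_t>(index[i] * sizeof(float));
  }
  return offset;
}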
214 changes: 211 additions & 3 deletions kernels/utils/common.h
@@ -25,6 +25,7 @@
#ifndef KERNELS_UTILS_COMMON_H_
#define KERNELS_UTILS_COMMON_H_

#include <algorithm>
#include <type_traits>

#include "float.h"
@@ -74,9 +75,7 @@ __mlu_func__ void __mluop_float2half(half *dst, float *src, int src_count) {
  __bang_float2half_rn(dst, src, src_count);
}

__mlu_func__ half __mluop_float2half(float a) {
  return __float2half_rn(a);
}
__mlu_func__ half __mluop_float2half(float a) { return __float2half_rn(a); }

/******************************************************************************
 * MLUOP FUNC: __mluop_div
@@ -488,4 +487,213 @@ __mlu_vector__ void __mluop_get_indices(float *dst, float start_index,
  }
}

template <typename T>
__mlu_func__ void __mlu_op_arange_base_(T *dst_nram, uint32_t numel,
                                        T start_index, T step) {
  for (uint32_t i = 0; i < numel; i++) {
    dst_nram[i] = start_index + i * step;
  }
}

#define MLUOP_ARANGE_VV_IMPL(VVType, vv_num, dst_nram, start_index, step) \
  do {                                                                    \
    VVType vv_index[8];                                                   \
    __vv_index(vv_index[0], start_index, step);                           \
    __vv_add(vv_index[1], vv_index[0], 1 * vv_num * step);                \
    __vv_add(vv_index[2], vv_index[0], 2 * vv_num * step);                \
    __vv_add(vv_index[3], vv_index[0], 3 * vv_num * step);                \
    __vv_add(vv_index[4], vv_index[0], 4 * vv_num * step);                \
    __vv_add(vv_index[5], vv_index[0], 5 * vv_num * step);                \
    __vv_add(vv_index[6], vv_index[0], 6 * vv_num * step);                \
    __vv_add(vv_index[7], vv_index[0], 7 * vv_num * step);                \
    __vv_store(dst_nram, vv_index[0], vv_num);                            \
    __vv_store(dst_nram + vv_num, vv_index[1], vv_num);                   \
    __vv_store(dst_nram + 2 * vv_num, vv_index[2], vv_num);               \
    __vv_store(dst_nram + 3 * vv_num, vv_index[3], vv_num);               \
    __vv_store(dst_nram + 4 * vv_num, vv_index[4], vv_num);               \
    __vv_store(dst_nram + 5 * vv_num, vv_index[5], vv_num);               \
    __vv_store(dst_nram + 6 * vv_num, vv_index[6], vv_num);               \
    __vv_store(dst_nram + 7 * vv_num, vv_index[7], vv_num);               \
  } while (false)

template <typename T>
__mlu_vector__ void __mlu_op_arange_vv_(T *dst_nram, T start_index, T step) {
#if 592 < __BANG_ARCH__
  static_assert(
      (std::is_same<T, float>::value || std::is_same<T, half>::value ||
       std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value),
      "__mlu_op_arange_vv type error!");
#else  // #if 592 < __BANG_ARCH__
  static_assert(
      (std::is_same<T, float>::value || std::is_same<T, half>::value ||
       std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value ||
       std::is_same<T, int32_t>::value || std::is_same<T, uint32_t>::value),
      "__mlu_op_arange_vv type error!");
#endif

  const uint32_t vv_num = __vv_get_length() / sizeof(T);

#if __BANG_ARCH__ <= 592
  if (std::is_same<T, uint16_t>::value) {
    MLUOP_ARANGE_VV_IMPL(vv_uint16, vv_num, dst_nram, start_index, step);
  } else if (std::is_same<T, int16_t>::value) {
    MLUOP_ARANGE_VV_IMPL(vv_int16, vv_num, dst_nram, start_index, step);
  } else if (std::is_same<T, uint32_t>::value) {
    MLUOP_ARANGE_VV_IMPL(vv_uint32, vv_num, dst_nram, start_index, step);
  } else if (std::is_same<T, int32_t>::value) {
    MLUOP_ARANGE_VV_IMPL(vv_int32, vv_num, dst_nram, start_index, step);
  }
#endif  // if __BANG_ARCH__ <= 592
  if (std::is_same<T, uint16_t>::value) {
    MLUOP_ARANGE_VV_IMPL(vv_uint16, vv_num, dst_nram, start_index, step);
  } else if (std::is_same<T, int16_t>::value) {
    MLUOP_ARANGE_VV_IMPL(vv_int16, vv_num, dst_nram, start_index, step);
  } else if (std::is_same<T, float>::value) {
    MLUOP_ARANGE_VV_IMPL(vv_float, vv_num, dst_nram, start_index, step);
  } else if (std::is_same<T, half>::value) {
    MLUOP_ARANGE_VV_IMPL(vv_half, vv_num, dst_nram, start_index, step);
  }
  return;
}

#if 592 < __BANG_ARCH__
template <typename T>
__mlu_func__ void __mlu_op_gen_integer_incr_seq_(T *dst_nram,
                                                 uint32_t elem_count,
                                                 T start = 0, T step = 1) {
  static_assert(
      (std::is_same<T, int32_t>::value || std::is_same<T, uint32_t>::value ||
       std::is_same<T, int64_t>::value || std::is_same<T, uint64_t>::value),
      "__mlu_op_gen_integer_incr_seq type error!");
  if (std::is_same<T, uint32_t>::value) {
    __bang_incseq(reinterpret_cast<int32_t *>(dst_nram), elem_count);
  } else if (std::is_same<T, uint64_t>::value) {
    __bang_incseq(reinterpret_cast<int64_t *>(dst_nram), elem_count);
  } else {
    __bang_incseq(dst_nram, elem_count);
  }

  if (start != 0) {
    if (std::is_same<T, int64_t>::value || std::is_same<T, uint64_t>::value) {
      if (step != 1) {
        __bang_mul_scalar(dst_nram, dst_nram, step, elem_count);
      }
      __bang_add_scalar(dst_nram, dst_nram, start, elem_count);
    } else {
      __bang_fusion(FUSION_FMA, dst_nram, dst_nram, step, start, elem_count);
    }
  }
}
#endif  // if 592 < __BANG_ARCH__

#define u32_sizeof(T) ((uint32_t)sizeof(T))

template <typename T>
__mlu_func__ void __mlu_op_arange_by_expand_(T *dst_nram, uint32_t numel,
                                             T start_index = 0, T step = 1) {
#if 592 < __BANG_ARCH__
  static_assert(
      (std::is_same<T, float>::value || std::is_same<T, half>::value ||
       std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value),
      "__mlu_op_arange_by_expand type error!");
#else  // if 592 < __BANG_ARCH__
  static_assert(
      (std::is_same<T, float>::value || std::is_same<T, half>::value ||
       std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value ||
       std::is_same<T, int32_t>::value || std::is_same<T, uint32_t>::value ||
       std::is_same<T, int64_t>::value || std::is_same<T, uint64_t>::value),
      "__mlu_op_arange_by_expand type error!");
#endif  // if 592 < __BANG_ARCH__

  // using AluGenSize = std::integral_constant<uint32_t, NFU_ALIGN_SIZE>;
  using GuGenSize = std::integral_constant<uint32_t, 2048>;
  uint32_t gu_gen_num = GuGenSize::value / u32_sizeof(T);
  uint32_t alu_gen_num = NFU_ALIGN_SIZE / u32_sizeof(T);
  uint32_t base_num = alu_gen_num;
#if __BANG_ARCH__ <= 592
  if (std::is_same<T, uint64_t>::value || std::is_same<T, int64_t>::value) {
    const uint32_t prologue_num = std::min(numel, base_num);
    __mlu_op_arange_base_(dst_nram, prologue_num, start_index, step);

    if (numel <= base_num) {
      return;
    }
  } else {
    if (numel <= gu_gen_num) {
      const uint32_t prologue_num = std::min(numel, base_num);
      __mlu_op_arange_base_(dst_nram, prologue_num, start_index, step);

      if (numel <= base_num) {
        return;
      }
    } else {
      __mlu_op_arange_vv_(dst_nram, start_index, step);
      base_num = gu_gen_num;
    }
  }
#else
  if (numel <= gu_gen_num) {
    const uint32_t prologue_num = std::min(numel, base_num);
    __mlu_op_arange_base_(dst_nram, prologue_num, start_index, step);

    if (numel <= base_num) {
      return;
    }
  } else {
    __mlu_op_arange_vv_(dst_nram, start_index, step);
    base_num = gu_gen_num;
  }
#endif
  // base_num = 2^exp
  uint32_t exp = 0;
  asm volatile("findlast1.gpr.b32 %[dst], %[src];\n\t"
               : [ dst ] "+&r"(exp)
               : [ src ] "r"(base_num));
  // numel = count * base_num + remain
  const uint32_t segnum = numel >> exp;
  // count = 2^repeat
  uint32_t repeat = 0;
  asm volatile("findlast1.gpr.b32 %[dst], %[src];\n\t"
               : [ dst ] "+&r"(repeat)
               : [ src ] "r"(segnum));
  uint32_t count = 1;
  for (uint32_t i = 0; i < repeat; ++i) {
    __bang_add_scalar(dst_nram + count * base_num, dst_nram,
                      count * base_num * step, count * base_num);
    count *= 2;
  }

  const uint32_t remain = numel - count * base_num;
  if (0 < remain) {
    __bang_add_scalar(dst_nram + count * base_num, dst_nram,
                      count * base_num * step, remain);
  }
}
/***************************************************************************
 * MLUOP FUNC: __mlu_op_gen_stage_index
 * param "dst_nram" is an NRAM pointer to the generated result.
 * param "numel" is the number of elements to be generated.
 * param "start_index" is the starting value of the sequence. Default: 0.
 * param "step" is the gap between adjacent points. Default: 1.
 * remarks: for a detailed introduction see
 * http://wiki.cambricon.com/pages/viewpage.action?pageId=119467501.
 * int64_t and uint64_t types are under-optimized and can be improved with GU.
 ***************************************************************************/

template <typename T>
__mlu_func__ void __mlu_op_gen_stage_index(T *dst_nram, uint32_t numel,
                                           T start_index = 0, T step = 1) {
#if 592 < __BANG_ARCH__
  if (std::is_same<T, int32_t>::value || std::is_same<T, uint32_t>::value ||
      std::is_same<T, int64_t>::value || std::is_same<T, uint64_t>::value) {
    __mlu_op_gen_integer_incr_seq_(dst_nram, numel, start_index, step);
  } else {
    __mlu_op_arange_by_expand_(dst_nram, numel, start_index, step);
  }
#else
  __mlu_op_arange_by_expand_(dst_nram, numel, start_index, step);
#endif
}

#endif // KERNELS_UTILS_COMMON_H_
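The additions to common.h introduce __mlu_op_gen_stage_index, which fills an NRAM buffer with the arithmetic sequence start_index, start_index + step, start_index + 2 * step, ... On newer architectures integer types use __bang_incseq; otherwise a small base block is generated element by element (or via the vector-vector path) and then doubled repeatedly with __bang_add_scalar until numel values are filled. The FFT kernels above call __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f) to produce the indices 0, 1, ..., deal_size - 1. Below is a host-side reference model of the generated sequence in standard C++, purely illustrative and not part of the commit (the function name is made up here).

#include <cstdint>
#include <vector>

// Reference model for __mlu_op_gen_stage_index: `numel` values starting at
// `start` with stride `step`. The device code produces the same sequence by
// seeding a base block and repeatedly doubling it with __bang_add_scalar.
template <typename T>
std::vector<T> gen_stage_index_reference(uint32_t numel, T start = 0,
                                         T step = 1) {
  std::vector<T> out(numel);
  for (uint32_t i = 0; i < numel; ++i) {
    out[i] = static_cast<T>(start + static_cast<T>(i) * step);
  }
  return out;
}

// Example: gen_stage_index_reference<float>(8, 0.0f, 1.0f) yields
// {0, 1, 2, 3, 4, 5, 6, 7}, matching the index vector the DFT-matrix
// generators consume.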