[Fix](mluOpSetFFTReserveArea): fix fft bug (#1120)
Co-authored-by: Chengwei Dong <[email protected]>
Co-authored-by: dongchengwei <[email protected]>
Co-authored-by: niyuming <[email protected]>
4 people authored Oct 22, 2024
1 parent 22df809 commit c205e44
Showing 3 changed files with 330 additions and 98 deletions.
22 changes: 14 additions & 8 deletions kernels/fft/common/fft_common_kernels.mlu
@@ -91,9 +91,8 @@ __mlu_func__ void genSinCosVec(float *src_addr, float *sin_addr,
 */
__mlu_func__ void genSelectOffsetVec(float *offset_addr,
                                     int32_t *offset_int_addr, int deal_size) {
  for (int i = 0; i < deal_size; i++) {
    offset_int_addr[i] = (int)(offset_addr[i]);
  }
  __bang_mul_scalar(offset_addr, offset_addr, (float)sizeof(float), deal_size);
  __bang_float2int32((int32_t *)offset_int_addr, offset_addr, deal_size, 0);
}

/*
@@ -106,9 +105,16 @@ __mlu_func__ void genSelectOffsetVec(float *offset_addr,
 */
__mlu_func__ void selectVec(float *src_addr, int32_t *offset_int_addr,
                            float *dst_addr, int deal_size) {
#if __BANG_ARCH__ >= 372 && __BANG_ARCH__ != 520
  __asm__ volatile(
      "gather.clean.nram.nram.nram.b32.u32 "
      "[%[dst]], [%[src]], [%[offset]], %[data_num];\n\t" ::[dst] "r"(dst_addr),
      [src] "r"(src_addr), [offset] "r"(offset_int_addr), [data_num] "r"(deal_size));
#else
  for (auto i = 0; i < deal_size; i++) {
    dst_addr[i] = src_addr[offset_int_addr[i]];
  }
#endif
}

/*
@@ -143,7 +149,7 @@ __mlu_func__ void generateRFFTHalfDFTMatrixImpl(int n, void *output) {
  float *row_addr = temp_addr;

  // generate 0 to n indices
  __mluop_get_indices(inc_addr, (float)0.0, deal_size);
  __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f);

  // generate sin and cos vectors
  const float scale = -2.0 * M_PI / n;
@@ -227,7 +233,7 @@ __mlu_func__ void generateRFFTFullDFTMatrixImpl(int row, int n, void *output) {
  float *row_addr = temp_addr;

  // generate 0 to n indices
  __mluop_get_indices(inc_addr, (float)0.0, deal_size);
  __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f);

  // generate sin and cos vectors
  const float scale = -2.0 * M_PI / n;
@@ -316,7 +322,7 @@ __mlu_func__ void generateIRFFTHalfDFTMatrixImpl(int n, void *output) {
  float *row_addr = temp_addr;

  // generate 0 to n indices
  __mluop_get_indices(inc_addr, (float)0.0, deal_size);
  __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f);

  // generate sin and cos coefficient vectors
  __bang_write_value((float *)cos_coeff_addr, deal_size, (float)2.0);
@@ -411,7 +417,7 @@ __mlu_func__ void generateIRFFTFullDFTMatrixImpl(int n, void *output) {
  float *row_addr = temp_addr;

  // generate 0 to n indices
  __mluop_get_indices(inc_addr, (float)0.0, deal_size);
  __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f);

  // generate sin and cos vectors
  const float scale = 2.0 * M_PI / n;
@@ -507,7 +513,7 @@ __mlu_func__ void generateC2CFFTDFTMatrixImpl(int n, void *output) {
  float *row_addr = temp_addr;

  // generate 0 to n indices
  __mluop_get_indices(inc_addr, (float)0.0, deal_size);
  __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f);

  // generate sin and cos vectors
  const float forward_scale = -2.0 * M_PI / n;
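For context on the genSelectOffsetVec and selectVec changes above: genSelectOffsetVec now scales each float index by sizeof(float) with __bang_mul_scalar and converts the result to int32 with __bang_float2int32, so the offsets it produces are byte offsets for the gather-based path that selectVec takes when the gather instruction is available; the original element-wise loop is kept as the fallback. Below is a host-side sketch of that offset preparation in standard C++, purely illustrative and not part of the commit (the helper name is made up here).

#include <cstddef>
#include <cstdint>
#include <vector>

// Reference for the reworked genSelectOffsetVec: each float element index is
// scaled by sizeof(float) (yielding a byte offset) and converted to int32,
// mirroring __bang_mul_scalar followed by __bang_float2int32 on the device.
inline std::vector<int32_t> gen_select_offset_reference(
    const std::vector<float> &index) {
  std::vector<int32_t> offset(index.size());
  for (std::size_t i = 0; i < index.size(); ++i) {
    offset[i] = static_cast<int32_t>(index[i] * sizeof(float));
  }
  return offset;
}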
214 changes: 211 additions & 3 deletions kernels/utils/common.h
@@ -25,6 +25,7 @@
#ifndef KERNELS_UTILS_COMMON_H_
#define KERNELS_UTILS_COMMON_H_

#include <algorithm>
#include <type_traits>

#include "float.h"
@@ -74,9 +75,7 @@ __mlu_func__ void __mluop_float2half(half *dst, float *src, int src_count) {
  __bang_float2half_rn(dst, src, src_count);
}

__mlu_func__ half __mluop_float2half(float a) {
  return __float2half_rn(a);
}
__mlu_func__ half __mluop_float2half(float a) { return __float2half_rn(a); }

/******************************************************************************
 * MLUOP FUNC: __mluop_div
@@ -488,4 +487,213 @@ __mlu_vector__ void __mluop_get_indices(float *dst, float start_index,
  }
}

template <typename T>
__mlu_func__ void __mlu_op_arange_base_(T *dst_nram, uint32_t numel,
                                        T start_index, T step) {
  for (uint32_t i = 0; i < numel; i++) {
    dst_nram[i] = start_index + i * step;
  }
}

#define MLUOP_ARANGE_VV_IMPL(VVType, vv_num, dst_nram, start_index, step) \
  do {                                                                    \
    VVType vv_index[8];                                                   \
    __vv_index(vv_index[0], start_index, step);                           \
    __vv_add(vv_index[1], vv_index[0], 1 * vv_num * step);                \
    __vv_add(vv_index[2], vv_index[0], 2 * vv_num * step);                \
    __vv_add(vv_index[3], vv_index[0], 3 * vv_num * step);                \
    __vv_add(vv_index[4], vv_index[0], 4 * vv_num * step);                \
    __vv_add(vv_index[5], vv_index[0], 5 * vv_num * step);                \
    __vv_add(vv_index[6], vv_index[0], 6 * vv_num * step);                \
    __vv_add(vv_index[7], vv_index[0], 7 * vv_num * step);                \
    __vv_store(dst_nram, vv_index[0], vv_num);                            \
    __vv_store(dst_nram + vv_num, vv_index[1], vv_num);                   \
    __vv_store(dst_nram + 2 * vv_num, vv_index[2], vv_num);               \
    __vv_store(dst_nram + 3 * vv_num, vv_index[3], vv_num);               \
    __vv_store(dst_nram + 4 * vv_num, vv_index[4], vv_num);               \
    __vv_store(dst_nram + 5 * vv_num, vv_index[5], vv_num);               \
    __vv_store(dst_nram + 6 * vv_num, vv_index[6], vv_num);               \
    __vv_store(dst_nram + 7 * vv_num, vv_index[7], vv_num);               \
  } while (false)

template <typename T>
__mlu_vector__ void __mlu_op_arange_vv_(T *dst_nram, T start_index, T step) {
#if 592 < __BANG_ARCH__
  static_assert(
      (std::is_same<T, float>::value || std::is_same<T, half>::value ||
       std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value),
      "__mlu_op_arange_vv type error!");
#else  // #if 592 < __BANG_ARCH__
  static_assert(
      (std::is_same<T, float>::value || std::is_same<T, half>::value ||
       std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value ||
       std::is_same<T, int32_t>::value || std::is_same<T, uint32_t>::value),
      "__mlu_op_arange_vv type error!");
#endif

  const uint32_t vv_num = __vv_get_length() / sizeof(T);

#if __BANG_ARCH__ <= 592
  if (std::is_same<T, uint16_t>::value) {
    MLUOP_ARANGE_VV_IMPL(vv_uint16, vv_num, dst_nram, start_index, step);
  } else if (std::is_same<T, int16_t>::value) {
    MLUOP_ARANGE_VV_IMPL(vv_int16, vv_num, dst_nram, start_index, step);
  } else if (std::is_same<T, uint32_t>::value) {
    MLUOP_ARANGE_VV_IMPL(vv_uint32, vv_num, dst_nram, start_index, step);
  } else if (std::is_same<T, int32_t>::value) {
    MLUOP_ARANGE_VV_IMPL(vv_int32, vv_num, dst_nram, start_index, step);
  }
#endif  // if __BANG_ARCH__ <= 592
  if (std::is_same<T, uint16_t>::value) {
    MLUOP_ARANGE_VV_IMPL(vv_uint16, vv_num, dst_nram, start_index, step);
  } else if (std::is_same<T, int16_t>::value) {
    MLUOP_ARANGE_VV_IMPL(vv_int16, vv_num, dst_nram, start_index, step);
  } else if (std::is_same<T, float>::value) {
    MLUOP_ARANGE_VV_IMPL(vv_float, vv_num, dst_nram, start_index, step);
  } else if (std::is_same<T, half>::value) {
    MLUOP_ARANGE_VV_IMPL(vv_half, vv_num, dst_nram, start_index, step);
  }
  return;
}

#if 592 < __BANG_ARCH__
template <typename T>
__mlu_func__ void __mlu_op_gen_integer_incr_seq_(T *dst_nram,
                                                 uint32_t elem_count,
                                                 T start = 0, T step = 1) {
  static_assert(
      (std::is_same<T, int32_t>::value || std::is_same<T, uint32_t>::value ||
       std::is_same<T, int64_t>::value || std::is_same<T, uint64_t>::value),
      "__mlu_op_gen_integer_incr_seq type error!");
  if (std::is_same<T, uint32_t>::value) {
    __bang_incseq(reinterpret_cast<int32_t *>(dst_nram), elem_count);
  } else if (std::is_same<T, uint64_t>::value) {
    __bang_incseq(reinterpret_cast<int64_t *>(dst_nram), elem_count);
  } else {
    __bang_incseq(dst_nram, elem_count);
  }

  if (start != 0) {
    if (std::is_same<T, int64_t>::value || std::is_same<T, uint64_t>::value) {
      if (step != 1) {
        __bang_mul_scalar(dst_nram, dst_nram, step, elem_count);
      }
      __bang_add_scalar(dst_nram, dst_nram, start, elem_count);
    } else {
      __bang_fusion(FUSION_FMA, dst_nram, dst_nram, step, start, elem_count);
    }
  }
}
#endif  // if 592 < __BANG_ARCH__

#define u32_sizeof(T) ((uint32_t)sizeof(T))

template <typename T>
__mlu_func__ void __mlu_op_arange_by_expand_(T *dst_nram, uint32_t numel,
                                             T start_index = 0, T step = 1) {
#if 592 < __BANG_ARCH__
  static_assert(
      (std::is_same<T, float>::value || std::is_same<T, half>::value ||
       std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value),
      "__mlu_op_arange_by_expand type error!");
#else  // if 592 < __BANG_ARCH__
  static_assert(
      (std::is_same<T, float>::value || std::is_same<T, half>::value ||
       std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value ||
       std::is_same<T, int32_t>::value || std::is_same<T, uint32_t>::value ||
       std::is_same<T, int64_t>::value || std::is_same<T, uint64_t>::value),
      "__mlu_op_arange_by_expand type error!");
#endif  // if 592 < __BANG_ARCH__

  // using AluGenSize = std::integral_constant<uint32_t, NFU_ALIGN_SIZE>;
  using GuGenSize = std::integral_constant<uint32_t, 2048>;
  uint32_t gu_gen_num = GuGenSize::value / u32_sizeof(T);
  uint32_t alu_gen_num = NFU_ALIGN_SIZE / u32_sizeof(T);
  uint32_t base_num = alu_gen_num;
#if __BANG_ARCH__ <= 592
  if (std::is_same<T, uint64_t>::value || std::is_same<T, int64_t>::value) {
    const uint32_t prologue_num = std::min(numel, base_num);
    __mlu_op_arange_base_(dst_nram, prologue_num, start_index, step);

    if (numel <= base_num) {
      return;
    }
  } else {
    if (numel <= gu_gen_num) {
      const uint32_t prologue_num = std::min(numel, base_num);
      __mlu_op_arange_base_(dst_nram, prologue_num, start_index, step);

      if (numel <= base_num) {
        return;
      }
    } else {
      __mlu_op_arange_vv_(dst_nram, start_index, step);
      base_num = gu_gen_num;
    }
  }
#else
  if (numel <= gu_gen_num) {
    const uint32_t prologue_num = std::min(numel, base_num);
    __mlu_op_arange_base_(dst_nram, prologue_num, start_index, step);

    if (numel <= base_num) {
      return;
    }
  } else {
    __mlu_op_arange_vv_(dst_nram, start_index, step);
    base_num = gu_gen_num;
  }
#endif
  // base_num = 2^exp
  uint32_t exp = 0;
  asm volatile("findlast1.gpr.b32 %[dst], %[src];\n\t"
               : [ dst ] "+&r"(exp)
               : [ src ] "r"(base_num));
  // numel = count * base_num + remain
  const uint32_t segnum = numel >> exp;
  // count = 2^repeat
  uint32_t repeat = 0;
  asm volatile("findlast1.gpr.b32 %[dst], %[src];\n\t"
               : [ dst ] "+&r"(repeat)
               : [ src ] "r"(segnum));
  uint32_t count = 1;
  for (uint32_t i = 0; i < repeat; ++i) {
    __bang_add_scalar(dst_nram + count * base_num, dst_nram,
                      count * base_num * step, count * base_num);
    count *= 2;
  }

  const uint32_t remain = numel - count * base_num;
  if (0 < remain) {
    __bang_add_scalar(dst_nram + count * base_num, dst_nram,
                      count * base_num * step, remain);
  }
}
/***************************************************************************
 * MLUOP FUNC: __mlu_op_gen_stage_index
 * param "dst_nram" is an NRAM pointer to the generated result.
 * param "numel" is the number of elements to be generated.
 * param "start_index" is the starting value of the sequence. Default: 0.
 * param "step" is the gap between adjacent points. Default: 1.
 * remarks: for a detailed introduction see
 * http://wiki.cambricon.com/pages/viewpage.action?pageId=119467501.
 * int64_t and uint64_t types are under-optimized and can be improved with GU.
 ***************************************************************************/

template <typename T>
__mlu_func__ void __mlu_op_gen_stage_index(T *dst_nram, uint32_t numel,
                                           T start_index = 0, T step = 1) {
#if 592 < __BANG_ARCH__
  if (std::is_same<T, int32_t>::value || std::is_same<T, uint32_t>::value ||
      std::is_same<T, int64_t>::value || std::is_same<T, uint64_t>::value) {
    __mlu_op_gen_integer_incr_seq_(dst_nram, numel, start_index, step);
  } else {
    __mlu_op_arange_by_expand_(dst_nram, numel, start_index, step);
  }
#else
  __mlu_op_arange_by_expand_(dst_nram, numel, start_index, step);
#endif
}

#endif // KERNELS_UTILS_COMMON_H_
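The additions to common.h introduce __mlu_op_gen_stage_index, which fills an NRAM buffer with the arithmetic sequence start_index, start_index + step, start_index + 2 * step, ... On newer architectures integer types use __bang_incseq; otherwise a small base block is generated element by element (or via the vector-vector path) and then doubled repeatedly with __bang_add_scalar until numel values are filled. The FFT kernels above call __mlu_op_gen_stage_index(inc_addr, deal_size, 0.0f, 1.0f) to produce the indices 0, 1, ..., deal_size - 1. Below is a host-side reference model of the generated sequence in standard C++, purely illustrative and not part of the commit (the function name is made up here).

#include <cstdint>
#include <vector>

// Reference model for __mlu_op_gen_stage_index: `numel` values starting at
// `start` with stride `step`. The device code produces the same sequence by
// seeding a base block and repeatedly doubling it with __bang_add_scalar.
template <typename T>
std::vector<T> gen_stage_index_reference(uint32_t numel, T start = 0,
                                         T step = 1) {
  std::vector<T> out(numel);
  for (uint32_t i = 0; i < numel; ++i) {
    out[i] = static_cast<T>(start + static_cast<T>(i) * step);
  }
  return out;
}

// Example: gen_stage_index_reference<float>(8, 0.0f, 1.0f) yields
// {0, 1, 2, 3, 4, 5, 6, 7}, matching the index vector the DFT-matrix
// generators consume.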