Skip to content

Commit

Permalink
[Fix](mluOpExecFFT): fix core dump, scale factor and one point compute error
Browse files Browse the repository at this point in the history
  • Loading branch information
niyuming committed Dec 5, 2024
1 parent c3cbdf0 commit 4fdd0b4
Show file tree
Hide file tree
Showing 12 changed files with 561 additions and 849 deletions.
260 changes: 145 additions & 115 deletions kernels/fft/c2c_fft/c2c_fft_host.cpp

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions kernels/fft/common/fft_basic_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -495,10 +495,10 @@ mluOpStatus_t fftGetBatchMatMulBcastWorkspaceSize(
cnnlMatMulHeuristicResult_t heuristic_result;
CALL_CNNL(cnnlCreateMatMulHeuristicResult(&heuristic_result));
int requested_algo_count = 1, return_algo_count = 0;
cnnlGetBatchMatMulAlgoHeuristic(
cnnlGetBatchMatMulExAlgoHeuristic(
cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL,
requested_algo_count, &heuristic_result, &return_algo_count);
cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size);
cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size);
// destroy descriptor
// destroy cnnl descriptor
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc);
Expand Down Expand Up @@ -595,20 +595,20 @@ mluOpStatus_t fftBatchMatMulBcast(
alpha = 1.0;
beta = 0.0;
int requested_algo_count = 1, return_algo_count = 0;
cnnlGetBatchMatMulAlgoHeuristic(
cnnlGetBatchMatMulExAlgoHeuristic(
cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL,
requested_algo_count, &heuristic_result, &return_algo_count);
cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size);
cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size);
if (workspace_size > 0) {
CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size));
} else {
CNRT_CHECK(cnrtMalloc((void **)&workspace, m * n * sizeof(float)));
}

CALL_CNNL(cnnlBatchMatMulBCast_v2(cnnl_handle, bmm_bcast_desc, algo, &alpha,
cnnl_a_desc, a_ptr, cnnl_b_desc, b_ptr,
&beta, cnnl_c_desc, c_ptr,
(void *)workspace, workspace_size));
CALL_CNNL(cnnlBatchMatMulEx(cnnl_handle, bmm_bcast_desc, algo, &alpha,
cnnl_a_desc, a_ptr, cnnl_b_desc, b_ptr,
&beta, cnnl_c_desc, c_ptr,
(void *)workspace, workspace_size));
// destroy descriptor
// destroy cnnl descriptor
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc);
Expand Down
3 changes: 2 additions & 1 deletion kernels/fft/common/fft_common_kernels.mlu
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,8 @@ __mlu_func__ void selectVec(float *src_addr, int32_t *offset_int_addr,
__asm__ volatile(
"gather.clean.nram.nram.nram.b32.u32 "
"[%[dst]], [%[src]], [%[offset]], %[data_num];\n\t" ::[dst] "r"(dst_addr),
[src] "r"(src_addr), [offset] "r"(offset_int_addr), [data_num] "r"(deal_size));
[ src ] "r"(src_addr), [ offset ] "r"(offset_int_addr),
[ data_num ] "r"(deal_size));
#else
for (auto i = 0; i < deal_size; i++) {
dst_addr[i] = src_addr[offset_int_addr[i]];
Expand Down
114 changes: 89 additions & 25 deletions kernels/fft/fft.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1657,7 +1657,7 @@ mluOpAllocateC2C1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
fft_plan->is_batch_contiguous)
? 0
: buffer_size;
if (fft_plan->n[0] > fft_plan->inembed[0]) {
if (fft_plan->n[0] != fft_plan->inembed[0]) {
workspace_size += buffer_size;
}
size_t twiddles_size = in_c_dtype_size * nfft * 2;
Expand Down Expand Up @@ -1701,7 +1701,7 @@ mluOpAllocateR2C1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */
+ twiddles_size * 2 + DFT_TABLE_SIZE * 2; /* twiddles */

if (fft_plan->n[0] > fft_plan->inembed[0]) {
if (fft_plan->n[0] != fft_plan->inembed[0]) {
workspace_size += buffer_size; // input_pad_addr
}
fft_plan->workspace_size = workspace_size;
Expand All @@ -1721,32 +1721,36 @@ mluOpStatus_t MLUOP_WIN_API mluOpAllocateC2C2D(
size_t in_c_dtype_size = mluOpDataTypeBytes(in_c_dtype);

int batch = fft_plan->batch;
const int _n0 = fft_plan->n[0];
const int _n1 = fft_plan->n[1];
const int n0_ori = fft_plan->n[0];
const int n1_ori = fft_plan->n[1];

size_t buffer_size = batch * in_c_dtype_size * _n0 * _n1;
size_t buffer_size = batch * in_c_dtype_size * n0_ori * n1_ori;

size_t twiddles_size = in_c_dtype_size * _n0;
size_t twiddles_size_2d = in_c_dtype_size * _n1;
size_t twiddles_size = in_c_dtype_size * n0_ori;
size_t twiddles_size_2d = in_c_dtype_size * n1_ori;

if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) {
reservespace_size =
(in_c_dtype_size * _n0 * _n0 + in_c_dtype_size * _n1 * _n1) *
2; /* DFT matrix */
reservespace_size = (in_c_dtype_size * n0_ori * n0_ori +
in_c_dtype_size * n1_ori * n1_ori) *
2; /* DFT matrix */
workspace_size = buffer_size * 6;
} else if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) {
reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */
+ sizeof(int) * (FFT_MAXFACTORS) + twiddles_size * 2 +
DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 +
DFT_TABLE_SIZE * 2; /* twiddles */
workspace_size = buffer_size * 2;
workspace_size += (fft_plan->is_input_contiguous) ? 0 : buffer_size;
workspace_size += (fft_plan->is_input_contiguous &&
fft_plan->inembed[0] <= fft_plan->n[0] &&
fft_plan->inembed[1] <= fft_plan->n[1])
? 0
: buffer_size;
workspace_size += (fft_plan->is_output_contiguous) ? 0 : buffer_size;
}

fft_plan->workspace_size = workspace_size;
if (fft_plan->n[0] > fft_plan->inembed[0] ||
fft_plan->n[1] > fft_plan->inembed[1]) {
if (fft_plan->n[0] != fft_plan->inembed[0] ||
fft_plan->n[1] != fft_plan->inembed[1]) {
fft_plan->workspace_size = workspace_size + buffer_size; // input_pad_addr
}
fft_plan->reservespace_size = reservespace_size;
Expand Down Expand Up @@ -1783,19 +1787,66 @@ mluOpAllocateC2R1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */
+ twiddles_size * 2 + DFT_TABLE_SIZE * 2; /* twiddles */

if (fft_plan->n[0] > fft_plan->inembed[0]) {
if (fft_plan->n[0] != fft_plan->inembed[0]) {
workspace_size += buffer_size; // input_pad_addr
}
fft_plan->workspace_size = workspace_size;
fft_plan->reservespace_size = reservespace_size;

return MLUOP_STATUS_SUCCESS;
}
// Computes the workspace and reserve-space sizes for a 2D FFT plan and stores
// them in fft_plan->workspace_size and fft_plan->reservespace_size.
// In the visible code it is called from mluOpMakeFFTPlanC2R2D with
// (n[0], n[1]) as (n0_ori, n1_ori).
//
// Parameters:
//   handle, input_desc, output_desc - not read by this function body.
//   fft_plan - read for dtypes, batch, strategy, contiguity flags, n[] and
//              inembed[]; written with the two computed sizes.
//   n0_ori, n1_ori - the two FFT lengths used for all size arithmetic.
//
// Returns MLUOP_STATUS_SUCCESS unconditionally; no error path exists here.
mluOpStatus_t MLUOP_WIN_API mluOpAllocateIRFFT2D(
mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
mluOpTensorDescriptor_t input_desc, mluOpTensorDescriptor_t output_desc,
const int n0_ori, const int n1_ori) {
// NOTE(review): make_plan_api is not referenced again in this function.
const std::string make_plan_api = "[mluOpAllocateIRFFT2D]";
size_t workspace_size = 0, reservespace_size = 0;

mluOpDataType_t out_c_dtype = fft_plan->output_dtype;
mluOpDataType_t in_c_dtype = fft_plan->input_dtype;
// Use the larger of the two element sizes reported by mluOpDataTypeBytes for
// the input and output dtypes, so every buffer below is sized for the wider
// of the two.
size_t complex_dtype_size =
(mluOpDataTypeBytes(out_c_dtype) > mluOpDataTypeBytes(in_c_dtype))
? mluOpDataTypeBytes(out_c_dtype)
: mluOpDataTypeBytes(in_c_dtype);

int batch = fft_plan->batch;
// One full batch * n0_ori * n1_ori buffer at complex_dtype_size per element.
size_t buffer_size = batch * complex_dtype_size * n0_ori * n1_ori;

// Per-dimension twiddle table sizes, one entry per point of each FFT length.
size_t twiddles_size = complex_dtype_size * n0_ori;
size_t twiddles_size_2d = complex_dtype_size * n1_ori;

// Sizes depend on the selected strategy. If fft_strategy matches neither
// branch, both sizes stay at 0 up to the padding check below.
if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) {
reservespace_size =
complex_dtype_size * n0_ori * n0_ori * 2 +
complex_dtype_size * n1_ori * n1_ori * 2; /* DFT matrix */
// Six buffers of batch * n0_ori * n1_ori elements.
workspace_size = complex_dtype_size * n1_ori * n0_ori * batch * 6;
} else if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) {
// Two factor arrays plus, for each of the two dimensions, twiddles and a
// DFT table (each counted twice).
reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */
+ sizeof(int) * (FFT_MAXFACTORS) + twiddles_size * 2 +
DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 +
DFT_TABLE_SIZE * 2; /* twiddles */
workspace_size = buffer_size * 2;
// Add one more buffer unless the input is contiguous and its embedded
// dimensions do not exceed n[0] and n[1] / 2 + 1 respectively.
workspace_size += (fft_plan->is_input_contiguous &&
fft_plan->inembed[0] <= fft_plan->n[0] &&
fft_plan->inembed[1] <= fft_plan->n[1] / 2 + 1)
? 0
: buffer_size;
// Add one more buffer when the output is not contiguous.
workspace_size += (fft_plan->is_output_contiguous) ? 0 : buffer_size;
}

// Extra buffer whenever the embedded input dimensions differ from the FFT
// lengths (sibling allocators in this file label this one input_pad_addr).
// NOTE(review): this compares inembed[1] against n[1], while the contiguity
// test above uses n[1] / 2 + 1 -- confirm which bound the executor expects.
if (fft_plan->n[0] != fft_plan->inembed[0] ||
fft_plan->n[1] != fft_plan->inembed[1]) {
workspace_size += buffer_size;
}
fft_plan->workspace_size = workspace_size;
fft_plan->reservespace_size = reservespace_size;

return MLUOP_STATUS_SUCCESS;
}
mluOpStatus_t MLUOP_WIN_API mluOpAllocateRFFT2D(
mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
mluOpTensorDescriptor_t input_desc, mluOpTensorDescriptor_t output_desc,
const int _n0, const int _n1) {
const int n0_ori, const int n1_ori) {
const std::string make_plan_api = "[mluOpAllocateRFFT2D]";
size_t workspace_size = 0, reservespace_size = 0;

Expand All @@ -1807,27 +1858,32 @@ mluOpStatus_t MLUOP_WIN_API mluOpAllocateRFFT2D(
: mluOpDataTypeBytes(in_c_dtype);

int batch = fft_plan->batch;
size_t buffer_size = batch * complex_dtype_size * _n0 * _n1;
size_t buffer_size = batch * complex_dtype_size * n0_ori * n1_ori;

size_t twiddles_size = complex_dtype_size * _n0;
size_t twiddles_size_2d = complex_dtype_size * _n1;
size_t twiddles_size = complex_dtype_size * n0_ori;
size_t twiddles_size_2d = complex_dtype_size * n1_ori;

if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) {
reservespace_size = complex_dtype_size * _n0 * _n0 * 2 +
complex_dtype_size * _n1 * _n1 * 2; /* DFT matrix */
workspace_size = complex_dtype_size * _n1 * _n0 * batch * 6;
reservespace_size =
complex_dtype_size * n0_ori * n0_ori * 2 +
complex_dtype_size * n1_ori * n1_ori * 2; /* DFT matrix */
workspace_size = complex_dtype_size * n1_ori * n0_ori * batch * 6;
} else if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) {
reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */
+ sizeof(int) * (FFT_MAXFACTORS) + twiddles_size * 2 +
DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 +
DFT_TABLE_SIZE * 2; /* twiddles */
workspace_size = buffer_size * 2;
workspace_size += (fft_plan->is_input_contiguous) ? 0 : buffer_size;
workspace_size += (fft_plan->is_input_contiguous &&
fft_plan->inembed[0] <= fft_plan->n[0] &&
fft_plan->inembed[1] <= fft_plan->n[1])
? 0
: buffer_size;
workspace_size += (fft_plan->is_output_contiguous) ? 0 : buffer_size;
}

if (fft_plan->n[0] > fft_plan->inembed[0] ||
fft_plan->n[1] > fft_plan->inembed[1]) {
if (fft_plan->n[0] != fft_plan->inembed[0] ||
fft_plan->n[1] != fft_plan->inembed[1]) {
workspace_size += buffer_size;
}
fft_plan->workspace_size = workspace_size;
Expand All @@ -1846,6 +1902,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanC2C1D(
const int rank, const int *n) {
fft_plan->is_batch_contiguous =
(fft_plan->idist == 1 && fft_plan->odist == 1 &&
fft_plan->inembed[0] == fft_plan->n[0] &&
fft_plan->onembed[0] == fft_plan->n[0] &&
fft_plan->istride == fft_plan->batch &&
fft_plan->ostride == fft_plan->batch) &&
(fft_plan->n[0] == fft_plan->inembed[0]);
Expand Down Expand Up @@ -2221,7 +2279,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanC2R2D(
fft_plan->fft_strategy = CNFFT_FUNC_TWO_LEVEL_STOCKHAM;
}

mluOpAllocateRFFT2D(handle, fft_plan, input_desc, output_desc, n[0], n[1]);
mluOpAllocateIRFFT2D(handle, fft_plan, input_desc, output_desc, n[0], n[1]);

if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) {
switch (fft_plan->fft_type) {
Expand Down Expand Up @@ -2394,6 +2452,12 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanMany(
fft_plan->inembed[i] = input_desc->dims[fft_plan->idim - rank + i];
fft_plan->onembed[i] = output_desc->dims[fft_plan->odim - rank + i];
}
for (auto i = 0; i < fft_plan->idim; i++) {
fft_plan->in_stride[i] = input_desc->strides[i];
}
for (auto i = 0; i < fft_plan->odim; i++) {
fft_plan->out_stride[i] = output_desc->strides[i];
}
if (fft_plan->idim == rank + 1) {
fft_plan->idist = input_desc->strides[0];
fft_plan->odist = output_desc->strides[0];
Expand Down
30 changes: 17 additions & 13 deletions kernels/fft/fft.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ struct cnfftButterflyAddrs {
int *factors;
int *factors_2d;
void *input_pad_addr;
void *input_copy_workspace_addr;
void *output_copy_workspace_addr;
};
struct mluOpFFTStruct {
int rank; // rank of FFT
Expand All @@ -193,24 +195,26 @@ struct mluOpFFTStruct {
int inum; // element num of input tensor
int istride; // distance between two successive input elements in the
// innermost dimension
int idist; // distance between the first element of two consecutive signals
// in a batch of the input data
int odim; // the dimension size of output tensor
int in_stride[FFT_DIM_MAX + 1];
int idist; // distance between the first element of two consecutive signals
// in a batch of the input data
int odim; // the dimension size of output tensor
int onembed[FFT_DIM_MAX]; // Pointer of size rank that indicates the storage
// dimensions of the output data in memory
int onum; // element num of output tensor
int ostride; // distance between two successive output elements in the
// innermost dimension
int odist; // distance between the first element of two consecutive signals
// in a batch of the output data
int batch; // batch size for this transform
int L; // n = L * 2^m, L size for this transform
int m; // n = L * 2^m, m size for this transform
int s; // The size that can be put down on NRAM: L * 2^s, only used by
// Cooley-Tukey algorithm
int L_sub; // The size that can be put down on NRAM: L_sub * 2^m, only used
// by Stockham algorithm
int prime; // wether fft1d'size contains a prime number > 64
int out_stride[FFT_DIM_MAX + 1];
int odist; // distance between the first element of two consecutive signals
// in a batch of the output data
int batch; // batch size for this transform
int L; // n = L * 2^m, L size for this transform
int m; // n = L * 2^m, m size for this transform
int s; // The size that can be put down on NRAM: L * 2^s, only used by
// Cooley-Tukey algorithm
int L_sub; // The size that can be put down on NRAM: L_sub * 2^m, only used
// by Stockham algorithm
int prime; // wether fft1d'size contains a prime number > 64
bool is_input_contiguous;
bool is_output_contiguous;
bool is_batch_contiguous;
Expand Down
Loading

0 comments on commit 4fdd0b4

Please sign in to comment.