Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Fix](mluOpExecFFT): fix core dump, scale factor and one point comput… #1174

Merged
merged 1 commit into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
260 changes: 145 additions & 115 deletions kernels/fft/c2c_fft/c2c_fft_host.cpp

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions kernels/fft/common/fft_basic_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -495,10 +495,10 @@ mluOpStatus_t fftGetBatchMatMulBcastWorkspaceSize(
cnnlMatMulHeuristicResult_t heuristic_result;
CALL_CNNL(cnnlCreateMatMulHeuristicResult(&heuristic_result));
int requested_algo_count = 1, return_algo_count = 0;
cnnlGetBatchMatMulAlgoHeuristic(
cnnlGetBatchMatMulExAlgoHeuristic(
cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL,
requested_algo_count, &heuristic_result, &return_algo_count);
cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size);
cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size);
// destroy descriptor
// destroy cnnl descriptor
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc);
Expand Down Expand Up @@ -595,20 +595,20 @@ mluOpStatus_t fftBatchMatMulBcast(
alpha = 1.0;
beta = 0.0;
int requested_algo_count = 1, return_algo_count = 0;
cnnlGetBatchMatMulAlgoHeuristic(
cnnlGetBatchMatMulExAlgoHeuristic(
cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL,
requested_algo_count, &heuristic_result, &return_algo_count);
cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size);
cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size);
if (workspace_size > 0) {
CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size));
} else {
CNRT_CHECK(cnrtMalloc((void **)&workspace, m * n * sizeof(float)));
}

CALL_CNNL(cnnlBatchMatMulBCast_v2(cnnl_handle, bmm_bcast_desc, algo, &alpha,
cnnl_a_desc, a_ptr, cnnl_b_desc, b_ptr,
&beta, cnnl_c_desc, c_ptr,
(void *)workspace, workspace_size));
CALL_CNNL(cnnlBatchMatMulEx(cnnl_handle, bmm_bcast_desc, algo, &alpha,
cnnl_a_desc, a_ptr, cnnl_b_desc, b_ptr,
&beta, cnnl_c_desc, c_ptr,
(void *)workspace, workspace_size));
// destroy descriptor
// destroy cnnl descriptor
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc);
Expand Down
3 changes: 2 additions & 1 deletion kernels/fft/common/fft_common_kernels.mlu
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,8 @@ __mlu_func__ void selectVec(float *src_addr, int32_t *offset_int_addr,
__asm__ volatile(
"gather.clean.nram.nram.nram.b32.u32 "
"[%[dst]], [%[src]], [%[offset]], %[data_num];\n\t" ::[dst] "r"(dst_addr),
[src] "r"(src_addr), [offset] "r"(offset_int_addr), [data_num] "r"(deal_size));
[ src ] "r"(src_addr), [ offset ] "r"(offset_int_addr),
[ data_num ] "r"(deal_size));
#else
for (auto i = 0; i < deal_size; i++) {
dst_addr[i] = src_addr[offset_int_addr[i]];
Expand Down
114 changes: 89 additions & 25 deletions kernels/fft/fft.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1657,7 +1657,7 @@ mluOpAllocateC2C1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
fft_plan->is_batch_contiguous)
? 0
: buffer_size;
if (fft_plan->n[0] > fft_plan->inembed[0]) {
if (fft_plan->n[0] != fft_plan->inembed[0]) {
workspace_size += buffer_size;
}
size_t twiddles_size = in_c_dtype_size * nfft * 2;
Expand Down Expand Up @@ -1701,7 +1701,7 @@ mluOpAllocateR2C1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */
+ twiddles_size * 2 + DFT_TABLE_SIZE * 2; /* twiddles */

if (fft_plan->n[0] > fft_plan->inembed[0]) {
if (fft_plan->n[0] != fft_plan->inembed[0]) {
workspace_size += buffer_size; // input_pad_addr
}
fft_plan->workspace_size = workspace_size;
Expand All @@ -1721,32 +1721,36 @@ mluOpStatus_t MLUOP_WIN_API mluOpAllocateC2C2D(
size_t in_c_dtype_size = mluOpDataTypeBytes(in_c_dtype);

int batch = fft_plan->batch;
const int _n0 = fft_plan->n[0];
const int _n1 = fft_plan->n[1];
const int n0_ori = fft_plan->n[0];
const int n1_ori = fft_plan->n[1];

size_t buffer_size = batch * in_c_dtype_size * _n0 * _n1;
size_t buffer_size = batch * in_c_dtype_size * n0_ori * n1_ori;

size_t twiddles_size = in_c_dtype_size * _n0;
size_t twiddles_size_2d = in_c_dtype_size * _n1;
size_t twiddles_size = in_c_dtype_size * n0_ori;
size_t twiddles_size_2d = in_c_dtype_size * n1_ori;

if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) {
reservespace_size =
(in_c_dtype_size * _n0 * _n0 + in_c_dtype_size * _n1 * _n1) *
2; /* DFT matrix */
reservespace_size = (in_c_dtype_size * n0_ori * n0_ori +
in_c_dtype_size * n1_ori * n1_ori) *
2; /* DFT matrix */
workspace_size = buffer_size * 6;
} else if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) {
reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */
+ sizeof(int) * (FFT_MAXFACTORS) + twiddles_size * 2 +
DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 +
DFT_TABLE_SIZE * 2; /* twiddles */
workspace_size = buffer_size * 2;
workspace_size += (fft_plan->is_input_contiguous) ? 0 : buffer_size;
workspace_size += (fft_plan->is_input_contiguous &&
fft_plan->inembed[0] <= fft_plan->n[0] &&
fft_plan->inembed[1] <= fft_plan->n[1])
? 0
: buffer_size;
workspace_size += (fft_plan->is_output_contiguous) ? 0 : buffer_size;
}

fft_plan->workspace_size = workspace_size;
if (fft_plan->n[0] > fft_plan->inembed[0] ||
fft_plan->n[1] > fft_plan->inembed[1]) {
if (fft_plan->n[0] != fft_plan->inembed[0] ||
fft_plan->n[1] != fft_plan->inembed[1]) {
fft_plan->workspace_size = workspace_size + buffer_size; // input_pad_addr
}
fft_plan->reservespace_size = reservespace_size;
Expand Down Expand Up @@ -1783,19 +1787,66 @@ mluOpAllocateC2R1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */
+ twiddles_size * 2 + DFT_TABLE_SIZE * 2; /* twiddles */

if (fft_plan->n[0] > fft_plan->inembed[0]) {
if (fft_plan->n[0] != fft_plan->inembed[0]) {
workspace_size += buffer_size; // input_pad_addr
}
fft_plan->workspace_size = workspace_size;
fft_plan->reservespace_size = reservespace_size;

return MLUOP_STATUS_SUCCESS;
}
// Sizes the workspace and reserve space of a 2D inverse real FFT plan and
// records both byte counts in `fft_plan`.
//
// Reads from `fft_plan`: input_dtype, output_dtype, batch, fft_strategy,
// is_input_contiguous, is_output_contiguous, n[0..1] and inembed[0..1].
// Writes to `fft_plan`: workspace_size and reservespace_size.
//
// `handle`, `input_desc` and `output_desc` are not used by this function.
// `n0_ori` / `n1_ori` are the two transform lengths used for sizing.
// Always returns MLUOP_STATUS_SUCCESS. For a strategy other than the two
// handled below, the strategy-dependent parts of both sizes stay at zero.
mluOpStatus_t MLUOP_WIN_API mluOpAllocateIRFFT2D(
    mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
    mluOpTensorDescriptor_t input_desc, mluOpTensorDescriptor_t output_desc,
    const int n0_ori, const int n1_ori) {
  const std::string make_plan_api = "[mluOpAllocateIRFFT2D]";

  // Size every buffer with the wider of the input / output element types.
  const size_t out_elem_bytes = mluOpDataTypeBytes(fft_plan->output_dtype);
  const size_t in_elem_bytes = mluOpDataTypeBytes(fft_plan->input_dtype);
  size_t complex_dtype_size = in_elem_bytes;
  if (out_elem_bytes > in_elem_bytes) {
    complex_dtype_size = out_elem_bytes;
  }

  const int batch = fft_plan->batch;

  // Bytes of one full batch * n0 * n1 buffer.
  const size_t buffer_size = batch * complex_dtype_size * n0_ori * n1_ori;

  size_t reserve_bytes = 0;
  size_t work_bytes = 0;

  if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) {
    /* DFT matrix: one n0 x n0 and one n1 x n1 table, each counted twice */
    reserve_bytes = complex_dtype_size * n0_ori * n0_ori * 2 +
                    complex_dtype_size * n1_ori * n1_ori * 2;
    work_bytes = buffer_size * 6;
  } else if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) {
    const size_t twiddles_size = complex_dtype_size * n0_ori;
    const size_t twiddles_size_2d = complex_dtype_size * n1_ori;

    /* factors */
    reserve_bytes = sizeof(int) * (FFT_MAXFACTORS);
    reserve_bytes += sizeof(int) * (FFT_MAXFACTORS);
    /* twiddles and DFT tables, once per transform dimension */
    reserve_bytes += twiddles_size * 2;
    reserve_bytes += DFT_TABLE_SIZE * 2;
    reserve_bytes += twiddles_size_2d * 2;
    reserve_bytes += DFT_TABLE_SIZE * 2;

    work_bytes = buffer_size * 2;

    // An extra input-sized buffer is budgeted unless the input is contiguous
    // and its stored extents do not exceed n[0] x (n[1] / 2 + 1).
    const bool input_needs_no_copy =
        fft_plan->is_input_contiguous &&
        fft_plan->inembed[0] <= fft_plan->n[0] &&
        fft_plan->inembed[1] <= fft_plan->n[1] / 2 + 1;
    if (!input_needs_no_copy) {
      work_bytes += buffer_size;
    }

    // Likewise one extra buffer when the output is not contiguous.
    if (!fft_plan->is_output_contiguous) {
      work_bytes += buffer_size;
    }
  }

  // One more buffer whenever the stored input extents differ from the
  // transform lengths.
  // NOTE(review): inembed[1] is compared with n[1] here, whereas the check
  // above uses n[1] / 2 + 1 — confirm against the code that consumes the
  // workspace before changing either.
  const bool extents_differ = fft_plan->n[0] != fft_plan->inembed[0] ||
                              fft_plan->n[1] != fft_plan->inembed[1];
  if (extents_differ) {
    work_bytes += buffer_size;
  }

  fft_plan->workspace_size = work_bytes;
  fft_plan->reservespace_size = reserve_bytes;

  return MLUOP_STATUS_SUCCESS;
}
mluOpStatus_t MLUOP_WIN_API mluOpAllocateRFFT2D(
mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
mluOpTensorDescriptor_t input_desc, mluOpTensorDescriptor_t output_desc,
const int _n0, const int _n1) {
const int n0_ori, const int n1_ori) {
const std::string make_plan_api = "[mluOpAllocateRFFT2D]";
size_t workspace_size = 0, reservespace_size = 0;

Expand All @@ -1807,27 +1858,32 @@ mluOpStatus_t MLUOP_WIN_API mluOpAllocateRFFT2D(
: mluOpDataTypeBytes(in_c_dtype);

int batch = fft_plan->batch;
size_t buffer_size = batch * complex_dtype_size * _n0 * _n1;
size_t buffer_size = batch * complex_dtype_size * n0_ori * n1_ori;

size_t twiddles_size = complex_dtype_size * _n0;
size_t twiddles_size_2d = complex_dtype_size * _n1;
size_t twiddles_size = complex_dtype_size * n0_ori;
size_t twiddles_size_2d = complex_dtype_size * n1_ori;

if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) {
reservespace_size = complex_dtype_size * _n0 * _n0 * 2 +
complex_dtype_size * _n1 * _n1 * 2; /* DFT matrix */
workspace_size = complex_dtype_size * _n1 * _n0 * batch * 6;
reservespace_size =
complex_dtype_size * n0_ori * n0_ori * 2 +
complex_dtype_size * n1_ori * n1_ori * 2; /* DFT matrix */
workspace_size = complex_dtype_size * n1_ori * n0_ori * batch * 6;
} else if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) {
reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */
+ sizeof(int) * (FFT_MAXFACTORS) + twiddles_size * 2 +
DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 +
DFT_TABLE_SIZE * 2; /* twiddles */
workspace_size = buffer_size * 2;
workspace_size += (fft_plan->is_input_contiguous) ? 0 : buffer_size;
workspace_size += (fft_plan->is_input_contiguous &&
fft_plan->inembed[0] <= fft_plan->n[0] &&
fft_plan->inembed[1] <= fft_plan->n[1])
? 0
: buffer_size;
workspace_size += (fft_plan->is_output_contiguous) ? 0 : buffer_size;
}

if (fft_plan->n[0] > fft_plan->inembed[0] ||
fft_plan->n[1] > fft_plan->inembed[1]) {
if (fft_plan->n[0] != fft_plan->inembed[0] ||
fft_plan->n[1] != fft_plan->inembed[1]) {
workspace_size += buffer_size;
}
fft_plan->workspace_size = workspace_size;
Expand All @@ -1846,6 +1902,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanC2C1D(
const int rank, const int *n) {
fft_plan->is_batch_contiguous =
(fft_plan->idist == 1 && fft_plan->odist == 1 &&
fft_plan->inembed[0] == fft_plan->n[0] &&
fft_plan->onembed[0] == fft_plan->n[0] &&
fft_plan->istride == fft_plan->batch &&
fft_plan->ostride == fft_plan->batch) &&
(fft_plan->n[0] == fft_plan->inembed[0]);
Expand Down Expand Up @@ -2221,7 +2279,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanC2R2D(
fft_plan->fft_strategy = CNFFT_FUNC_TWO_LEVEL_STOCKHAM;
}

mluOpAllocateRFFT2D(handle, fft_plan, input_desc, output_desc, n[0], n[1]);
mluOpAllocateIRFFT2D(handle, fft_plan, input_desc, output_desc, n[0], n[1]);

if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) {
switch (fft_plan->fft_type) {
Expand Down Expand Up @@ -2394,6 +2452,12 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanMany(
fft_plan->inembed[i] = input_desc->dims[fft_plan->idim - rank + i];
fft_plan->onembed[i] = output_desc->dims[fft_plan->odim - rank + i];
}
for (auto i = 0; i < fft_plan->idim; i++) {
fft_plan->in_stride[i] = input_desc->strides[i];
}
for (auto i = 0; i < fft_plan->odim; i++) {
fft_plan->out_stride[i] = output_desc->strides[i];
}
if (fft_plan->idim == rank + 1) {
fft_plan->idist = input_desc->strides[0];
fft_plan->odist = output_desc->strides[0];
Expand Down
30 changes: 17 additions & 13 deletions kernels/fft/fft.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ struct cnfftButterflyAddrs {
int *factors;
int *factors_2d;
void *input_pad_addr;
void *input_copy_workspace_addr;
void *output_copy_workspace_addr;
};
struct mluOpFFTStruct {
int rank; // rank of FFT
Expand All @@ -193,24 +195,26 @@ struct mluOpFFTStruct {
int inum; // element num of input tensor
int istride; // distance between two successive input elements in the
// innermost dimension
int idist; // distance between the first element of two consecutive signals
// in a batch of the input data
int odim; // the dimension size of output tensor
int in_stride[FFT_DIM_MAX + 1];
int idist; // distance between the first element of two consecutive signals
// in a batch of the input data
int odim; // the dimension size of output tensor
int onembed[FFT_DIM_MAX]; // Pointer of size rank that indicates the storage
// dimensions of the output data in memory
int onum; // element num of output tensor
int ostride; // distance between two successive output elements in the
// innermost dimension
int odist; // distance between the first element of two consecutive signals
// in a batch of the output data
int batch; // batch size for this transform
int L; // n = L * 2^m, L size for this transform
int m; // n = L * 2^m, m size for this transform
int s; // The size that can be put down on NRAM: L * 2^s, only used by
// Cooley-Tukey algorithm
int L_sub; // The size that can be put down on NRAM: L_sub * 2^m, only used
// by Stockham algorithm
int prime; // wether fft1d'size contains a prime number > 64
int out_stride[FFT_DIM_MAX + 1];
int odist; // distance between the first element of two consecutive signals
// in a batch of the output data
int batch; // batch size for this transform
int L; // n = L * 2^m, L size for this transform
int m; // n = L * 2^m, m size for this transform
int s; // The size that can be put down on NRAM: L * 2^s, only used by
// Cooley-Tukey algorithm
int L_sub; // The size that can be put down on NRAM: L_sub * 2^m, only used
// by Stockham algorithm
int prime; // wether fft1d'size contains a prime number > 64
bool is_input_contiguous;
bool is_output_contiguous;
bool is_batch_contiguous;
Expand Down
Loading