Skip to content

Commit

Permalink
[Fix](mluOpExecFFT): fix core dump, scale factor and one point compute error
Browse files Browse the repository at this point in the history
  • Loading branch information
niyuming committed Dec 5, 2024
1 parent 1725280 commit 7ec2c6b
Show file tree
Hide file tree
Showing 12 changed files with 348 additions and 682 deletions.
78 changes: 43 additions & 35 deletions kernels/fft/c2c_fft/c2c_fft_host.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1079,7 +1079,9 @@ static void configureFFT2dWorkspaceAddrs(mluOpHandle_t handle,
fft_plan->mlu_addrs.buffer_buf = (uint8_t *)workspace + offset;
offset += batch * in_c_dtype_size * _n0 * _n1 * 2;

if (fft_plan->is_input_contiguous) {
if ((fft_plan->is_input_contiguous &&
fft_plan->inembed[0] <= fft_plan->n[0] &&
fft_plan->inembed[1] <= fft_plan->n[1])) {
fft_plan->mlu_addrs.input = input;
} else {
fft_plan->mlu_addrs.input = (uint8_t *)workspace + offset;
Expand Down Expand Up @@ -1180,9 +1182,11 @@ static mluOpStatus_t makeFFT2dContiguousInput(mluOpHandle_t handle,
int64_t dims[in_dim_num] = {fft_plan->batch,
std::min(fft_plan->n[0], fft_plan->inembed[0]),
std::min(fft_plan->n[1], fft_plan->inembed[1])};
int64_t strides[in_dim_num] = {fft_plan->idist,
(fft_plan->istride * fft_plan->inembed[1]),
fft_plan->istride};

int64_t strides[3]; // in_dim_num
for (int i = 0; i < in_dim_num; i++) {
strides[i] = fft_plan->in_stride[i];
}
status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY,
fft_plan->input_dtype, in_dim_num,
dims, strides);
Expand Down Expand Up @@ -1779,17 +1783,8 @@ static mluOpStatus_t makeFFT1dContiguousOutput(mluOpHandle_t handle,
cnnl_copy_src_desc);
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(copy_dst_desc,
cnnl_copy_dst_desc);
size_t workspace_size = 0;
CALL_CNNL(cnnlGetCopyWorkspaceSize(cnnl_handle, cnnl_copy_src_desc,
cnnl_copy_dst_desc, &workspace_size));

void *workspace = nullptr;
if (workspace_size > 0) {
CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size));
}
CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_copy_src_desc, copy_src_addr,
cnnl_copy_dst_desc, output, workspace,
workspace_size));
cnnl_copy_dst_desc, output, NULL, 0));
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_src_desc);
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_dst_desc);
DESTROY_CNNL_HANDLE(cnnl_handle);
Expand Down Expand Up @@ -1818,9 +1813,10 @@ static mluOpStatus_t makeFFT2dContiguousOutput(mluOpHandle_t handle,
const int out_dim_num = 3;
int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0],
fft_plan->n[1]};
int64_t strides[out_dim_num] = {fft_plan->odist,
fft_plan->ostride * fft_plan->onembed[1],
fft_plan->ostride};
int64_t strides[3]; // out_dim_num
for (int i = 0; i < out_dim_num; i++) {
strides[i] = fft_plan->out_stride[i];
}
status = mluOpSetTensorDescriptor_v2(copy_src_desc, MLUOP_LAYOUT_ARRAY,
out_c_dtype, out_dim_num, dims);
INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);
Expand All @@ -1838,18 +1834,8 @@ static mluOpStatus_t makeFFT2dContiguousOutput(mluOpHandle_t handle,
cnnl_copy_src_desc);
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(copy_dst_desc,
cnnl_copy_dst_desc);

size_t workspace_size = 0;
CALL_CNNL(cnnlGetCopyWorkspaceSize(cnnl_handle, cnnl_copy_src_desc,
cnnl_copy_dst_desc, &workspace_size));

void *workspace = nullptr;
if (workspace_size > 0) {
CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size));
}
CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_copy_src_desc, copy_src_addr,
cnnl_copy_dst_desc, output, workspace,
workspace_size));
cnnl_copy_dst_desc, output, NULL, 0));

DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_src_desc);
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_dst_desc);
Expand Down Expand Up @@ -2053,7 +2039,29 @@ mluOpStatus_t execFFT2d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan,
fft_plan->mlu_addrs.input = fft_plan->mlu_addrs.input_pad_addr;
}

status = execFFTc2c2d(handle, fft_plan, scale_factor, direction);
if (fft_plan->n[0] == 1 && fft_plan->n[1] == 1) {
mluOpTensorDescriptor_t c_desc = nullptr;
status = mluOpCreateTensorDescriptor(&c_desc);
const int out_dim_num = 3;
int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0],
fft_plan->n[1]};
status = mluOpSetTensorDescriptor_v2(c_desc, MLUOP_LAYOUT_ARRAY,
fft_plan->output_dtype, 2, dims);
status = mluOpSetTensorDescriptorOnchipDataType(c_desc,
fft_plan->execution_dtype);

DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle,
cnnl_handle); // convert to cnnl_handle

DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(c_desc, cnnl_output_desc);
CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_output_desc,
fft_plan->mlu_addrs.input, cnnl_output_desc,
fft_plan->mlu_addrs.output, NULL, 0));
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc);
DESTROY_CNNL_HANDLE(cnnl_handle);
} else {
status = execFFTc2c2d(handle, fft_plan, scale_factor, direction);
}

INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);

Expand Down Expand Up @@ -2296,22 +2304,22 @@ mluOpStatus_t computeFFT2dMatMulRow(mluOpHandle_t handle,
int requested_algo_count = 1, return_algo_count = 0;
float *workspace;
size_t workspace_size;
cnnlGetBatchMatMulAlgoHeuristic(
cnnlGetBatchMatMulExAlgoHeuristic(
cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL,
requested_algo_count, &heuristic_result, &return_algo_count);

cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size);
cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size);

if (workspace_size > 0) {
CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size));
} else {
CNRT_CHECK(cnrtMalloc((void **)&workspace, m * n * sizeof(float)));
}

CALL_CNNL(cnnlBatchMatMulBCast_v2(cnnl_handle, bmm_bcast_desc, algo, &alpha,
cnnl_a_desc, dft_matrix_addr, cnnl_b_desc,
in_addr, &beta, cnnl_c_desc, out_addr,
(void *)workspace, workspace_size));
CALL_CNNL(cnnlBatchMatMulEx(cnnl_handle, bmm_bcast_desc, algo, &alpha,
cnnl_a_desc, dft_matrix_addr, cnnl_b_desc,
in_addr, &beta, cnnl_c_desc, out_addr,
(void *)workspace, workspace_size));
// destroy cnnl descriptor
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc);
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc);
Expand Down
16 changes: 8 additions & 8 deletions kernels/fft/common/fft_basic_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -488,10 +488,10 @@ mluOpStatus_t fftGetBatchMatMulBcastWorkspaceSize(
cnnlMatMulHeuristicResult_t heuristic_result;
CALL_CNNL(cnnlCreateMatMulHeuristicResult(&heuristic_result));
int requested_algo_count = 1, return_algo_count = 0;
cnnlGetBatchMatMulAlgoHeuristic(
cnnlGetBatchMatMulExAlgoHeuristic(
cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL,
requested_algo_count, &heuristic_result, &return_algo_count);
cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size);
cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size);
// destroy descriptor
// destroy cnnl descriptor
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc);
Expand Down Expand Up @@ -585,20 +585,20 @@ mluOpStatus_t fftBatchMatMulBcast(
alpha = 1.0;
beta = 0.0;
int requested_algo_count = 1, return_algo_count = 0;
cnnlGetBatchMatMulAlgoHeuristic(
cnnlGetBatchMatMulExAlgoHeuristic(
cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL,
requested_algo_count, &heuristic_result, &return_algo_count);
cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size);
cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size);
if (workspace_size > 0) {
CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size));
} else {
CNRT_CHECK(cnrtMalloc((void **)&workspace, m * n * sizeof(float)));
}

CALL_CNNL(cnnlBatchMatMulBCast_v2(cnnl_handle, bmm_bcast_desc, algo, &alpha,
cnnl_a_desc, a_ptr, cnnl_b_desc, b_ptr,
&beta, cnnl_c_desc, c_ptr,
(void *)workspace, workspace_size));
CALL_CNNL(cnnlBatchMatMulEx(cnnl_handle, bmm_bcast_desc, algo, &alpha,
cnnl_a_desc, a_ptr, cnnl_b_desc, b_ptr,
&beta, cnnl_c_desc, c_ptr,
(void *)workspace, workspace_size));
// destroy descriptor
// destroy cnnl descriptor
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc);
Expand Down
3 changes: 2 additions & 1 deletion kernels/fft/common/fft_common_kernels.mlu
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,8 @@ __mlu_func__ void selectVec(float *src_addr, int32_t *offset_int_addr,
__asm__ volatile(
"gather.clean.nram.nram.nram.b32.u32 "
"[%[dst]], [%[src]], [%[offset]], %[data_num];\n\t" ::[dst] "r"(dst_addr),
[src] "r"(src_addr), [offset] "r"(offset_int_addr), [data_num] "r"(deal_size));
[ src ] "r"(src_addr), [ offset ] "r"(offset_int_addr),
[ data_num ] "r"(deal_size));
#else
for (auto i = 0; i < deal_size; i++) {
dst_addr[i] = src_addr[offset_int_addr[i]];
Expand Down
82 changes: 72 additions & 10 deletions kernels/fft/fft.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1657,7 +1657,7 @@ mluOpAllocateC2C1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
fft_plan->is_batch_contiguous)
? 0
: buffer_size;
if (fft_plan->n[0] > fft_plan->inembed[0]) {
if (fft_plan->n[0] != fft_plan->inembed[0]) {
workspace_size += buffer_size;
}
size_t twiddles_size = in_c_dtype_size * nfft * 2;
Expand Down Expand Up @@ -1701,7 +1701,7 @@ mluOpAllocateR2C1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */
+ twiddles_size * 2 + DFT_TABLE_SIZE * 2; /* twiddles */

if (fft_plan->n[0] > fft_plan->inembed[0]) {
if (fft_plan->n[0] != fft_plan->inembed[0]) {
workspace_size += buffer_size; // input_pad_addr
}
fft_plan->workspace_size = workspace_size;
Expand Down Expand Up @@ -1740,13 +1740,17 @@ mluOpStatus_t MLUOP_WIN_API mluOpAllocateC2C2D(
DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 +
DFT_TABLE_SIZE * 2; /* twiddles */
workspace_size = buffer_size * 2;
workspace_size += (fft_plan->is_input_contiguous) ? 0 : buffer_size;
workspace_size += (fft_plan->is_input_contiguous &&
fft_plan->inembed[0] <= fft_plan->n[0] &&
fft_plan->inembed[1] <= fft_plan->n[1])
? 0
: buffer_size;
workspace_size += (fft_plan->is_output_contiguous) ? 0 : buffer_size;
}

fft_plan->workspace_size = workspace_size;
if (fft_plan->n[0] > fft_plan->inembed[0] ||
fft_plan->n[1] > fft_plan->inembed[1]) {
if (fft_plan->n[0] != fft_plan->inembed[0] ||
fft_plan->n[1] != fft_plan->inembed[1]) {
fft_plan->workspace_size = workspace_size + buffer_size; // input_pad_addr
}
fft_plan->reservespace_size = reservespace_size;
Expand Down Expand Up @@ -1783,15 +1787,61 @@ mluOpAllocateC2R1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */
+ twiddles_size * 2 + DFT_TABLE_SIZE * 2; /* twiddles */

if (fft_plan->n[0] > fft_plan->inembed[0]) {
if (fft_plan->n[0] != fft_plan->inembed[0]) {
workspace_size += buffer_size; // input_pad_addr
}
fft_plan->workspace_size = workspace_size;
fft_plan->reservespace_size = reservespace_size;

return MLUOP_STATUS_SUCCESS;
}
// Computes the workspace and reserve-space byte counts for a 2D FFT plan of
// size _n0 x _n1 and stores them in fft_plan->workspace_size and
// fft_plan->reservespace_size.
//
// handle, input_desc and output_desc are accepted but not read here; all
// layout information is taken from fft_plan (batch, dtypes, fft_strategy,
// n, inembed and the contiguity flags).
//
// Always returns MLUOP_STATUS_SUCCESS.
mluOpStatus_t MLUOP_WIN_API mluOpAllocateIRFFT2D(
    mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
    mluOpTensorDescriptor_t input_desc, mluOpTensorDescriptor_t output_desc,
    const int _n0, const int _n1) {
  const std::string make_plan_api = "[mluOpAllocateIRFFT2D]";

  // Size every buffer with the wider of the input and output element types.
  const auto in_elem_bytes = mluOpDataTypeBytes(fft_plan->input_dtype);
  const auto out_elem_bytes = mluOpDataTypeBytes(fft_plan->output_dtype);
  size_t complex_dtype_size = in_elem_bytes;
  if (out_elem_bytes > in_elem_bytes) {
    complex_dtype_size = out_elem_bytes;
  }

  // Bytes for one full batch of _n0 x _n1 elements.
  const size_t buffer_size =
      complex_dtype_size * fft_plan->batch * _n0 * _n1;

  size_t ws_bytes = 0;
  size_t rs_bytes = 0;

  if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) {
    const size_t twiddles_dim0 = complex_dtype_size * _n0;
    const size_t twiddles_dim1 = complex_dtype_size * _n1;

    /* factors (one table per dimension) */
    rs_bytes = sizeof(int) * (FFT_MAXFACTORS) + sizeof(int) * (FFT_MAXFACTORS);
    /* twiddles + DFT table, first dimension */
    rs_bytes += twiddles_dim0 * 2 + DFT_TABLE_SIZE * 2;
    /* twiddles + DFT table, second dimension */
    rs_bytes += twiddles_dim1 * 2 + DFT_TABLE_SIZE * 2;

    // Two buffers are always reserved.
    ws_bytes = buffer_size * 2;

    // One more buffer unless the input is contiguous and its embedding fits
    // within n[0] x (n[1] / 2 + 1).
    const bool input_fits = fft_plan->is_input_contiguous &&
                            fft_plan->inembed[0] <= fft_plan->n[0] &&
                            fft_plan->inembed[1] <= fft_plan->n[1] / 2 + 1;
    if (!input_fits) {
      ws_bytes += buffer_size;
    }

    // One more buffer when the output is not contiguous.
    if (!fft_plan->is_output_contiguous) {
      ws_bytes += buffer_size;
    }
  } else if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) {
    /* DFT matrix */
    rs_bytes = complex_dtype_size * _n0 * _n0 * 2 +
               complex_dtype_size * _n1 * _n1 * 2;
    ws_bytes = complex_dtype_size * _n1 * _n0 * fft_plan->batch * 6;
  }

  // Extra buffer whenever the input embedding differs from the transform
  // size in either dimension.
  // NOTE(review): this test compares inembed[1] with n[1], whereas the
  // in-place test above uses n[1] / 2 + 1 -- confirm this is intended.
  const bool embed_differs = fft_plan->n[0] != fft_plan->inembed[0] ||
                             fft_plan->n[1] != fft_plan->inembed[1];
  if (embed_differs) {
    ws_bytes += buffer_size;
  }

  fft_plan->workspace_size = ws_bytes;
  fft_plan->reservespace_size = rs_bytes;

  return MLUOP_STATUS_SUCCESS;
}
mluOpStatus_t MLUOP_WIN_API mluOpAllocateRFFT2D(
mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
mluOpTensorDescriptor_t input_desc, mluOpTensorDescriptor_t output_desc,
Expand Down Expand Up @@ -1822,12 +1872,16 @@ mluOpStatus_t MLUOP_WIN_API mluOpAllocateRFFT2D(
DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 +
DFT_TABLE_SIZE * 2; /* twiddles */
workspace_size = buffer_size * 2;
workspace_size += (fft_plan->is_input_contiguous) ? 0 : buffer_size;
workspace_size += (fft_plan->is_input_contiguous &&
fft_plan->inembed[0] <= fft_plan->n[0] &&
fft_plan->inembed[1] <= fft_plan->n[1])
? 0
: buffer_size;
workspace_size += (fft_plan->is_output_contiguous) ? 0 : buffer_size;
}

if (fft_plan->n[0] > fft_plan->inembed[0] ||
fft_plan->n[1] > fft_plan->inembed[1]) {
if (fft_plan->n[0] != fft_plan->inembed[0] ||
fft_plan->n[1] != fft_plan->inembed[1]) {
workspace_size += buffer_size;
}
fft_plan->workspace_size = workspace_size;
Expand All @@ -1846,6 +1900,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanC2C1D(
const int rank, const int *n) {
fft_plan->is_batch_contiguous =
(fft_plan->idist == 1 && fft_plan->odist == 1 &&
fft_plan->inembed[0] == fft_plan->n[0] &&
fft_plan->onembed[0] == fft_plan->n[0] &&
fft_plan->istride == fft_plan->batch &&
fft_plan->ostride == fft_plan->batch) &&
(fft_plan->n[0] == fft_plan->inembed[0]);
Expand Down Expand Up @@ -2221,7 +2277,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanC2R2D(
fft_plan->fft_strategy = CNFFT_FUNC_TWO_LEVEL_STOCKHAM;
}

mluOpAllocateRFFT2D(handle, fft_plan, input_desc, output_desc, n[0], n[1]);
mluOpAllocateIRFFT2D(handle, fft_plan, input_desc, output_desc, n[0], n[1]);

if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) {
switch (fft_plan->fft_type) {
Expand Down Expand Up @@ -2394,6 +2450,12 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanMany(
fft_plan->inembed[i] = input_desc->dims[fft_plan->idim - rank + i];
fft_plan->onembed[i] = output_desc->dims[fft_plan->odim - rank + i];
}
for (auto i = 0; i < fft_plan->idim; i++) {
fft_plan->in_stride[i] = input_desc->strides[i];
}
for (auto i = 0; i < fft_plan->odim; i++) {
fft_plan->out_stride[i] = output_desc->strides[i];
}
if (fft_plan->idim == rank + 1) {
fft_plan->idist = input_desc->strides[0];
fft_plan->odist = output_desc->strides[0];
Expand Down
30 changes: 17 additions & 13 deletions kernels/fft/fft.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ struct cnfftButterflyAddrs {
int *factors;
int *factors_2d;
void *input_pad_addr;
void *input_copy_workspace_addr;
void *output_copy_workspace_addr;
};
struct mluOpFFTStruct {
int rank; // rank of FFT
Expand All @@ -193,24 +195,26 @@ struct mluOpFFTStruct {
int inum; // element num of input tensor
int istride; // distance between two successive input elements in the
// innermost dimension
int idist; // distance between the first element of two consecutive signals
// in a batch of the input data
int odim; // the dimension size of output tensor
int in_stride[FFT_DIM_MAX + 1];
int idist; // distance between the first element of two consecutive signals
// in a batch of the input data
int odim; // the dimension size of output tensor
int onembed[FFT_DIM_MAX]; // Pointer of size rank that indicates the storage
// dimensions of the output data in memory
int onum; // element num of output tensor
int ostride; // distance between two successive output elements in the
// innermost dimension
int odist; // distance between the first element of two consecutive signals
// in a batch of the output data
int batch; // batch size for this transform
int L; // n = L * 2^m, L size for this transform
int m; // n = L * 2^m, m size for this transform
int s; // The size that can be put down on NRAM: L * 2^s, only used by
// Cooley-Tukey algorithm
int L_sub; // The size that can be put down on NRAM: L_sub * 2^m, only used
// by Stockham algorithm
int prime; // wether fft1d'size contains a prime number > 64
int out_stride[FFT_DIM_MAX + 1];
int odist; // distance between the first element of two consecutive signals
// in a batch of the output data
int batch; // batch size for this transform
int L; // n = L * 2^m, L size for this transform
int m; // n = L * 2^m, m size for this transform
int s; // The size that can be put down on NRAM: L * 2^s, only used by
// Cooley-Tukey algorithm
int L_sub; // The size that can be put down on NRAM: L_sub * 2^m, only used
// by Stockham algorithm
int prime; // wether fft1d'size contains a prime number > 64
bool is_input_contiguous;
bool is_output_contiguous;
bool is_batch_contiguous;
Expand Down
Loading

0 comments on commit 7ec2c6b

Please sign in to comment.