Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Fix](mluOpExecFFT): fix fft bug. #1130

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 45 additions & 8 deletions kernels/fft/c2c_fft/c2c_fft_host.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1079,7 +1079,9 @@ static void configureFFT2dWorkspaceAddrs(mluOpHandle_t handle,
fft_plan->mlu_addrs.buffer_buf = (uint8_t *)workspace + offset;
offset += batch * in_c_dtype_size * _n0 * _n1 * 2;

if (fft_plan->is_input_contiguous) {
if ((fft_plan->is_input_contiguous &&
fft_plan->inembed[0] <= fft_plan->n[0] &&
fft_plan->inembed[1] <= fft_plan->n[1])) {
fft_plan->mlu_addrs.input = input;
} else {
fft_plan->mlu_addrs.input = (uint8_t *)workspace + offset;
Expand Down Expand Up @@ -1180,9 +1182,11 @@ static mluOpStatus_t makeFFT2dContiguousInput(mluOpHandle_t handle,
int64_t dims[in_dim_num] = {fft_plan->batch,
std::min(fft_plan->n[0], fft_plan->inembed[0]),
std::min(fft_plan->n[1], fft_plan->inembed[1])};
int64_t strides[in_dim_num] = {fft_plan->idist,
(fft_plan->istride * fft_plan->inembed[1]),
fft_plan->istride};

int64_t strides[3];
for (int i = 0; i < in_dim_num; i++) {
strides[i] = fft_plan->in_stride[i];
}
status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY,
fft_plan->input_dtype, in_dim_num,
dims, strides);
Expand Down Expand Up @@ -1818,9 +1822,10 @@ static mluOpStatus_t makeFFT2dContiguousOutput(mluOpHandle_t handle,
const int out_dim_num = 3;
int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0],
fft_plan->n[1]};
int64_t strides[out_dim_num] = {fft_plan->odist,
fft_plan->ostride * fft_plan->onembed[1],
fft_plan->ostride};
int64_t strides[3];
for (int i = 0; i < out_dim_num; i++) {
strides[i] = fft_plan->out_stride[i];
}
status = mluOpSetTensorDescriptor_v2(copy_src_desc, MLUOP_LAYOUT_ARRAY,
out_c_dtype, out_dim_num, dims);
INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);
Expand Down Expand Up @@ -2053,7 +2058,39 @@ mluOpStatus_t execFFT2d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan,
fft_plan->mlu_addrs.input = fft_plan->mlu_addrs.input_pad_addr;
}

status = execFFTc2c2d(handle, fft_plan, scale_factor, direction);
if (fft_plan->n[0] == 1 && fft_plan->n[1] == 1) {
mluOpTensorDescriptor_t c_desc = nullptr;
status = mluOpCreateTensorDescriptor(&c_desc);
const int out_dim_num = 3;
int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0],
fft_plan->n[1]};
status = mluOpSetTensorDescriptor_v2(c_desc, MLUOP_LAYOUT_ARRAY,
fft_plan->output_dtype, 2, dims);
status = mluOpSetTensorDescriptorOnchipDataType(c_desc,
fft_plan->execution_dtype);

DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle,
cnnl_handle); // convert to cnnl_handle

DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(c_desc, cnnl_output_desc);

size_t workspace_size = 0;
CALL_CNNL(cnnlGetCopyWorkspaceSize(cnnl_handle, cnnl_output_desc,
cnnl_output_desc, &workspace_size));
void *workspace = nullptr;
if (workspace_size > 0) {
CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size));
}

CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_output_desc,
fft_plan->mlu_addrs.input, cnnl_output_desc,
fft_plan->mlu_addrs.output, workspace,
workspace_size));
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc);
DESTROY_CNNL_HANDLE(cnnl_handle);
} else {
status = execFFTc2c2d(handle, fft_plan, scale_factor, direction);
}

INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);

Expand Down
3 changes: 2 additions & 1 deletion kernels/fft/common/fft_common_kernels.mlu
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,8 @@ __mlu_func__ void selectVec(float *src_addr, int32_t *offset_int_addr,
__asm__ volatile(
"gather.clean.nram.nram.nram.b32.u32 "
"[%[dst]], [%[src]], [%[offset]], %[data_num];\n\t" ::[dst] "r"(dst_addr),
[src] "r"(src_addr), [offset] "r"(offset_int_addr), [data_num] "r"(deal_size));
[ src ] "r"(src_addr), [ offset ] "r"(offset_int_addr),
[ data_num ] "r"(deal_size));
#else
for (auto i = 0; i < deal_size; i++) {
dst_addr[i] = src_addr[offset_int_addr[i]];
Expand Down
68 changes: 65 additions & 3 deletions kernels/fft/fft.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1740,7 +1740,11 @@ mluOpStatus_t MLUOP_WIN_API mluOpAllocateC2C2D(
DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 +
DFT_TABLE_SIZE * 2; /* twiddles */
workspace_size = buffer_size * 2;
workspace_size += (fft_plan->is_input_contiguous) ? 0 : buffer_size;
workspace_size += (fft_plan->is_input_contiguous &&
fft_plan->inembed[0] <= fft_plan->n[0] &&
fft_plan->inembed[1] <= fft_plan->n[1])
? 0
: buffer_size;
workspace_size += (fft_plan->is_output_contiguous) ? 0 : buffer_size;
}

Expand Down Expand Up @@ -1791,7 +1795,53 @@ mluOpAllocateC2R1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,

return MLUOP_STATUS_SUCCESS;
}
// Computes the workspace and reserve-space byte counts for a 2D inverse
// real FFT plan and records them in fft_plan->workspace_size and
// fft_plan->reservespace_size.
//
// Parameters:
//   handle, input_desc, output_desc - accepted for signature parity; not
//                                     referenced by this function.
//   fft_plan - plan whose dtypes, batch, fft_strategy, n[], inembed[] and
//              contiguity flags are read, and whose size fields are written.
//   _n0, _n1 - the two transform lengths used to size the buffers.
//
// Returns MLUOP_STATUS_SUCCESS unconditionally.
mluOpStatus_t MLUOP_WIN_API mluOpAllocateIRFFT2D(
    mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
    mluOpTensorDescriptor_t input_desc, mluOpTensorDescriptor_t output_desc,
    const int _n0, const int _n1) {
  const std::string make_plan_api = "[mluOpAllocateIRFFT2D]";

  // Size every buffer with the wider of the output and input element types.
  const mluOpDataType_t dst_dtype = fft_plan->output_dtype;
  const mluOpDataType_t src_dtype = fft_plan->input_dtype;
  const auto dst_bytes = mluOpDataTypeBytes(dst_dtype);
  const auto src_bytes = mluOpDataTypeBytes(src_dtype);
  size_t elem_bytes = src_bytes;
  if (dst_bytes > src_bytes) {
    elem_bytes = dst_bytes;
  }

  // Promote the integer extents once so all products are done in size_t.
  const size_t batch_count = static_cast<size_t>(fft_plan->batch);
  const size_t len0 = static_cast<size_t>(_n0);
  const size_t len1 = static_cast<size_t>(_n1);

  // One full batch * _n0 * _n1 buffer, and one twiddle row per dimension.
  const size_t plane_bytes = batch_count * elem_bytes * len0 * len1;
  const size_t twiddle0_bytes = elem_bytes * len0;
  const size_t twiddle1_bytes = elem_bytes * len1;

  size_t reserve_bytes = 0;
  size_t work_bytes = 0;

  if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) {
    /* DFT matrix */
    reserve_bytes = elem_bytes * len0 * len0 * 2;
    reserve_bytes += elem_bytes * len1 * len1 * 2;
    work_bytes = elem_bytes * len1 * len0 * batch_count * 6;
  } else if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) {
    /* factors */
    reserve_bytes = sizeof(int) * (FFT_MAXFACTORS);
    reserve_bytes += sizeof(int) * (FFT_MAXFACTORS);
    /* twiddles */
    reserve_bytes += twiddle0_bytes * 2;
    reserve_bytes += DFT_TABLE_SIZE * 2;
    reserve_bytes += twiddle1_bytes * 2;
    reserve_bytes += DFT_TABLE_SIZE * 2;

    work_bytes = plane_bytes * 2;

    // An extra buffer is reserved unless the input is contiguous and its
    // storage extents fit within n[0] x (n[1] / 2 + 1).
    const bool input_needs_staging =
        !fft_plan->is_input_contiguous ||
        fft_plan->inembed[0] > fft_plan->n[0] ||
        fft_plan->inembed[1] > fft_plan->n[1] / 2 + 1;
    if (input_needs_staging) {
      work_bytes += plane_bytes;
    }
    // Likewise one extra buffer when the output is not contiguous.
    if (!fft_plan->is_output_contiguous) {
      work_bytes += plane_bytes;
    }
  }

  // One more buffer when either transform length exceeds the stored input
  // extent.
  // NOTE(review): this compares against n[1], while the staging test above
  // uses n[1] / 2 + 1 - confirm the asymmetry is intended.
  const bool input_shorter_than_transform =
      fft_plan->n[0] > fft_plan->inembed[0] ||
      fft_plan->n[1] > fft_plan->inembed[1];
  if (input_shorter_than_transform) {
    work_bytes += plane_bytes;
  }

  fft_plan->workspace_size = work_bytes;
  fft_plan->reservespace_size = reserve_bytes;

  return MLUOP_STATUS_SUCCESS;
}
mluOpStatus_t MLUOP_WIN_API mluOpAllocateRFFT2D(
mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
mluOpTensorDescriptor_t input_desc, mluOpTensorDescriptor_t output_desc,
Expand Down Expand Up @@ -1822,7 +1872,11 @@ mluOpStatus_t MLUOP_WIN_API mluOpAllocateRFFT2D(
DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 +
DFT_TABLE_SIZE * 2; /* twiddles */
workspace_size = buffer_size * 2;
workspace_size += (fft_plan->is_input_contiguous) ? 0 : buffer_size;
workspace_size += (fft_plan->is_input_contiguous &&
fft_plan->inembed[0] <= fft_plan->n[0] &&
fft_plan->inembed[1] <= fft_plan->n[1])
? 0
: buffer_size;
workspace_size += (fft_plan->is_output_contiguous) ? 0 : buffer_size;
}

Expand All @@ -1846,6 +1900,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanC2C1D(
const int rank, const int *n) {
fft_plan->is_batch_contiguous =
(fft_plan->idist == 1 && fft_plan->odist == 1 &&
fft_plan->inembed[0] == fft_plan->n[0] &&
fft_plan->onembed[0] == fft_plan->n[0] &&
fft_plan->istride == fft_plan->batch &&
fft_plan->ostride == fft_plan->batch) &&
(fft_plan->n[0] == fft_plan->inembed[0]);
Expand Down Expand Up @@ -2221,7 +2277,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanC2R2D(
fft_plan->fft_strategy = CNFFT_FUNC_TWO_LEVEL_STOCKHAM;
}

mluOpAllocateRFFT2D(handle, fft_plan, input_desc, output_desc, n[0], n[1]);
mluOpAllocateIRFFT2D(handle, fft_plan, input_desc, output_desc, n[0], n[1]);

if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) {
switch (fft_plan->fft_type) {
Expand Down Expand Up @@ -2394,6 +2450,12 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanMany(
fft_plan->inembed[i] = input_desc->dims[fft_plan->idim - rank + i];
fft_plan->onembed[i] = output_desc->dims[fft_plan->odim - rank + i];
}
for (auto i = 0; i < fft_plan->idim; i++) {
fft_plan->in_stride[i] = input_desc->strides[i];
}
for (auto i = 0; i < fft_plan->odim; i++) {
fft_plan->out_stride[i] = output_desc->strides[i];
}
if (fft_plan->idim == rank + 1) {
fft_plan->idist = input_desc->strides[0];
fft_plan->odist = output_desc->strides[0];
Expand Down
28 changes: 15 additions & 13 deletions kernels/fft/fft.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,24 +193,26 @@ struct mluOpFFTStruct {
int inum; // element num of input tensor
int istride; // distance between two successive input elements in the
// innermost dimension
int idist; // distance between the first element of two consecutive signals
// in a batch of the input data
int odim; // the dimension size of output tensor
int in_stride[FFT_DIM_MAX + 1];
int idist; // distance between the first element of two consecutive signals
// in a batch of the input data
int odim; // the dimension size of output tensor
int onembed[FFT_DIM_MAX]; // Pointer of size rank that indicates the storage
// dimensions of the output data in memory
int onum; // element num of output tensor
int ostride; // distance between two successive output elements in the
// innermost dimension
int odist; // distance between the first element of two consecutive signals
// in a batch of the output data
int batch; // batch size for this transform
int L; // n = L * 2^m, L size for this transform
int m; // n = L * 2^m, m size for this transform
int s; // The size that can be put down on NRAM: L * 2^s, only used by
// Cooley-Tukey algorithm
int L_sub; // The size that can be put down on NRAM: L_sub * 2^m, only used
// by Stockham algorithm
int prime; // wether fft1d'size contains a prime number > 64
int out_stride[FFT_DIM_MAX + 1];
int odist; // distance between the first element of two consecutive signals
// in a batch of the output data
int batch; // batch size for this transform
int L; // n = L * 2^m, L size for this transform
int m; // n = L * 2^m, m size for this transform
int s; // The size that can be put down on NRAM: L * 2^s, only used by
// Cooley-Tukey algorithm
int L_sub; // The size that can be put down on NRAM: L_sub * 2^m, only used
// by Stockham algorithm
int prime; // wether fft1d'size contains a prime number > 64
bool is_input_contiguous;
bool is_output_contiguous;
bool is_batch_contiguous;
Expand Down
Loading
Loading