Skip to content

Commit

Permalink
[Fix](mluOpExecFFT): fix core dump, scale factor and one point compute error
Browse files Browse the repository at this point in the history
  • Loading branch information
niyuming committed Nov 29, 2024
1 parent 1725280 commit 85697d2
Show file tree
Hide file tree
Showing 11 changed files with 318 additions and 590 deletions.
53 changes: 45 additions & 8 deletions kernels/fft/c2c_fft/c2c_fft_host.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1079,7 +1079,9 @@ static void configureFFT2dWorkspaceAddrs(mluOpHandle_t handle,
fft_plan->mlu_addrs.buffer_buf = (uint8_t *)workspace + offset;
offset += batch * in_c_dtype_size * _n0 * _n1 * 2;

if (fft_plan->is_input_contiguous) {
if ((fft_plan->is_input_contiguous &&
fft_plan->inembed[0] <= fft_plan->n[0] &&
fft_plan->inembed[1] <= fft_plan->n[1])) {
fft_plan->mlu_addrs.input = input;
} else {
fft_plan->mlu_addrs.input = (uint8_t *)workspace + offset;
Expand Down Expand Up @@ -1180,9 +1182,11 @@ static mluOpStatus_t makeFFT2dContiguousInput(mluOpHandle_t handle,
int64_t dims[in_dim_num] = {fft_plan->batch,
std::min(fft_plan->n[0], fft_plan->inembed[0]),
std::min(fft_plan->n[1], fft_plan->inembed[1])};
int64_t strides[in_dim_num] = {fft_plan->idist,
(fft_plan->istride * fft_plan->inembed[1]),
fft_plan->istride};

int64_t strides[3];
for (int i = 0; i < in_dim_num; i++) {
strides[i] = fft_plan->in_stride[i];
}
status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY,
fft_plan->input_dtype, in_dim_num,
dims, strides);
Expand Down Expand Up @@ -1818,9 +1822,10 @@ static mluOpStatus_t makeFFT2dContiguousOutput(mluOpHandle_t handle,
const int out_dim_num = 3;
int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0],
fft_plan->n[1]};
int64_t strides[out_dim_num] = {fft_plan->odist,
fft_plan->ostride * fft_plan->onembed[1],
fft_plan->ostride};
int64_t strides[3];
for (int i = 0; i < out_dim_num; i++) {
strides[i] = fft_plan->out_stride[i];
}
status = mluOpSetTensorDescriptor_v2(copy_src_desc, MLUOP_LAYOUT_ARRAY,
out_c_dtype, out_dim_num, dims);
INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);
Expand Down Expand Up @@ -2053,7 +2058,39 @@ mluOpStatus_t execFFT2d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan,
fft_plan->mlu_addrs.input = fft_plan->mlu_addrs.input_pad_addr;
}

status = execFFTc2c2d(handle, fft_plan, scale_factor, direction);
if (fft_plan->n[0] == 1 && fft_plan->n[1] == 1) {
mluOpTensorDescriptor_t c_desc = nullptr;
status = mluOpCreateTensorDescriptor(&c_desc);
const int out_dim_num = 3;
int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0],
fft_plan->n[1]};
status = mluOpSetTensorDescriptor_v2(c_desc, MLUOP_LAYOUT_ARRAY,
fft_plan->output_dtype, 2, dims);
status = mluOpSetTensorDescriptorOnchipDataType(c_desc,
fft_plan->execution_dtype);

DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle,
cnnl_handle); // convert to cnnl_handle

DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(c_desc, cnnl_output_desc);

size_t workspace_size = 0;
CALL_CNNL(cnnlGetCopyWorkspaceSize(cnnl_handle, cnnl_output_desc,
cnnl_output_desc, &workspace_size));
void *workspace = nullptr;
if (workspace_size > 0) {
CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size));
}

CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_output_desc,
fft_plan->mlu_addrs.input, cnnl_output_desc,
fft_plan->mlu_addrs.output, workspace,
workspace_size));
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc);
DESTROY_CNNL_HANDLE(cnnl_handle);
} else {
status = execFFTc2c2d(handle, fft_plan, scale_factor, direction);
}

INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);

Expand Down
3 changes: 2 additions & 1 deletion kernels/fft/common/fft_common_kernels.mlu
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,8 @@ __mlu_func__ void selectVec(float *src_addr, int32_t *offset_int_addr,
__asm__ volatile(
"gather.clean.nram.nram.nram.b32.u32 "
"[%[dst]], [%[src]], [%[offset]], %[data_num];\n\t" ::[dst] "r"(dst_addr),
[src] "r"(src_addr), [offset] "r"(offset_int_addr), [data_num] "r"(deal_size));
[ src ] "r"(src_addr), [ offset ] "r"(offset_int_addr),
[ data_num ] "r"(deal_size));
#else
for (auto i = 0; i < deal_size; i++) {
dst_addr[i] = src_addr[offset_int_addr[i]];
Expand Down
68 changes: 65 additions & 3 deletions kernels/fft/fft.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1740,7 +1740,11 @@ mluOpStatus_t MLUOP_WIN_API mluOpAllocateC2C2D(
DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 +
DFT_TABLE_SIZE * 2; /* twiddles */
workspace_size = buffer_size * 2;
workspace_size += (fft_plan->is_input_contiguous) ? 0 : buffer_size;
workspace_size += (fft_plan->is_input_contiguous &&
fft_plan->inembed[0] <= fft_plan->n[0] &&
fft_plan->inembed[1] <= fft_plan->n[1])
? 0
: buffer_size;
workspace_size += (fft_plan->is_output_contiguous) ? 0 : buffer_size;
}

Expand Down Expand Up @@ -1791,7 +1795,53 @@ mluOpAllocateC2R1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,

return MLUOP_STATUS_SUCCESS;
}
// Computes the workspace and reserve-space sizes (in bytes) for the 2D
// complex-to-real plan being built and stores them in
// fft_plan->workspace_size and fft_plan->reservespace_size.
//
// Parameters:
//   handle, input_desc, output_desc: part of the shared mluOpAllocate*
//     parameter list (same as mluOpAllocateRFFT2D); not read here.
//   fft_plan: plan whose dtypes, batch, strategy, n[], inembed[] and
//     contiguity flags drive the computation; receives the two sizes.
//     Must be non-null (it is dereferenced unconditionally).
//   _n0, _n1: transform lengths used to size the buffers and tables.
//
// Returns MLUOP_STATUS_SUCCESS unconditionally.
mluOpStatus_t MLUOP_WIN_API mluOpAllocateIRFFT2D(
    mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
    mluOpTensorDescriptor_t input_desc, mluOpTensorDescriptor_t output_desc,
    const int _n0, const int _n1) {
  // Every buffer is sized with the wider of the input/output element types.
  const auto in_dtype_size = mluOpDataTypeBytes(fft_plan->input_dtype);
  const auto out_dtype_size = mluOpDataTypeBytes(fft_plan->output_dtype);
  const size_t complex_dtype_size =
      (out_dtype_size > in_dtype_size) ? out_dtype_size : in_dtype_size;

  const int batch = fft_plan->batch;
  // One full batch * _n0 * _n1 staging buffer.
  const size_t buffer_size = batch * complex_dtype_size * _n0 * _n1;

  const size_t twiddles_size = complex_dtype_size * _n0;
  const size_t twiddles_size_2d = complex_dtype_size * _n1;

  size_t workspace_size = 0;
  size_t reservespace_size = 0;

  if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) {
    reservespace_size = complex_dtype_size * _n0 * _n0 * 2 +
                        complex_dtype_size * _n1 * _n1 * 2; /* DFT matrix */
    workspace_size = complex_dtype_size * _n1 * _n0 * batch * 6;
  } else if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) {
    reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */
                        + sizeof(int) * (FFT_MAXFACTORS) + twiddles_size * 2 +
                        DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 +
                        DFT_TABLE_SIZE * 2; /* twiddles */

    // No input staging buffer is budgeted when the input is contiguous and
    // its stored extents fit within n[0] x (n[1] / 2 + 1).
    // NOTE(review): n[1] / 2 + 1 is presumably the half-spectrum width of
    // the complex input -- confirm against the exec path.
    const bool input_needs_no_copy =
        fft_plan->is_input_contiguous &&
        fft_plan->inembed[0] <= fft_plan->n[0] &&
        fft_plan->inembed[1] <= fft_plan->n[1] / 2 + 1;

    workspace_size = buffer_size * 2;
    if (!input_needs_no_copy) {
      workspace_size += buffer_size;
    }
    if (!fft_plan->is_output_contiguous) {
      workspace_size += buffer_size;
    }
  }

  // One more buffer when the transform is larger than the stored input in
  // either dimension.
  // NOTE(review): presumably backs a padded copy of the input -- confirm.
  if (fft_plan->n[0] > fft_plan->inembed[0] ||
      fft_plan->n[1] > fft_plan->inembed[1]) {
    workspace_size += buffer_size;
  }

  fft_plan->workspace_size = workspace_size;
  fft_plan->reservespace_size = reservespace_size;

  return MLUOP_STATUS_SUCCESS;
}
mluOpStatus_t MLUOP_WIN_API mluOpAllocateRFFT2D(
mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
mluOpTensorDescriptor_t input_desc, mluOpTensorDescriptor_t output_desc,
Expand Down Expand Up @@ -1822,7 +1872,11 @@ mluOpStatus_t MLUOP_WIN_API mluOpAllocateRFFT2D(
DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 +
DFT_TABLE_SIZE * 2; /* twiddles */
workspace_size = buffer_size * 2;
workspace_size += (fft_plan->is_input_contiguous) ? 0 : buffer_size;
workspace_size += (fft_plan->is_input_contiguous &&
fft_plan->inembed[0] <= fft_plan->n[0] &&
fft_plan->inembed[1] <= fft_plan->n[1])
? 0
: buffer_size;
workspace_size += (fft_plan->is_output_contiguous) ? 0 : buffer_size;
}

Expand All @@ -1846,6 +1900,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanC2C1D(
const int rank, const int *n) {
fft_plan->is_batch_contiguous =
(fft_plan->idist == 1 && fft_plan->odist == 1 &&
fft_plan->inembed[0] == fft_plan->n[0] &&
fft_plan->onembed[0] == fft_plan->n[0] &&
fft_plan->istride == fft_plan->batch &&
fft_plan->ostride == fft_plan->batch) &&
(fft_plan->n[0] == fft_plan->inembed[0]);
Expand Down Expand Up @@ -2221,7 +2277,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanC2R2D(
fft_plan->fft_strategy = CNFFT_FUNC_TWO_LEVEL_STOCKHAM;
}

mluOpAllocateRFFT2D(handle, fft_plan, input_desc, output_desc, n[0], n[1]);
mluOpAllocateIRFFT2D(handle, fft_plan, input_desc, output_desc, n[0], n[1]);

if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) {
switch (fft_plan->fft_type) {
Expand Down Expand Up @@ -2394,6 +2450,12 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanMany(
fft_plan->inembed[i] = input_desc->dims[fft_plan->idim - rank + i];
fft_plan->onembed[i] = output_desc->dims[fft_plan->odim - rank + i];
}
for (auto i = 0; i < fft_plan->idim; i++) {
fft_plan->in_stride[i] = input_desc->strides[i];
}
for (auto i = 0; i < fft_plan->odim; i++) {
fft_plan->out_stride[i] = output_desc->strides[i];
}
if (fft_plan->idim == rank + 1) {
fft_plan->idist = input_desc->strides[0];
fft_plan->odist = output_desc->strides[0];
Expand Down
28 changes: 15 additions & 13 deletions kernels/fft/fft.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,24 +193,26 @@ struct mluOpFFTStruct {
int inum; // element num of input tensor
int istride; // distance between two successive input elements in the
// innermost dimension
int idist; // distance between the first element of two consecutive signals
// in a batch of the input data
int odim; // the dimension size of output tensor
int in_stride[FFT_DIM_MAX + 1];
int idist; // distance between the first element of two consecutive signals
// in a batch of the input data
int odim; // the dimension size of output tensor
int onembed[FFT_DIM_MAX]; // Pointer of size rank that indicates the storage
// dimensions of the output data in memory
int onum; // element num of output tensor
int ostride; // distance between two successive output elements in the
// innermost dimension
int odist; // distance between the first element of two consecutive signals
// in a batch of the output data
int batch; // batch size for this transform
int L; // n = L * 2^m, L size for this transform
int m; // n = L * 2^m, m size for this transform
int s; // The size that can be put down on NRAM: L * 2^s, only used by
// Cooley-Tukey algorithm
int L_sub; // The size that can be put down on NRAM: L_sub * 2^m, only used
// by Stockham algorithm
int prime; // wether fft1d'size contains a prime number > 64
int out_stride[FFT_DIM_MAX + 1];
int odist; // distance between the first element of two consecutive signals
// in a batch of the output data
int batch; // batch size for this transform
int L; // n = L * 2^m, L size for this transform
int m; // n = L * 2^m, m size for this transform
int s; // The size that can be put down on NRAM: L * 2^s, only used by
// Cooley-Tukey algorithm
int L_sub; // The size that can be put down on NRAM: L_sub * 2^m, only used
// by Stockham algorithm
int prime; // wether fft1d'size contains a prime number > 64
bool is_input_contiguous;
bool is_output_contiguous;
bool is_batch_contiguous;
Expand Down
Loading

0 comments on commit 85697d2

Please sign in to comment.