Commit c73d48f

[Fix](mluOpExecFFT): fix core dump, scale factor and one point compute error

niyuming committed Dec 5, 2024
1 parent 1725280 commit c73d48f
Showing 12 changed files with 383 additions and 682 deletions.
95 changes: 59 additions & 36 deletions kernels/fft/c2c_fft/c2c_fft_host.cpp
@@ -1079,7 +1079,9 @@ static void configureFFT2dWorkspaceAddrs(mluOpHandle_t handle,
fft_plan->mlu_addrs.buffer_buf = (uint8_t *)workspace + offset;
offset += batch * in_c_dtype_size * _n0 * _n1 * 2;

- if (fft_plan->is_input_contiguous) {
+ if ((fft_plan->is_input_contiguous &&
+      fft_plan->inembed[0] <= fft_plan->n[0] &&
+      fft_plan->inembed[1] <= fft_plan->n[1])) {
fft_plan->mlu_addrs.input = input;
} else {
fft_plan->mlu_addrs.input = (uint8_t *)workspace + offset;
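Note: the new in-place condition can be read as a standalone predicate. A minimal restatement, not part of the commit; names mirror the plan struct used in this file.

// The caller's input buffer is only usable directly when the data is
// contiguous AND no embedded extent exceeds the logical FFT size n[];
// an oversized inembed means rows must first be repacked into workspace.
static bool canUseFFT2dInputInPlace(const mluOpFFTPlan_t fft_plan) {
  return fft_plan->is_input_contiguous &&
         fft_plan->inembed[0] <= fft_plan->n[0] &&
         fft_plan->inembed[1] <= fft_plan->n[1];
}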
@@ -1180,9 +1182,11 @@ static mluOpStatus_t makeFFT2dContiguousInput(mluOpHandle_t handle,
int64_t dims[in_dim_num] = {fft_plan->batch,
std::min(fft_plan->n[0], fft_plan->inembed[0]),
std::min(fft_plan->n[1], fft_plan->inembed[1])};
- int64_t strides[in_dim_num] = {fft_plan->idist,
-                                (fft_plan->istride * fft_plan->inembed[1]),
-                                fft_plan->istride};
+
+ int64_t strides[in_dim_num];  // filled from the plan's recorded input strides
+ for (int i = 0; i < in_dim_num; i++) {
+   strides[i] = fft_plan->in_stride[i];
+ }
status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY,
fft_plan->input_dtype, in_dim_num,
dims, strides);
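Note: the deleted initializer derived strides from idist/istride/inembed, which bakes in a dense row-major assumption; the plan's recorded in_stride[] values come straight from the input descriptor and also cover transposed or sliced views. A sketch of the addressing both must reproduce, illustrative only and not from the commit:

#include <cstdint>

// Flat offset of element (b, r, c) of a batched 2-D input. For a dense
// layout, strides == {idist, istride * inembed[1], istride}; a transposed
// or sliced view has descriptor strides matching no such formula.
int64_t flat_offset(int64_t b, int64_t r, int64_t c,
                    const int64_t strides[3]) {
  return b * strides[0] + r * strides[1] + c * strides[2];
}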
@@ -1779,17 +1783,8 @@ static mluOpStatus_t makeFFT1dContiguousOutput(mluOpHandle_t handle,
cnnl_copy_src_desc);
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(copy_dst_desc,
cnnl_copy_dst_desc);
- size_t workspace_size = 0;
- CALL_CNNL(cnnlGetCopyWorkspaceSize(cnnl_handle, cnnl_copy_src_desc,
-                                    cnnl_copy_dst_desc, &workspace_size));
-
- void *workspace = nullptr;
- if (workspace_size > 0) {
-   CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size));
- }
CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_copy_src_desc, copy_src_addr,
-                     cnnl_copy_dst_desc, output, workspace,
-                     workspace_size));
+                     cnnl_copy_dst_desc, output, NULL, 0));
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_src_desc);
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_dst_desc);
DESTROY_CNNL_HANDLE(cnnl_handle);
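Note: the deleted branch allocated a copy workspace with cnrtMalloc on every call and never freed it, leaking device memory; passing NULL and 0 lets cnnlCopy_v2 run without an external workspace. Should a workspace ever be reintroduced, an RAII holder is a safer shape; a sketch assuming cnrtFree as the matching release call:

// Owns a device buffer for the duration of one host-side call (sketch).
struct DeviceBuffer {
  void *ptr = nullptr;
  explicit DeviceBuffer(size_t bytes) {
    if (bytes > 0) CNRT_CHECK(cnrtMalloc(&ptr, bytes));
  }
  ~DeviceBuffer() {
    if (ptr != nullptr) cnrtFree(ptr);  // released on every exit path
  }
};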
@@ -1818,9 +1813,10 @@ static mluOpStatus_t makeFFT2dContiguousOutput(mluOpHandle_t handle,
const int out_dim_num = 3;
int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0],
fft_plan->n[1]};
- int64_t strides[out_dim_num] = {fft_plan->odist,
-                                 fft_plan->ostride * fft_plan->onembed[1],
-                                 fft_plan->ostride};
+ int64_t strides[out_dim_num];  // filled from the plan's recorded output strides
+ for (int i = 0; i < out_dim_num; i++) {
+   strides[i] = fft_plan->out_stride[i];
+ }
status = mluOpSetTensorDescriptor_v2(copy_src_desc, MLUOP_LAYOUT_ARRAY,
out_c_dtype, out_dim_num, dims);
INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);
@@ -1838,18 +1834,8 @@ static mluOpStatus_t makeFFT2dContiguousOutput(mluOpHandle_t handle,
cnnl_copy_src_desc);
DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(copy_dst_desc,
cnnl_copy_dst_desc);
-
- size_t workspace_size = 0;
- CALL_CNNL(cnnlGetCopyWorkspaceSize(cnnl_handle, cnnl_copy_src_desc,
-                                    cnnl_copy_dst_desc, &workspace_size));
-
- void *workspace = nullptr;
- if (workspace_size > 0) {
-   CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size));
- }
CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_copy_src_desc, copy_src_addr,
-                     cnnl_copy_dst_desc, output, workspace,
-                     workspace_size));
+                     cnnl_copy_dst_desc, output, NULL, 0));

DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_src_desc);
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_dst_desc);
@@ -2003,12 +1989,15 @@ mluOpStatus_t execFFT1d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan,
const float beta[2] = {0.0, 0.0};
mluOpTensorDescriptor_t c_desc = nullptr;
status = mluOpCreateTensorDescriptor(&c_desc);
+ INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);
const int out_dim_num = 2;
int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0]};
status = mluOpSetTensorDescriptor_v2(c_desc, MLUOP_LAYOUT_ARRAY,
fft_plan->output_dtype, 2, dims);
+ INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);
status = mluOpSetTensorDescriptorOnchipDataType(
c_desc, fft_plan->execution_dtype);
+ INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);

DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle,
cnnl_handle); // convert to cnnl_handle
@@ -2019,6 +2008,8 @@ mluOpStatus_t execFFT1d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan,
cnnl_output_desc, fft_plan->mlu_addrs.output,
&beta, cnnl_output_desc,
fft_plan->mlu_addrs.output));
+ status = mluOpDestroyTensorDescriptor(c_desc);
+ INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc);
DESTROY_CNNL_HANDLE(cnnl_handle);
}
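Note: beta is zero and the output tensor appears as both source and destination, so the truncated call above is an in-place elementwise scale of the FFT result, presumably with alpha carrying scale_factor (that operand sits outside the hunk). Reference semantics, not from the commit:

#include <complex>
#include <cstdint>

// out[i] = scale * out[i] over batch * n0 complex values; an inverse
// transform conventionally uses scale = 1.0f / n0 to undo the forward FFT.
void apply_scale(std::complex<float> *out, int64_t count, float scale) {
  for (int64_t i = 0; i < count; i++) out[i] *= scale;
}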
Expand Down Expand Up @@ -2053,7 +2044,34 @@ mluOpStatus_t execFFT2d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan,
fft_plan->mlu_addrs.input = fft_plan->mlu_addrs.input_pad_addr;
}

- status = execFFTc2c2d(handle, fft_plan, scale_factor, direction);
+ if (fft_plan->n[0] == 1 && fft_plan->n[1] == 1) {
+   mluOpTensorDescriptor_t c_desc = nullptr;
+   status = mluOpCreateTensorDescriptor(&c_desc);
+   INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);
+   const int out_dim_num = 3;
+   int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0],
+                                fft_plan->n[1]};
+   status = mluOpSetTensorDescriptor_v2(c_desc, MLUOP_LAYOUT_ARRAY,
+                                        fft_plan->output_dtype, out_dim_num,
+                                        dims);
+   INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);
+   status = mluOpSetTensorDescriptorOnchipDataType(c_desc,
+                                                   fft_plan->execution_dtype);
+   INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);
+
+   DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle,
+                                     cnnl_handle);  // convert to cnnl_handle
+
+   DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(c_desc, cnnl_output_desc);
+   CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_output_desc,
+                         fft_plan->mlu_addrs.input, cnnl_output_desc,
+                         fft_plan->mlu_addrs.output, NULL, 0));
+   status = mluOpDestroyTensorDescriptor(c_desc);
+   INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);
+   DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc);
+   DESTROY_CNNL_HANDLE(cnnl_handle);
+ } else {
+   status = execFFTc2c2d(handle, fft_plan, scale_factor, direction);
+ }

INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);

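Note: the new branch is mathematically exact, not an approximation: a one-point DFT is the identity (X[0] = x[0], forward and inverse alike), so an n[0] == n[1] == 1 transform reduces to a copy, with the scale pass below still applied afterwards. A host-side reference, not from the commit:

#include <complex>
#include <vector>

// 2-D FFT with n0 == n1 == 1: each batch element maps to itself times the
// user-supplied scale factor.
std::vector<std::complex<float>> one_point_fft2d(
    const std::vector<std::complex<float>> &batch_in, float scale) {
  std::vector<std::complex<float>> out(batch_in.size());
  for (size_t b = 0; b < batch_in.size(); b++) out[b] = scale * batch_in[b];
  return out;
}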
@@ -2062,13 +2080,16 @@ mluOpStatus_t execFFT2d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan,
const float beta[2] = {0.0, 0.0};
mluOpTensorDescriptor_t c_desc = nullptr;
status = mluOpCreateTensorDescriptor(&c_desc);
+ INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);
const int out_dim_num = 3;
int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0],
fft_plan->n[1]};
status = mluOpSetTensorDescriptor_v2(c_desc, MLUOP_LAYOUT_ARRAY,
-                                      fft_plan->output_dtype, 3, dims);
+                                      fft_plan->output_dtype, out_dim_num, dims);
+ INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);
status = mluOpSetTensorDescriptorOnchipDataType(c_desc,
fft_plan->execution_dtype);
+ INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);

DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle,
cnnl_handle); // convert to cnnl_handle
@@ -2079,6 +2100,8 @@ mluOpStatus_t execFFT2d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan,
cnnl_output_desc, fft_plan->mlu_addrs.output,
&beta, cnnl_output_desc,
fft_plan->mlu_addrs.output));
+ status = mluOpDestroyTensorDescriptor(c_desc);
+ INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS);
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc);
DESTROY_CNNL_HANDLE(cnnl_handle);
}
@@ -2296,22 +2319,22 @@ mluOpStatus_t computeFFT2dMatMulRow(mluOpHandle_t handle,
int requested_algo_count = 1, return_algo_count = 0;
float *workspace;
size_t workspace_size;
- cnnlGetBatchMatMulAlgoHeuristic(
+ cnnlGetBatchMatMulExAlgoHeuristic(
cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL,
requested_algo_count, &heuristic_result, &return_algo_count);

- cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size);
+ cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size);

if (workspace_size > 0) {
CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size));
} else {
CNRT_CHECK(cnrtMalloc((void **)&workspace, m * n * sizeof(float)));
}

- CALL_CNNL(cnnlBatchMatMulBCast_v2(cnnl_handle, bmm_bcast_desc, algo, &alpha,
-                                   cnnl_a_desc, dft_matrix_addr, cnnl_b_desc,
-                                   in_addr, &beta, cnnl_c_desc, out_addr,
-                                   (void *)workspace, workspace_size));
+ CALL_CNNL(cnnlBatchMatMulEx(cnnl_handle, bmm_bcast_desc, algo, &alpha,
+                             cnnl_a_desc, dft_matrix_addr, cnnl_b_desc,
+                             in_addr, &beta, cnnl_c_desc, out_addr,
+                             (void *)workspace, workspace_size));
// destroy cnnl descriptor
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc);
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc);
16 changes: 8 additions & 8 deletions kernels/fft/common/fft_basic_ops.cpp
@@ -488,10 +488,10 @@ mluOpStatus_t fftGetBatchMatMulBcastWorkspaceSize(
cnnlMatMulHeuristicResult_t heuristic_result;
CALL_CNNL(cnnlCreateMatMulHeuristicResult(&heuristic_result));
int requested_algo_count = 1, return_algo_count = 0;
- cnnlGetBatchMatMulAlgoHeuristic(
+ cnnlGetBatchMatMulExAlgoHeuristic(
cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL,
requested_algo_count, &heuristic_result, &return_algo_count);
- cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size);
+ cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size);
// destroy descriptor
// destroy cnnl descriptor
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc);
@@ -585,20 +585,20 @@ mluOpStatus_t fftBatchMatMulBcast(
alpha = 1.0;
beta = 0.0;
int requested_algo_count = 1, return_algo_count = 0;
- cnnlGetBatchMatMulAlgoHeuristic(
+ cnnlGetBatchMatMulExAlgoHeuristic(
cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL,
requested_algo_count, &heuristic_result, &return_algo_count);
- cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size);
+ cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size);
if (workspace_size > 0) {
CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size));
} else {
CNRT_CHECK(cnrtMalloc((void **)&workspace, m * n * sizeof(float)));
}

- CALL_CNNL(cnnlBatchMatMulBCast_v2(cnnl_handle, bmm_bcast_desc, algo, &alpha,
-                                   cnnl_a_desc, a_ptr, cnnl_b_desc, b_ptr,
-                                   &beta, cnnl_c_desc, c_ptr,
-                                   (void *)workspace, workspace_size));
+ CALL_CNNL(cnnlBatchMatMulEx(cnnl_handle, bmm_bcast_desc, algo, &alpha,
+                             cnnl_a_desc, a_ptr, cnnl_b_desc, b_ptr,
+                             &beta, cnnl_c_desc, c_ptr,
+                             (void *)workspace, workspace_size));
// destroy descriptor
// destroy cnnl descriptor
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc);
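Note: every matmul site in this commit follows the same three-step migration: heuristic algorithm selection, workspace-size query, then the matmul itself, each moved to the Ex variant of the cnnl call. Condensed from the hunks above (error handling and workspace cleanup elided, exactly as in the surrounding code):

cnnlGetBatchMatMulExAlgoHeuristic(cnnl_handle, bmm_bcast_desc, cnnl_a_desc,
                                  cnnl_b_desc, cnnl_c_desc, NULL,
                                  requested_algo_count, &heuristic_result,
                                  &return_algo_count);
cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size);
CALL_CNNL(cnnlBatchMatMulEx(cnnl_handle, bmm_bcast_desc, algo, &alpha,
                            cnnl_a_desc, a_ptr, cnnl_b_desc, b_ptr, &beta,
                            cnnl_c_desc, c_ptr, (void *)workspace,
                            workspace_size));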
3 changes: 2 additions & 1 deletion kernels/fft/common/fft_common_kernels.mlu
@@ -109,7 +109,8 @@ __mlu_func__ void selectVec(float *src_addr, int32_t *offset_int_addr,
__asm__ volatile(
"gather.clean.nram.nram.nram.b32.u32 "
"[%[dst]], [%[src]], [%[offset]], %[data_num];\n\t" ::[dst] "r"(dst_addr),
- [src] "r"(src_addr), [offset] "r"(offset_int_addr), [data_num] "r"(deal_size));
+ [ src ] "r"(src_addr), [ offset ] "r"(offset_int_addr),
+ [ data_num ] "r"(deal_size));
#else
for (auto i = 0; i < deal_size; i++) {
dst_addr[i] = src_addr[offset_int_addr[i]];
82 changes: 72 additions & 10 deletions kernels/fft/fft.cpp
@@ -1657,7 +1657,7 @@ mluOpAllocateC2C1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
fft_plan->is_batch_contiguous)
? 0
: buffer_size;
- if (fft_plan->n[0] > fft_plan->inembed[0]) {
+ if (fft_plan->n[0] != fft_plan->inembed[0]) {
workspace_size += buffer_size;
}
size_t twiddles_size = in_c_dtype_size * nfft * 2;
@@ -1701,7 +1701,7 @@ mluOpAllocateR2C1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */
+ twiddles_size * 2 + DFT_TABLE_SIZE * 2; /* twiddles */

- if (fft_plan->n[0] > fft_plan->inembed[0]) {
+ if (fft_plan->n[0] != fft_plan->inembed[0]) {
workspace_size += buffer_size; // input_pad_addr
}
fft_plan->workspace_size = workspace_size;
@@ -1740,13 +1740,17 @@ mluOpStatus_t MLUOP_WIN_API mluOpAllocateC2C2D(
DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 +
DFT_TABLE_SIZE * 2; /* twiddles */
workspace_size = buffer_size * 2;
- workspace_size += (fft_plan->is_input_contiguous) ? 0 : buffer_size;
+ workspace_size += (fft_plan->is_input_contiguous &&
+                    fft_plan->inembed[0] <= fft_plan->n[0] &&
+                    fft_plan->inembed[1] <= fft_plan->n[1])
+                       ? 0
+                       : buffer_size;
workspace_size += (fft_plan->is_output_contiguous) ? 0 : buffer_size;
}

fft_plan->workspace_size = workspace_size;
- if (fft_plan->n[0] > fft_plan->inembed[0] ||
-     fft_plan->n[1] > fft_plan->inembed[1]) {
+ if (fft_plan->n[0] != fft_plan->inembed[0] ||
+     fft_plan->n[1] != fft_plan->inembed[1]) {
fft_plan->workspace_size = workspace_size + buffer_size; // input_pad_addr
}
fft_plan->reservespace_size = reservespace_size;
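Note: the comparisons in this file change from > to != because an n-sized staging buffer is needed whenever the embedded extent differs from the transform size in either direction: zero-padding when inembed < n, truncation when inembed > n. A 1-D illustration with a hypothetical helper (real-valued for brevity, not from the commit):

#include <algorithm>
#include <cstring>

// Repack one row of length inembed into an n-point FFT input buffer.
void repack_row(const float *in, int inembed, float *buf, int n) {
  const int kept = std::min(n, inembed);  // truncates when inembed > n
  std::memcpy(buf, in, kept * sizeof(float));
  if (n > kept) std::memset(buf + kept, 0, (n - kept) * sizeof(float));
}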
@@ -1783,15 +1787,61 @@ mluOpAllocateC2R1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */
+ twiddles_size * 2 + DFT_TABLE_SIZE * 2; /* twiddles */

- if (fft_plan->n[0] > fft_plan->inembed[0]) {
+ if (fft_plan->n[0] != fft_plan->inembed[0]) {
workspace_size += buffer_size; // input_pad_addr
}
fft_plan->workspace_size = workspace_size;
fft_plan->reservespace_size = reservespace_size;

return MLUOP_STATUS_SUCCESS;
}
+ mluOpStatus_t MLUOP_WIN_API mluOpAllocateIRFFT2D(
+     mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
+     mluOpTensorDescriptor_t input_desc, mluOpTensorDescriptor_t output_desc,
+     const int _n0, const int _n1) {
+   const std::string make_plan_api = "[mluOpAllocateIRFFT2D]";
+   size_t workspace_size = 0, reservespace_size = 0;
+
+   mluOpDataType_t out_c_dtype = fft_plan->output_dtype;
+   mluOpDataType_t in_c_dtype = fft_plan->input_dtype;
+   size_t complex_dtype_size =
+       (mluOpDataTypeBytes(out_c_dtype) > mluOpDataTypeBytes(in_c_dtype))
+           ? mluOpDataTypeBytes(out_c_dtype)
+           : mluOpDataTypeBytes(in_c_dtype);
+
+   int batch = fft_plan->batch;
+   size_t buffer_size = batch * complex_dtype_size * _n0 * _n1;
+
+   size_t twiddles_size = complex_dtype_size * _n0;
+   size_t twiddles_size_2d = complex_dtype_size * _n1;
+
+   if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) {
+     reservespace_size = complex_dtype_size * _n0 * _n0 * 2 +
+                         complex_dtype_size * _n1 * _n1 * 2; /* DFT matrix */
+     workspace_size = complex_dtype_size * _n1 * _n0 * batch * 6;
+   } else if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) {
+     reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */
+                         + sizeof(int) * (FFT_MAXFACTORS) + twiddles_size * 2 +
+                         DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 +
+                         DFT_TABLE_SIZE * 2; /* twiddles */
+     workspace_size = buffer_size * 2;
+     workspace_size += (fft_plan->is_input_contiguous &&
+                        fft_plan->inembed[0] <= fft_plan->n[0] &&
+                        fft_plan->inembed[1] <= fft_plan->n[1] / 2 + 1)
+                           ? 0
+                           : buffer_size;
+     workspace_size += (fft_plan->is_output_contiguous) ? 0 : buffer_size;
+   }
+
+   if (fft_plan->n[0] != fft_plan->inembed[0] ||
+       fft_plan->n[1] != fft_plan->inembed[1]) {
+     workspace_size += buffer_size;
+   }
+   fft_plan->workspace_size = workspace_size;
+   fft_plan->reservespace_size = reservespace_size;
+
+   return MLUOP_STATUS_SUCCESS;
+ }
mluOpStatus_t MLUOP_WIN_API mluOpAllocateRFFT2D(
mluOpHandle_t handle, mluOpFFTPlan_t fft_plan,
mluOpTensorDescriptor_t input_desc, mluOpTensorDescriptor_t output_desc,
@@ -1822,12 +1872,16 @@ mluOpStatus_t MLUOP_WIN_API mluOpAllocateRFFT2D(
DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 +
DFT_TABLE_SIZE * 2; /* twiddles */
workspace_size = buffer_size * 2;
- workspace_size += (fft_plan->is_input_contiguous) ? 0 : buffer_size;
+ workspace_size += (fft_plan->is_input_contiguous &&
+                    fft_plan->inembed[0] <= fft_plan->n[0] &&
+                    fft_plan->inembed[1] <= fft_plan->n[1])
+                       ? 0
+                       : buffer_size;
workspace_size += (fft_plan->is_output_contiguous) ? 0 : buffer_size;
}

- if (fft_plan->n[0] > fft_plan->inembed[0] ||
-     fft_plan->n[1] > fft_plan->inembed[1]) {
+ if (fft_plan->n[0] != fft_plan->inembed[0] ||
+     fft_plan->n[1] != fft_plan->inembed[1]) {
workspace_size += buffer_size;
}
fft_plan->workspace_size = workspace_size;
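Note: the new mluOpAllocateIRFFT2D differs from mluOpAllocateRFFT2D exactly where C2R layout demands it: the complex input of an n0 x n1 inverse real FFT stores only n1 / 2 + 1 columns (Hermitian symmetry), hence its inembed[1] <= n[1] / 2 + 1 check, and it sizes buffers with the wider of the input and output element types. The logical input width, for reference:

// Complex columns stored for the spectrum of a length-n1 real signal.
inline int c2r_input_cols(int n1) { return n1 / 2 + 1; }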
@@ -1846,6 +1900,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanC2C1D(
const int rank, const int *n) {
fft_plan->is_batch_contiguous =
(fft_plan->idist == 1 && fft_plan->odist == 1 &&
+      fft_plan->inembed[0] == fft_plan->n[0] &&
+      fft_plan->onembed[0] == fft_plan->n[0] &&
fft_plan->istride == fft_plan->batch &&
fft_plan->ostride == fft_plan->batch) &&
(fft_plan->n[0] == fft_plan->inembed[0]);
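Note: is_batch_contiguous describes a batch-interleaved layout in which sample k of batch b sits at offset b + k * batch (idist == 1, istride == batch). The added inembed/onembed checks rule out padded rows, where that addressing breaks down. The index map, for reference (not from the commit):

#include <cstdint>

// Flat offset of sample k in batch b under the batch-interleaved layout;
// only valid when inembed[0] == n[0], i.e. no row padding.
inline int64_t batch_interleaved_offset(int64_t b, int64_t k, int64_t batch) {
  return b + k * batch;
}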
@@ -2221,7 +2277,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanC2R2D(
fft_plan->fft_strategy = CNFFT_FUNC_TWO_LEVEL_STOCKHAM;
}

- mluOpAllocateRFFT2D(handle, fft_plan, input_desc, output_desc, n[0], n[1]);
+ mluOpAllocateIRFFT2D(handle, fft_plan, input_desc, output_desc, n[0], n[1]);

if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) {
switch (fft_plan->fft_type) {
@@ -2394,6 +2450,12 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanMany(
fft_plan->inembed[i] = input_desc->dims[fft_plan->idim - rank + i];
fft_plan->onembed[i] = output_desc->dims[fft_plan->odim - rank + i];
}
+ for (auto i = 0; i < fft_plan->idim; i++) {
+   fft_plan->in_stride[i] = input_desc->strides[i];
+ }
+ for (auto i = 0; i < fft_plan->odim; i++) {
+   fft_plan->out_stride[i] = output_desc->strides[i];
+ }
if (fft_plan->idim == rank + 1) {
fft_plan->idist = input_desc->strides[0];
fft_plan->odist = output_desc->strides[0];
(The remaining 8 changed files are not shown.)
