From 5445c9def0090738377d72d87cb310bdc949cd48 Mon Sep 17 00:00:00 2001 From: mahxn0 <1262384588@qq.com> Date: Tue, 3 Dec 2024 18:52:12 +0800 Subject: [PATCH 1/7] [Fix](mlu-ops): modify common func. (#1167) --- kernels/kernel.h | 2 +- .../ms_deform_attn_backward_fast_union1.mlu | 4 ++-- .../ms_deform_attn_forward/ms_deform_attn_utils.h | 2 +- .../ms_deform_attn_forward/msda_forward_fast_union1.mlu | 4 ++-- kernels/sparse_conv/get_indice_pairs/get_indice_pairs_utils.h | 2 +- kernels/utils/common.h | 2 +- .../pb_gtest/src/internal_kernel/fill_llc/fill_llc_device.mlu | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/kernels/kernel.h b/kernels/kernel.h index d1a6e96fb..9378f0839 100644 --- a/kernels/kernel.h +++ b/kernels/kernel.h @@ -31,7 +31,7 @@  * Macros for mluop kernels  ******************************************************************************/ // in future, can be "__BANG_ARCH__ == 592 || __BANG_ARCH__ == xxx || ...)" -#define ARCH_SUPPORT_LARGE_TENSOR (__BANG_ARCH__ == 592) +#define ARCH_SUPPORT_LARGE_TENSOR (__BANG_ARCH__ >= 592) #define MAX_WRAM_SIZE (__MLU_WRAM_SIZE__ * 1024) #define WRAM_LT_STRIDE (__MLU_WRAM_SIZE__ * 1024 / 64) diff --git a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu index 21ee0b40d..b72087481 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu @@ -26,7 +26,7 @@ #include "core/logging.h" -#if (__BANG_ARCH__ == 592) +#if (__BANG_ARCH__ >= 592) #define MAX_MEMCPY_SEGNUM (65536) #define NRAM_REMAIN_SIZE (48 * 1024) @@ -454,7 +454,7 @@ __mlu_global__ void MLUUnion1KernelMsDeformAttnBackwardFastKernel( const int32_t channels, const int32_t num_levels, const int32_t num_query, const int32_t num_points, float* grad_value, float* grad_sampling_loc, float* grad_attn_weight) { -#if (__BANG_ARCH__ == 592) +#if (__BANG_ARCH__ >= 592) using T = float; const int32_t num_keys = spatial_size; const int32_t input_stride_4 = diff --git a/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h b/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h index 122d2d35e..0f4b4dd17 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h +++ b/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h @@ -376,7 +376,7 @@ __mlu_func__ void stageOneLoop( } #endif -#if (__BANG_ARCH__ == 592) +#if (__BANG_ARCH__ >= 592) __mlu_func__ void gatherAsync(void* dst, void* src, unsigned int* offset, void* mask, int transfer_size, mluMemcpyDirection_t dir, int dst_stride, diff --git a/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu index b21af0a0e..2d29981e2 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu @@ -906,7 +906,7 @@ __mlu_func__ void MLUKernelMsDeformAttnForwardFastImpl( } } -#if (__BANG_ARCH__ == 592) +#if (__BANG_ARCH__ >= 592) /* The shape of each tensor on nram: @@ -1260,7 +1260,7 @@ __mlu_global__ void MLUKernelMsDeformAttnForwardFast( } #endif -#if (__BANG_ARCH__ == 592) +#if (__BANG_ARCH__ >= 592) MLUKernelMsDeformAttnForwardFastImpl( data_value_gdram, data_spatial_shapes_gdram, data_level_start_index_gdram, 
data_sampling_loc_gdram, data_attn_weight_gdram, batch_size, num_keys, diff --git a/kernels/sparse_conv/get_indice_pairs/get_indice_pairs_utils.h b/kernels/sparse_conv/get_indice_pairs/get_indice_pairs_utils.h index 52a135c7f..259b67e8b 100644 --- a/kernels/sparse_conv/get_indice_pairs/get_indice_pairs_utils.h +++ b/kernels/sparse_conv/get_indice_pairs/get_indice_pairs_utils.h @@ -76,7 +76,7 @@ func: generate stage index from start_index */ __mlu_func__ void stepIndex(int32_t *dst_nram, int32_t start_index, int32_t length) { -#if (__BANG_ARCH__ == 372 || __BANG_ARCH__ == 322 || __BANG_ARCH__ == 592) +#if __BANG_ARCH__ >= 372 int32_t align_num = 128; int32_t repeat = (int32_t)(logf(length / align_num) / logf(2)); int32_t remain = length / align_num - powf(2, repeat); diff --git a/kernels/utils/common.h b/kernels/utils/common.h index bceb8ccd4..c6bd1aead 100644 --- a/kernels/utils/common.h +++ b/kernels/utils/common.h @@ -419,7 +419,7 @@ __mlu_func__ void __mluop_store_str_3D(T *dst, T *src, int size, int seg_num_in, * dst_nram only support nram. * ****************************************************************************/ __mlu_func__ void __mluop_get_stage_indices_tfuse(int *dst_nram, int length) { -#if (__BANG_ARCH__ == 372 || __BANG_ARCH__ == 592) +#if __BANG_ARCH__ >= 372 int align_num = 128; int repeat = (int)(logf(length / align_num) / logf(2)); int remain = length / align_num - powf(2, repeat); diff --git a/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_llc/fill_llc_device.mlu b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_llc/fill_llc_device.mlu index c5b9077e7..fbb93ddb1 100644 --- a/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_llc/fill_llc_device.mlu +++ b/test/mlu_op_gtest/pb_gtest/src/internal_kernel/fill_llc/fill_llc_device.mlu @@ -24,7 +24,7 @@ #include "kernels/kernel.h" // MAX_NRAM_SIZE __mlu_global__ void flushLLC(void* input, int fill_bytes) { -#if (__BANG_ARCH__ == 592) +#if (__BANG_ARCH__ >= 592) if (coreId != 0) { return; } From 3a674cc184e9ae8fbd010facf8bb8a2aab5557ff Mon Sep 17 00:00:00 2001 From: duzekun Date: Wed, 4 Dec 2024 14:49:07 +0800 Subject: [PATCH 2/7] [Docs](mlu-ops): Update version date. (#1171) Co-authored-by: duzekun --- docs/api_guide/update.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api_guide/update.rst b/docs/api_guide/update.rst index 1d4b32913..3c60f6b86 100755 --- a/docs/api_guide/update.rst +++ b/docs/api_guide/update.rst @@ -5,7 +5,7 @@ This section lists contents that were made for each product release. 
* V1.4.0 - **Date:** October 21, 2024 + **Date:** November 29, 2024 **Changes:** From 59eae84eade1ed8f9928091e8ba5733c59086a84 Mon Sep 17 00:00:00 2001 From: PetrelYy <92866578+PetrelYy@users.noreply.github.com> Date: Wed, 4 Dec 2024 15:12:40 +0800 Subject: [PATCH 3/7] [Feature](mlu-ops): adapt scatter,gather (#1168) --- .../box_iou_rotated/box_iou_rotated_utils.h | 28 +++--- .../generate_proposals_v2_union1_500.mlu | 1 + .../ms_deform_attn_backward_fast_union1.mlu | 28 +++--- ...rm_attn_backward_small_channels_union1.mlu | 38 ++++---- .../ms_deform_attn_utils.h | 14 +-- .../msda_forward_fast_union1.mlu | 50 +++++------ .../roi_align_rotated_forward_vector.mlu | 10 +-- kernels/utils/scatter_gather.h | 90 +++++++++++++++++++ .../voxel_pooling_forward_union1.mlu | 10 ++- kernels/voxelization/voxelization_kernel.mlu | 20 +++-- 10 files changed, 191 insertions(+), 98 deletions(-) create mode 100644 kernels/utils/scatter_gather.h diff --git a/kernels/box_iou_rotated/box_iou_rotated_utils.h b/kernels/box_iou_rotated/box_iou_rotated_utils.h index 7c3e8d270..22aa3e0ec 100644 --- a/kernels/box_iou_rotated/box_iou_rotated_utils.h +++ b/kernels/box_iou_rotated/box_iou_rotated_utils.h @@ -24,6 +24,7 @@ #define KERNELS_BOX_IOU_ROTATED_BOX_IOU_ROTATED_UTILS_H_ #include "kernels/utils/common.h" +#include "kernels/utils/scatter_gather.h" #define FIILED_ONES (int)0xffffffff #define HALF_FILLED_ONES (int16_t)0xffff @@ -590,21 +591,22 @@ __mlu_func__ void convexHullGraham( sizeof(T), actual_compute_box_num); // get the ordered points according to the angle value - __gather(ordered_pts_x + (i + 1) * actual_compute_box_num, intersect_pts_x, - (unsigned int *)temp_offset, sizeof(T), NRAM2NRAM, sizeof(T), - actual_compute_box_num); - __gather(ordered_pts_y + (i + 1) * actual_compute_box_num, intersect_pts_y, - (unsigned int *)temp_offset, sizeof(T), NRAM2NRAM, sizeof(T), - actual_compute_box_num); - __gather(temp_long_1 + (i + 1) * actual_compute_box_num, valid_pts, - (unsigned int *)temp_offset, sizeof(T), NRAM2NRAM, sizeof(T), - actual_compute_box_num); + __mluop_gather(ordered_pts_x + (i + 1) * actual_compute_box_num, + intersect_pts_x, (unsigned int *)temp_offset, NULL, + sizeof(T), NRAM2NRAM, sizeof(T), actual_compute_box_num); + __mluop_gather(ordered_pts_y + (i + 1) * actual_compute_box_num, + intersect_pts_y, (unsigned int *)temp_offset, NULL, + sizeof(T), NRAM2NRAM, sizeof(T), actual_compute_box_num); + __mluop_gather(temp_long_1 + (i + 1) * actual_compute_box_num, valid_pts, + (unsigned int *)temp_offset, NULL, sizeof(T), NRAM2NRAM, + sizeof(T), actual_compute_box_num); // assign a invalid value to the point which has been get ordered - __scatter(temp_long_2, temp1_ram, (unsigned int *)temp_offset, sizeof(T), - NRAM2NRAM, sizeof(T), actual_compute_box_num); - __scatter(valid_pts, temp2_ram, (unsigned int *)temp_offset, sizeof(T), - NRAM2NRAM, sizeof(T), actual_compute_box_num); + __mluop_scatter(temp_long_2, temp1_ram, (unsigned int *)temp_offset, + NULL, sizeof(T), NRAM2NRAM, sizeof(T), + actual_compute_box_num); + __mluop_scatter(valid_pts, temp2_ram, (unsigned int *)temp_offset, NULL, + sizeof(T), NRAM2NRAM, sizeof(T), actual_compute_box_num); } __bang_move(valid_pts, temp_long_1, total_points * sizeof(T)); #else diff --git a/kernels/generate_proposals_v2/generate_proposals_v2_union1_500.mlu b/kernels/generate_proposals_v2/generate_proposals_v2_union1_500.mlu index bf5887b03..59e25153b 100644 --- a/kernels/generate_proposals_v2/generate_proposals_v2_union1_500.mlu +++ 
b/kernels/generate_proposals_v2/generate_proposals_v2_union1_500.mlu @@ -158,6 +158,7 @@ __mlu_func__ void proposalBoxesDecode( // gather offset (byte). __bang_mul_scalar(anchors_index_nram, anchors_index_nram, sizeof(int32_t), deal_num); + // deal_num <= 5163 __gather(temp_nram, anchors, (unsigned int *)anchors_index_nram, sizeof(T) * 4, GDRAM2NRAM, sizeof(T) * 4, deal_num); __bang_transpose(anchors_nram, temp_nram, deal_num, 4); diff --git a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu index b72087481..d94a2f021 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_fast_union1.mlu @@ -233,14 +233,14 @@ __mlu_func__ void backwardStageTwoLoop( for (int j = 0; j < 5; j++) { T* tmp_wp = weight_polation_nram + (j - 1) * nq_nl_np; if (j < 4) { - gatherAsync(v_ping, zeros_nram, (unsigned int*)offset_zero_nram_stg2, - bit_cond_reverse_nram + j * bit_cond_stride, - channels * sizeof(T), NRAM2NRAM, channels * sizeof(T), - nq_nl_np); - gatherAsync(v_ping, data_value_gdram, - (unsigned int*)offset_nram + j * nq_nl_np, - bit_cond_nram + j * bit_cond_stride, channels * sizeof(T), - GDRAM2NRAM, channels * sizeof(T), nq_nl_np); + gatherAsync(v_ping, zeros_nram, (unsigned int*)offset_zero_nram_stg2, + bit_cond_reverse_nram + j * bit_cond_stride, + channels * sizeof(T), NRAM2NRAM, channels * sizeof(T), + nq_nl_np); + gatherAsync( + v_ping, data_value_gdram, (unsigned int*)offset_nram + j * nq_nl_np, + bit_cond_nram + j * bit_cond_stride, channels * sizeof(T), + GDRAM2NRAM, channels * sizeof(T), nq_nl_np); } if (j == 0) { @@ -249,10 +249,10 @@ __mlu_func__ void backwardStageTwoLoop( NRAM2NRAM, channels * sizeof(T), num_levels_points - 1, num_levels_points * channels * sizeof(T), deal_n - 1, 0, num_levels_points - 1, channels * sizeof(T), deal_n - 1); - gatherAsync(buffer, zeros_nram, (unsigned int*)offset_zero_nram_stg2, - bit_cond_reverse_nram + 4 * bit_cond_stride, - channels * sizeof(T), NRAM2NRAM, channels * sizeof(T), - nq_nl_np); + gatherAsync(buffer, zeros_nram, (unsigned int*)offset_zero_nram_stg2, + bit_cond_reverse_nram + 4 * bit_cond_stride, + channels * sizeof(T), NRAM2NRAM, channels * sizeof(T), + nq_nl_np); __bang_write_value(value_wp, nq_nl_np_c, (T)0); // clear value*wp __sync_move(); // (n, nl, np, c) => (c, n, nl, np) @@ -332,7 +332,7 @@ __mlu_func__ void backwardStageTwoLoop( int32_t* dst_offset = (int32_t*)offset_zero_nram_stg2; for (int i = 0; i < 4; i++) { __bang_filter((T*)dst_offset + i * nq_nl_np, - (T*)offset_nram + i * nq_nl_np, cond_all_valid, nq_nl_np); + (T*)offset_nram + i * nq_nl_np, cond_all_valid, nq_nl_np); } int32_t* src_offset = (int32_t*)inter_grad; int32_t* stride_4_2 = dst_offset + 3 * nq_nl_np; @@ -368,7 +368,7 @@ __mlu_func__ void backwardStageTwoLoop( int32_t valid_count = __bang_sum(tmp_cond, nq_nl_np); if (valid_count > 0) { __bang_filter((T*)tmp_dst_offset, (T*)tmp_dst_offset, tmp_cond, - nq_nl_np); + nq_nl_np); __bang_filter((T*)tmp_src_offset, (T*)seq_nram, tmp_cond, nq_nl_np); __bang_mul_scalar(tmp_src_offset, tmp_src_offset, channels * sizeof(T), valid_count); diff --git a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu index 517c00a8c..9ff2a72e8 100644 --- 
a/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_backward/ms_deform_attn_backward_small_channels_union1.mlu @@ -25,6 +25,7 @@ #include "core/logging.h" #include "kernels/kernel.h" #include "kernels/utils/common.h" +#include "kernels/utils/scatter_gather.h" __nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; @@ -313,24 +314,25 @@ void __mlu_func__ loadValue( sizeof(int32_t), 4 * num_deal_grid); __sync_io_move_compute(); - __gather_async((void *)nram_grad_output_tl, (void *)data_value, - (unsigned int *)grad_temp3, deal_num_real * sizeof(float), - GDRAM2NRAM, deal_num_real * sizeof(float), num_deal_grid); - - __gather_async((void *)nram_grad_output_tr, (void *)data_value, - (unsigned int *)(grad_temp3 + num_deal_grid), - deal_num_real * sizeof(float), GDRAM2NRAM, - deal_num_real * sizeof(float), num_deal_grid); - - __gather_async((void *)nram_grad_output_bl, (void *)data_value, - (unsigned int *)(grad_temp3 + 2 * num_deal_grid), - deal_num_real * sizeof(float), GDRAM2NRAM, - deal_num_real * sizeof(float), num_deal_grid); - - __gather_async((void *)nram_grad_output_br, (void *)data_value, - (unsigned int *)(grad_temp3 + 3 * num_deal_grid), - deal_num_real * sizeof(float), GDRAM2NRAM, - deal_num_real * sizeof(float), num_deal_grid); + __mluop_gather((float *)nram_grad_output_tl, (float *)data_value, + (unsigned int *)grad_temp3, NULL, + deal_num_real * sizeof(float), GDRAM2NRAM, + deal_num_real * sizeof(float), num_deal_grid); + + __mluop_gather((float *)nram_grad_output_tr, (float *)data_value, + (unsigned int *)(grad_temp3 + num_deal_grid), NULL, + deal_num_real * sizeof(float), GDRAM2NRAM, + deal_num_real * sizeof(float), num_deal_grid); + + __mluop_gather((float *)nram_grad_output_bl, (float *)data_value, + (unsigned int *)(grad_temp3 + 2 * num_deal_grid), NULL, + deal_num_real * sizeof(float), GDRAM2NRAM, + deal_num_real * sizeof(float), num_deal_grid); + + __mluop_gather((float *)nram_grad_output_br, (float *)data_value, + (unsigned int *)(grad_temp3 + 3 * num_deal_grid), NULL, + deal_num_real * sizeof(float), GDRAM2NRAM, + deal_num_real * sizeof(float), num_deal_grid); __sync_io_move_compute(); #else diff --git a/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h b/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h index 0f4b4dd17..7ecb0b41f 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h +++ b/kernels/ms_deform_attn/ms_deform_attn_forward/ms_deform_attn_utils.h @@ -28,6 +28,7 @@ #include "kernels/kernel.h" #include "kernels/utils/common.h" +#include "kernels/utils/scatter_gather.h" #define BIT_COLLECT_PAD (8) #define BACKWARD_MAX_NQ_NL_NP (1024) @@ -377,19 +378,12 @@ __mlu_func__ void stageOneLoop( #endif #if (__BANG_ARCH__ >= 592) +template __mlu_func__ void gatherAsync(void* dst, void* src, unsigned int* offset, void* mask, int transfer_size, mluMemcpyDirection_t dir, int dst_stride, int transfer_num) { - __gather_async(dst, src, offset, mask, transfer_size, dir, dst_stride, - transfer_num); -} - -__mlu_func__ void gatherSync(void* dst, void* src, unsigned int* offset, - void* mask, int transfer_size, - mluMemcpyDirection_t dir, int dst_stride, - int transfer_num) { - __gather(dst, src, offset, mask, transfer_size, dir, dst_stride, - transfer_num); + __mluop_gather_async((T*)dst, (T*)src, offset, (uint8_t*)mask, + transfer_size, dir, dst_stride, transfer_num); } #endif diff --git 
a/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu b/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu index 2d29981e2..a4c61a979 100644 --- a/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu +++ b/kernels/ms_deform_attn/ms_deform_attn_forward/msda_forward_fast_union1.mlu @@ -290,7 +290,7 @@ __mlu_func__ void getConditionCoordWeight( } __bang_mul_scalar(buf_nram, weight_attn_nram, (T)1, total_points); __bang_filter((float*)weight_attn_nram, (float*)buf_nram, - cond_point_valid_nram, total_points); + cond_point_valid_nram, total_points); __bang_float2int32((int32_t*)cond_point_polation_nram, cond_point_polation_nram, total_points * 4, 0); __bang_mul_scalar((int32_t*)cond_point_polation_nram, @@ -300,16 +300,16 @@ __mlu_func__ void getConditionCoordWeight( (int8_t*)cond_point_polation_nram, total_points * 4 * sizeof(float)); __bang_filter((float*)weight_polation_nram, (float*)weight_polation_nram_tmp, - cond_point_valid_nram, total_points); + cond_point_valid_nram, total_points); __bang_filter((float*)weight_polation_nram + total_points, - (float*)weight_polation_nram_tmp + total_points, - cond_point_valid_nram, total_points); + (float*)weight_polation_nram_tmp + total_points, + cond_point_valid_nram, total_points); __bang_filter((float*)weight_polation_nram + 2 * total_points, - (float*)weight_polation_nram_tmp + 2 * total_points, - cond_point_valid_nram, total_points); + (float*)weight_polation_nram_tmp + 2 * total_points, + cond_point_valid_nram, total_points); __bang_filter((float*)weight_polation_nram + 3 * total_points, - (float*)weight_polation_nram_tmp + 3 * total_points, - cond_point_valid_nram, total_points); + (float*)weight_polation_nram_tmp + 3 * total_points, + cond_point_valid_nram, total_points); //================================================================================================ // select cond_point_polation_nram if value_contain_infnan if (value_contain_infnan) { @@ -318,17 +318,17 @@ __mlu_func__ void getConditionCoordWeight( (int32_t*)cond_point_polation_nram, (int32_t)1, total_points * 4); __bang_filter((float*)cond_point_polation_nram, - (float*)cond_point_polation_nram_tmp, cond_point_valid_nram, - total_points); + (float*)cond_point_polation_nram_tmp, cond_point_valid_nram, + total_points); __bang_filter((float*)cond_point_polation_nram + total_points, - (float*)cond_point_polation_nram_tmp + total_points, - cond_point_valid_nram, total_points); + (float*)cond_point_polation_nram_tmp + total_points, + cond_point_valid_nram, total_points); __bang_filter((float*)cond_point_polation_nram + 2 * total_points, - (float*)cond_point_polation_nram_tmp + 2 * total_points, - cond_point_valid_nram, total_points); + (float*)cond_point_polation_nram_tmp + 2 * total_points, + cond_point_valid_nram, total_points); __bang_filter((float*)cond_point_polation_nram + 3 * total_points, - (float*)cond_point_polation_nram_tmp + 3 * total_points, - cond_point_valid_nram, total_points); + (float*)cond_point_polation_nram_tmp + 3 * total_points, + cond_point_valid_nram, total_points); } //================================================================================================ // compute and select offset and stride @@ -348,11 +348,11 @@ __mlu_func__ void getConditionCoordWeight( (int32_t*)data_offset_nram_tr_tmp, (int32_t*)data_offset_nram_tl_tmp, total_points); __bang_filter((float*)data_offset_nram_tl, (float*)data_offset_nram_tl_tmp, - cond_point_valid_nram, total_points); + 
cond_point_valid_nram, total_points); __bang_filter((float*)data_offset_nram_bl, (float*)data_offset_nram_bl_tmp, - cond_point_valid_nram, total_points); + cond_point_valid_nram, total_points); __bang_filter((float*)data_offset_nram_tr, (float*)data_offset_nram_tr_tmp, - cond_point_valid_nram, total_points); + cond_point_valid_nram, total_points); } /* @@ -1068,12 +1068,12 @@ __mlu_func__ void forwardStageTwoLoop( __sync_io_move_compute(); if (i < loop_num) { - gatherAsync(v_load, zeros_nram, (unsigned int*)offset_zero_nram_stg2, - cond_nram_stg2_reverse, channels * sizeof(T), NRAM2NRAM, - channels * sizeof(T), load_point_num); - gatherAsync(v_load, data_value_gdram, (unsigned int*)offset_nram_stg2, - cond_nram_stg2, channels * sizeof(T), GDRAM2NRAM, - channels * sizeof(T), load_point_num); + gatherAsync(v_load, zeros_nram, (unsigned int*)offset_zero_nram_stg2, + cond_nram_stg2_reverse, channels * sizeof(T), NRAM2NRAM, + channels * sizeof(T), load_point_num); + gatherAsync(v_load, data_value_gdram, (unsigned int*)offset_nram_stg2, + cond_nram_stg2, channels * sizeof(T), GDRAM2NRAM, + channels * sizeof(T), load_point_num); } if (i > 0) { diff --git a/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu b/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu index d226df82c..e8c545e04 100644 --- a/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu +++ b/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu @@ -38,14 +38,12 @@ __mlu_func__ void mluopDivScalar(T *dst, T *src, T value, uint32_t num) { __asm__ volatile( "div.scalar.nram.f16 [%[dst]], [%[src0]], " "%[src1], %[num];\n\t" ::[dst] "r"(dst), - [ src0 ] "r"(src), [ src1 ] "r"(value), - [ num ] "r"(num)); + [ src0 ] "r"(src), [ src1 ] "r"(value), [ num ] "r"(num)); } else { __asm__ volatile( "div.scalar.nram.f32 [%[dst]], [%[src0]], " "%[src1], %[num];\n\t" ::[dst] "r"(dst), - [ src0 ] "r"(src), [ src1 ] "r"(value), - [ num ] "r"(num)); + [ src0 ] "r"(src), [ src1 ] "r"(value), [ num ] "r"(num)); } } @@ -314,6 +312,7 @@ __mlu_func__ void handleChannels(const T *input, uint32_t deal_channels, } uint32_t hwc_num = deal_channels * vec_num; + // vec_num <= 1024 __gather(val, input, pos, deal_channels * sizeof(T), GDRAM2NRAM, deal_channels * sizeof(T), vec_num); if (deal_channels != 1) { @@ -521,8 +520,7 @@ __mlu_global__ void roiAlignRotatedForward( } } } - mluopDivScalar(output_channels, output_channels, (T)count, - cur_cache_c); + mluopDivScalar(output_channels, output_channels, (T)count, cur_cache_c); __memcpy(output_dram + bin_i * channels + c_cache_i, output_channels, cur_cache_c * sizeof(T), NRAM2GDRAM); } diff --git a/kernels/utils/scatter_gather.h b/kernels/utils/scatter_gather.h new file mode 100644 index 000000000..729fd9c9d --- /dev/null +++ b/kernels/utils/scatter_gather.h @@ -0,0 +1,90 @@ +/************************************************************************* + * Copyright (C) [2024] by Cambricon, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ + +#include "kernels/kernel.h" + +#define SCATTER_GATHER_PARAMS \ + T *dst, const T *src, const uint32_t *offset, const uint8_t *mask, \ + const uint32_t transfer_size, const mluMemcpyDirection_t dir, \ + const uint32_t stride, const uint32_t data_num + +#if __BANG_ARCH__ > 592 +#define MLUOP_SCATTER_GATHER(func, is_scatter) \ + template \ + __mlu_func__ void __mluop_##func(SCATTER_GATHER_PARAMS) { \ + if (data_num <= UINT16_MAX) { \ + if (mask) { \ + __##func(dst, src, offset, (const void *)mask, transfer_size, dir, \ + stride, data_num); \ + } else { \ + __##func(dst, src, offset, transfer_size, dir, stride, data_num); \ + } \ + } else { \ + uint16_t data_num_new = PAD_DOWN(UINT16_MAX, 64); \ + uint32_t remain = data_num % data_num_new; \ + uint32_t repeat = data_num / data_num_new + uint32_t(remain > 0); \ + uint32_t dst_offset = is_scatter ? 0 : data_num_new; \ + uint32_t src_offset = is_scatter ? data_num_new : 0; \ + \ + for (uint32_t i = 0; i <= repeat; ++i) { \ + const uint16_t data_num_loop = i < repeat ? 
data_num_new : remain; \ + if (mask) { \ + __##func(dst + i * dst_offset, src + i * src_offset, \ + mask + i * (data_num_new / 8), offset + i * data_num_new, \ + transfer_size, dir, stride, data_num_loop); \ + } else { \ + __##func(dst + i * dst_offset, src + i * src_offset, \ + offset + i * data_num_new, transfer_size, dir, stride, \ + data_num_loop); \ + } \ + } \ + } \ + } + +/* __mlu_op_scatter + * __mlu_op_scatter_async + * __mlu_op_gather + * __mlu_op_gather_async + */ +MLUOP_SCATTER_GATHER(gather_async, false) +MLUOP_SCATTER_GATHER(gather, false) +MLUOP_SCATTER_GATHER(scatter_async, true) +MLUOP_SCATTER_GATHER(scatter, true) + +#elif __BANG_ARCH__ == 592 +#define MLUOP_SCATTER_GATHER(func) \ + template \ + __mlu_func__ void __mluop_##func(SCATTER_GATHER_PARAMS) { \ + if (mask) { \ + __##func(dst, src, offset, mask, transfer_size, dir, stride, data_num); \ + } else { \ + __##func(dst, src, offset, transfer_size, dir, stride, data_num); \ + } \ + } + +MLUOP_SCATTER_GATHER(gather_async) +MLUOP_SCATTER_GATHER(gather) +MLUOP_SCATTER_GATHER(scatter_async) +MLUOP_SCATTER_GATHER(scatter) + +#endif // __BANG_ARCH__ > 592 diff --git a/kernels/voxel_pooling_forward/voxel_pooling_forward_union1.mlu b/kernels/voxel_pooling_forward/voxel_pooling_forward_union1.mlu index 90ecc8363..a7ff5fbb8 100644 --- a/kernels/voxel_pooling_forward/voxel_pooling_forward_union1.mlu +++ b/kernels/voxel_pooling_forward/voxel_pooling_forward_union1.mlu @@ -25,6 +25,7 @@ #include "core/logging.h" #include "kernels/kernel.h" #include "kernels/utils/common.h" +#include "kernels/utils/scatter_gather.h" __nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; @@ -392,10 +393,11 @@ __mlu_func__ void MLUKernelVoxelPoolingStageTwoPerfKernel( __bang_ge_bitindex((float *)gather_mask, (float *)nram_geom + point_idx_offset, (float *)nram_geom_x, align_8_deal_num); - __gather((float *)gather_src, (float *)input_features, - (unsigned int *)gather_offset + point_idx_offset, - (void *)gather_mask, num_channels * sizeof(float), GDRAM2NRAM, - num_channels * sizeof(float), actual_load_num); + __mluop_gather((float *)gather_src, (float *)input_features, + (unsigned int *)gather_offset + point_idx_offset, + (uint8_t *)gather_mask, + num_channels * sizeof(float), GDRAM2NRAM, + num_channels * sizeof(float), actual_load_num); for (int index = 0; index < actual_load_num; index++) { int output_features_pt_offset = nram_geom[point_idx_offset + index]; if (output_features_pt_offset >= 0) { diff --git a/kernels/voxelization/voxelization_kernel.mlu b/kernels/voxelization/voxelization_kernel.mlu index 04f5580e7..9832ab4bf 100644 --- a/kernels/voxelization/voxelization_kernel.mlu +++ b/kernels/voxelization/voxelization_kernel.mlu @@ -28,6 +28,7 @@ #include "core/logging.h" #include "kernels/kernel.h" #include "kernels/utils/common.h" +#include "kernels/utils/scatter_gather.h" __nram__ int8_t nram_buffer[MAX_NRAM_SIZE]; @@ -547,9 +548,10 @@ __mlu_global__ void mluCalcPointsPerVoxel( // compute scatter src: voxel_idx __bang_add_scalar(nram_temp_mask, nram_base_offset, voxel_num_temp, deal_num); - __scatter(nram_scatter_output, nram_temp_mask, - (unsigned int *)nram_scatter_offset, nram_mask_bitindex, - sizeof(int32_t), NRAM2NRAM, sizeof(int32_t), reserve_voxels); + __mluop_scatter(nram_scatter_output, nram_temp_mask, + (unsigned int *)nram_scatter_offset, + (uint8_t *)nram_mask_bitindex, sizeof(int32_t), + NRAM2NRAM, sizeof(int32_t), reserve_voxels); __memcpy(num_points_per_voxel + voxel_num_temp, nram_scatter_mask, reserve_voxels * sizeof(int32_t), NRAM2GDRAM); 
voxel_num_temp += reserve_voxels; @@ -568,8 +570,9 @@ __mlu_global__ void mluCalcPointsPerVoxel( if (count > 0) { __bang_mul_scalar(nram_p2p_idx, nram_p2p_idx, sizeof(int32_t), count); // get repeated point real point_id - __gather(gather_output, coor_to_voxelidx, (unsigned int *)nram_p2p_idx, - sizeof(int32_t), GDRAM2NRAM, sizeof(int32_t), count); + __mluop_gather( + gather_output, coor_to_voxelidx, (unsigned int *)nram_p2p_idx, NULL, + sizeof(int32_t), GDRAM2NRAM, sizeof(int32_t), count); __bang_eq_scalar(nram_scatter_mask, gather_output, -1, count); __bang_not(nram_scatter_mask, nram_scatter_mask, count); __bang_gt_bitindex((float *)nram_mask_bitindex, @@ -582,9 +585,10 @@ __mlu_global__ void mluCalcPointsPerVoxel( gather_mask, deal_num); __bang_mul_scalar(nram_temp_mask, nram_temp_mask, sizeof(int32_t), deal_num); - __scatter(coor_to_voxelidx, gather_output, - (unsigned int *)nram_temp_mask, nram_mask_bitindex, - sizeof(int32_t), NRAM2GDRAM, sizeof(int32_t), count); + __mluop_scatter(coor_to_voxelidx, gather_output, + (unsigned int *)nram_temp_mask, + (uint8_t *)nram_mask_bitindex, sizeof(int32_t), + NRAM2GDRAM, sizeof(int32_t), count); // step4: compute num_points_per_voxel for (int32_t i = 0; i < count; i++) { From 662a162aa448ff1b800f2a109c83a6823ad7b4f8 Mon Sep 17 00:00:00 2001 From: niyuming Date: Fri, 6 Dec 2024 15:37:44 +0800 Subject: [PATCH 4/7] [Fix](mluOpExecFFT): fix core dump, scale factor and one point compute error (#1159) Co-authored-by: niyuming --- kernels/fft/c2c_fft/c2c_fft_host.cpp | 260 ++++++------ kernels/fft/common/fft_basic_ops.cpp | 16 +- kernels/fft/common/fft_common_kernels.mlu | 3 +- kernels/fft/fft.cpp | 114 ++++-- kernels/fft/fft.h | 30 +- .../fft_optm_device/fft_c2c_stockham_nram.h | 373 ------------------ .../fft_two-level_network_c2c_device.mlu | 48 +-- .../fft_two-level_network_c2r_device.mlu | 24 +- .../fft_two-level_network_r2c_device.mlu | 26 +- kernels/fft/irfft/irfft_host.cpp | 245 ++++++------ kernels/fft/rfft/rfft_host.cpp | 268 +++++++------ .../tensor_stride_process_host.cpp | 3 +- 12 files changed, 561 insertions(+), 849 deletions(-) diff --git a/kernels/fft/c2c_fft/c2c_fft_host.cpp b/kernels/fft/c2c_fft/c2c_fft_host.cpp index 29c53d61f..0f7fc3a6f 100644 --- a/kernels/fft/c2c_fft/c2c_fft_host.cpp +++ b/kernels/fft/c2c_fft/c2c_fft_host.cpp @@ -648,13 +648,13 @@ mluOpStatus_t setFFT2dReserveArea(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, } } - int _n0 = fft_plan->n[0]; - int _n1 = fft_plan->n[1]; + int n0_ori = fft_plan->n[0]; + int n1_ori = fft_plan->n[1]; if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) { size_t factors_size = FFT_MAXFACTORS * sizeof(int); // bytes - size_t twiddles_size = CPX_TYPE_SIZE * _n1; - size_t twiddles_size_2d = CPX_TYPE_SIZE * _n0; + size_t twiddles_size = CPX_TYPE_SIZE * n1_ori; + size_t twiddles_size_2d = CPX_TYPE_SIZE * n0_ori; size_t reservespace_offset = 0; @@ -794,19 +794,20 @@ mluOpStatus_t setFFT2dReserveArea(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, size_t reservespace_offset = 0; fft_plan->mlu_addrs.dft_matrix = (uint8_t *)fft_plan->reservespace_addr + reservespace_offset; - reservespace_offset += CPX_TYPE_SIZE * (_n1 / 2 + 1) * _n1; + reservespace_offset += CPX_TYPE_SIZE * (n1_ori / 2 + 1) * n1_ori; fft_plan->mlu_addrs.dft_matrix_2d = (uint8_t *)fft_plan->reservespace_addr + reservespace_offset; - reservespace_offset += CPX_TYPE_SIZE * _n0 * _n0; + reservespace_offset += CPX_TYPE_SIZE * n0_ori * n0_ori; CNRT_CHECK(cnrtMemcpyAsync(fft_plan->mlu_addrs.dft_matrix, fft_plan->dft_matrix, 
- CPX_TYPE_SIZE * (_n1 / 2 + 1) * _n1, + CPX_TYPE_SIZE * (n1_ori / 2 + 1) * n1_ori, + handle->queue, cnrtMemcpyHostToDev)); + CNRT_CHECK(cnrtMemcpyAsync(fft_plan->mlu_addrs.dft_matrix_2d, + fft_plan->dft_matrix_2d, + CPX_TYPE_SIZE * n0_ori * n0_ori, handle->queue, cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpyAsync( - fft_plan->mlu_addrs.dft_matrix_2d, fft_plan->dft_matrix_2d, - CPX_TYPE_SIZE * _n0 * _n0, handle->queue, cnrtMemcpyHostToDev)); } break; case CNFFT_COMPLEX_HALF2COMPLEX_HALF: case CNFFT_COMPLEX_FLOAT2COMPLEX_FLOAT: { @@ -814,34 +815,38 @@ mluOpStatus_t setFFT2dReserveArea(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, size_t reservespace_offset = 0; fft_plan->mlu_addrs.dft_matrix = (uint8_t *)fft_plan->reservespace_addr + reservespace_offset; - reservespace_offset += CPX_TYPE_SIZE * _n1 * _n1; + reservespace_offset += CPX_TYPE_SIZE * n1_ori * n1_ori; fft_plan->mlu_addrs.dft_matrix_2d = (uint8_t *)fft_plan->reservespace_addr + reservespace_offset; - reservespace_offset += CPX_TYPE_SIZE * _n0 * _n0; + reservespace_offset += CPX_TYPE_SIZE * n0_ori * n0_ori; fft_plan->mlu_addrs.idft_matrix = (uint8_t *)fft_plan->reservespace_addr + reservespace_offset; - reservespace_offset += CPX_TYPE_SIZE * _n1 * _n1; + reservespace_offset += CPX_TYPE_SIZE * n1_ori * n1_ori; fft_plan->mlu_addrs.idft_matrix_2d = (uint8_t *)fft_plan->reservespace_addr + reservespace_offset; - reservespace_offset += CPX_TYPE_SIZE * _n0 * _n0; - - CNRT_CHECK(cnrtMemcpyAsync( - fft_plan->mlu_addrs.dft_matrix, fft_plan->dft_matrix, - CPX_TYPE_SIZE * _n1 * _n1, handle->queue, cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpyAsync( - fft_plan->mlu_addrs.dft_matrix_2d, fft_plan->dft_matrix_2d, - - CPX_TYPE_SIZE * _n0 * _n0, handle->queue, cnrtMemcpyHostToDev)); - - CNRT_CHECK(cnrtMemcpyAsync( - fft_plan->mlu_addrs.idft_matrix, fft_plan->idft_matrix, - CPX_TYPE_SIZE * _n1 * _n1, handle->queue, cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpyAsync( - fft_plan->mlu_addrs.idft_matrix_2d, fft_plan->idft_matrix_2d, - CPX_TYPE_SIZE * _n0 * _n0, handle->queue, cnrtMemcpyHostToDev)); + reservespace_offset += CPX_TYPE_SIZE * n0_ori * n0_ori; + + CNRT_CHECK(cnrtMemcpyAsync(fft_plan->mlu_addrs.dft_matrix, + fft_plan->dft_matrix, + CPX_TYPE_SIZE * n1_ori * n1_ori, + handle->queue, cnrtMemcpyHostToDev)); + CNRT_CHECK(cnrtMemcpyAsync(fft_plan->mlu_addrs.dft_matrix_2d, + fft_plan->dft_matrix_2d, + + CPX_TYPE_SIZE * n0_ori * n0_ori, + handle->queue, cnrtMemcpyHostToDev)); + + CNRT_CHECK(cnrtMemcpyAsync(fft_plan->mlu_addrs.idft_matrix, + fft_plan->idft_matrix, + CPX_TYPE_SIZE * n1_ori * n1_ori, + handle->queue, cnrtMemcpyHostToDev)); + CNRT_CHECK(cnrtMemcpyAsync(fft_plan->mlu_addrs.idft_matrix_2d, + fft_plan->idft_matrix_2d, + CPX_TYPE_SIZE * n0_ori * n0_ori, + handle->queue, cnrtMemcpyHostToDev)); }; break; case CNFFT_COMPLEX_HALF2HALF: case CNFFT_COMPLEX_FLOAT2FLOAT: { @@ -849,19 +854,20 @@ mluOpStatus_t setFFT2dReserveArea(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, size_t reservespace_offset = 0; fft_plan->mlu_addrs.dft_matrix = (uint8_t *)fft_plan->reservespace_addr + reservespace_offset; - reservespace_offset += CPX_TYPE_SIZE * (_n1 / 2 + 1) * _n1; + reservespace_offset += CPX_TYPE_SIZE * (n1_ori / 2 + 1) * n1_ori; fft_plan->mlu_addrs.dft_matrix_2d = (uint8_t *)fft_plan->reservespace_addr + reservespace_offset; - reservespace_offset += CPX_TYPE_SIZE * _n0 * _n0; + reservespace_offset += CPX_TYPE_SIZE * n0_ori * n0_ori; CNRT_CHECK(cnrtMemcpyAsync(fft_plan->mlu_addrs.dft_matrix, fft_plan->dft_matrix, - CPX_TYPE_SIZE * (_n1 / 2 + 1) * 
_n1, + CPX_TYPE_SIZE * (n1_ori / 2 + 1) * n1_ori, + handle->queue, cnrtMemcpyHostToDev)); + CNRT_CHECK(cnrtMemcpyAsync(fft_plan->mlu_addrs.dft_matrix_2d, + fft_plan->dft_matrix_2d, + CPX_TYPE_SIZE * n0_ori * n0_ori, handle->queue, cnrtMemcpyHostToDev)); - CNRT_CHECK(cnrtMemcpyAsync( - fft_plan->mlu_addrs.dft_matrix_2d, fft_plan->dft_matrix_2d, - CPX_TYPE_SIZE * _n0 * _n0, handle->queue, cnrtMemcpyHostToDev)); }; break; default: { LOG(ERROR) << make_plan_api << ": invalid 2d fft type."; @@ -1060,13 +1066,13 @@ static void configureFFT2dWorkspaceAddrs(mluOpHandle_t handle, size_t out_c_dtype_size = mluOpDataTypeBytes(out_c_dtype); int batch = fft_plan->batch; - int _n0 = fft_plan->n[0]; - int _n1 = fft_plan->n[1]; + int n0_ori = fft_plan->n[0]; + int n1_ori = fft_plan->n[1]; size_t offset = 0; if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { // rr ri ir ii - size_t buffer_size = batch * in_c_dtype_size * _n0 * _n1 * 2; + size_t buffer_size = batch * in_c_dtype_size * n0_ori * n1_ori * 2; fft_plan->mlu_addrs.input = input; fft_plan->mlu_addrs.output = output; fft_plan->mlu_addrs.buffer_in = (uint8_t *)workspace + offset; @@ -1077,27 +1083,29 @@ static void configureFFT2dWorkspaceAddrs(mluOpHandle_t handle, if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) { fft_plan->mlu_addrs.buffer_buf = (uint8_t *)workspace + offset; - offset += batch * in_c_dtype_size * _n0 * _n1 * 2; + offset += batch * in_c_dtype_size * n0_ori * n1_ori * 2; - if (fft_plan->is_input_contiguous) { + if ((fft_plan->is_input_contiguous && + fft_plan->inembed[0] <= fft_plan->n[0] && + fft_plan->inembed[1] <= fft_plan->n[1])) { fft_plan->mlu_addrs.input = input; } else { fft_plan->mlu_addrs.input = (uint8_t *)workspace + offset; - offset += batch * in_c_dtype_size * _n0 * _n1; + offset += batch * in_c_dtype_size * n0_ori * n1_ori; } if (fft_plan->is_output_contiguous) { fft_plan->mlu_addrs.output = output; } else { fft_plan->mlu_addrs.output = (uint8_t *)workspace + offset; - offset += batch * in_c_dtype_size * _n0 * _n1; + offset += batch * in_c_dtype_size * n0_ori * n1_ori; } } if (fft_plan->n[0] > fft_plan->inembed[0] || fft_plan->n[1] > fft_plan->inembed[1]) { fft_plan->mlu_addrs.input_pad_addr = - (uint8_t *)workspace + - offset; // batch * in_c_dtype_size * _n0 * _n1 * 2; // buffer_size; + (uint8_t *)workspace + offset; // batch * in_c_dtype_size * n0_ori * + // n1_ori * 2; // buffer_size; } } // input : in input @@ -1115,11 +1123,11 @@ static mluOpStatus_t makeFFT1dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 2; - int64_t dims[in_dim_num] = {fft_plan->batch, fft_plan->inembed[0]}; - int64_t strides[in_dim_num] = {fft_plan->idist, fft_plan->istride}; + const int IN_DIM_NUM = 2; + int64_t dims[IN_DIM_NUM] = {fft_plan->batch, fft_plan->inembed[0]}; + int64_t strides[IN_DIM_NUM] = {fft_plan->idist, fft_plan->istride}; status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->input_dtype, in_dim_num, + fft_plan->input_dtype, IN_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); @@ -1140,12 +1148,12 @@ static mluOpStatus_t makeFFT1dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 2; - int64_t dims[in_dim_num] = { + const int IN_DIM_NUM = 2; + int64_t dims[IN_DIM_NUM] = { fft_plan->batch, std::min(fft_plan->n[0], 
fft_plan->inembed[0])}; - int64_t strides[in_dim_num] = {fft_plan->idist, fft_plan->istride}; + int64_t strides[IN_DIM_NUM] = {fft_plan->idist, fft_plan->istride}; status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->input_dtype, in_dim_num, + fft_plan->input_dtype, IN_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); @@ -1176,15 +1184,17 @@ static mluOpStatus_t makeFFT2dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 3; - int64_t dims[in_dim_num] = {fft_plan->batch, + const int IN_DIM_NUM = 3; + int64_t dims[IN_DIM_NUM] = {fft_plan->batch, std::min(fft_plan->n[0], fft_plan->inembed[0]), std::min(fft_plan->n[1], fft_plan->inembed[1])}; - int64_t strides[in_dim_num] = {fft_plan->idist, - (fft_plan->istride * fft_plan->inembed[1]), - fft_plan->istride}; + + int64_t strides[IN_DIM_NUM]; // IN_DIM_NUM + for (int i = 0; i < IN_DIM_NUM; i++) { + strides[i] = fft_plan->in_stride[i]; + } status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->input_dtype, in_dim_num, + fft_plan->input_dtype, IN_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); @@ -1220,15 +1230,15 @@ static mluOpStatus_t padFFT1dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&padded_input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 2; - int64_t dims[in_dim_num] = {batch, fft_plan->inembed[0] * COMPLEX}; + const int IN_DIM_NUM = 2; + int64_t dims[IN_DIM_NUM] = {batch, fft_plan->inembed[0] * COMPLEX}; status = mluOpSetTensorDescriptor_v2(input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, dims); + in_r_dtype, IN_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - int64_t padded_dims[in_dim_num] = {batch, n * COMPLEX}; + int64_t padded_dims[IN_DIM_NUM] = {batch, n * COMPLEX}; status = mluOpSetTensorDescriptor_v2(padded_input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, padded_dims); + in_r_dtype, IN_DIM_NUM, padded_dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); const int pad_dim_num = 4; @@ -1284,16 +1294,16 @@ static mluOpStatus_t padFFT2dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&padded_input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 3; - int64_t dims[in_dim_num] = {batch, std::min(fft_plan->inembed[0], n0), + const int IN_DIM_NUM = 3; + int64_t dims[IN_DIM_NUM] = {batch, std::min(fft_plan->inembed[0], n0), std::min(fft_plan->inembed[1], n1) * COMPLEX}; status = mluOpSetTensorDescriptor_v2(input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, dims); + in_r_dtype, IN_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - int64_t padded_dims[in_dim_num] = {batch, n0, n1 * COMPLEX}; + int64_t padded_dims[IN_DIM_NUM] = {batch, n0, n1 * COMPLEX}; status = mluOpSetTensorDescriptor_v2(padded_input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, padded_dims); + in_r_dtype, IN_DIM_NUM, padded_dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); const int pad_dim_num = 6; @@ -1756,17 +1766,17 @@ static mluOpStatus_t makeFFT1dContiguousOutput(mluOpHandle_t handle, INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // set up tensor desc - const int out_dim_num = 2; - int64_t dims[out_dim_num] = {fft_plan->batch, (fft_plan->prime) + const int OUT_DIM_NUM = 2; + int64_t dims[OUT_DIM_NUM] = 
{fft_plan->batch, (fft_plan->prime) ? fft_plan->onembed[0] : fft_plan->n[0]}; - int64_t strides[out_dim_num] = {fft_plan->odist, fft_plan->ostride}; + int64_t strides[OUT_DIM_NUM] = {fft_plan->odist, fft_plan->ostride}; status = mluOpSetTensorDescriptor_v2(copy_src_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims); + out_c_dtype, OUT_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorEx_v2(copy_dst_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims, strides); + out_c_dtype, OUT_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); void *copy_src_addr = (fft_plan->prime) @@ -1779,17 +1789,8 @@ static mluOpStatus_t makeFFT1dContiguousOutput(mluOpHandle_t handle, cnnl_copy_src_desc); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(copy_dst_desc, cnnl_copy_dst_desc); - size_t workspace_size = 0; - CALL_CNNL(cnnlGetCopyWorkspaceSize(cnnl_handle, cnnl_copy_src_desc, - cnnl_copy_dst_desc, &workspace_size)); - - void *workspace = nullptr; - if (workspace_size > 0) { - CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); - } CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_copy_src_desc, copy_src_addr, - cnnl_copy_dst_desc, output, workspace, - workspace_size)); + cnnl_copy_dst_desc, output, NULL, 0)); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_src_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_dst_desc); DESTROY_CNNL_HANDLE(cnnl_handle); @@ -1815,18 +1816,19 @@ static mluOpStatus_t makeFFT2dContiguousOutput(mluOpHandle_t handle, INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // set up tensor desc - const int out_dim_num = 3; - int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0], + const int OUT_DIM_NUM = 3; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->n[0], fft_plan->n[1]}; - int64_t strides[out_dim_num] = {fft_plan->odist, - fft_plan->ostride * fft_plan->onembed[1], - fft_plan->ostride}; + int64_t strides[OUT_DIM_NUM]; // OUT_DIM_NUM + for (int i = 0; i < OUT_DIM_NUM; i++) { + strides[i] = fft_plan->out_stride[i]; + } status = mluOpSetTensorDescriptor_v2(copy_src_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims); + out_c_dtype, OUT_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorEx_v2(copy_dst_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims, strides); + out_c_dtype, OUT_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // void *copy_src_addr = fft_plan->matmul_addrs.output_contiguous_addr; @@ -1838,18 +1840,8 @@ static mluOpStatus_t makeFFT2dContiguousOutput(mluOpHandle_t handle, cnnl_copy_src_desc); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(copy_dst_desc, cnnl_copy_dst_desc); - - size_t workspace_size = 0; - CALL_CNNL(cnnlGetCopyWorkspaceSize(cnnl_handle, cnnl_copy_src_desc, - cnnl_copy_dst_desc, &workspace_size)); - - void *workspace = nullptr; - if (workspace_size > 0) { - CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); - } CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_copy_src_desc, copy_src_addr, - cnnl_copy_dst_desc, output, workspace, - workspace_size)); + cnnl_copy_dst_desc, output, NULL, 0)); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_src_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_dst_desc); @@ -2003,12 +1995,16 @@ mluOpStatus_t execFFT1d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, const float beta[2] = {0.0, 0.0}; mluOpTensorDescriptor_t c_desc = nullptr; status = mluOpCreateTensorDescriptor(&c_desc); - const int out_dim_num = 2; - int64_t 
dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0]}; + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + const int OUT_DIM_NUM = 2; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->n[0]}; status = mluOpSetTensorDescriptor_v2(c_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->output_dtype, 2, dims); + fft_plan->output_dtype, OUT_DIM_NUM, + dims); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorOnchipDataType( c_desc, fft_plan->execution_dtype); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); // convert to cnnl_handle @@ -2019,6 +2015,8 @@ mluOpStatus_t execFFT1d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, cnnl_output_desc, fft_plan->mlu_addrs.output, &beta, cnnl_output_desc, fft_plan->mlu_addrs.output)); + status = mluOpDestroyTensorDescriptor(c_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); DESTROY_CNNL_HANDLE(cnnl_handle); } @@ -2053,7 +2051,34 @@ mluOpStatus_t execFFT2d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, fft_plan->mlu_addrs.input = fft_plan->mlu_addrs.input_pad_addr; } - status = execFFTc2c2d(handle, fft_plan, scale_factor, direction); + if (fft_plan->n[0] == 1 && fft_plan->n[1] == 1) { + mluOpTensorDescriptor_t c_desc = nullptr; + status = mluOpCreateTensorDescriptor(&c_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + const int OUT_DIM_NUM = 3; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->n[0], + fft_plan->n[1]}; + status = mluOpSetTensorDescriptor_v2( + c_desc, MLUOP_LAYOUT_ARRAY, fft_plan->output_dtype, OUT_DIM_NUM, dims); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + status = mluOpSetTensorDescriptorOnchipDataType(c_desc, + fft_plan->execution_dtype); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + + DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, + cnnl_handle); // convert to cnnl_handle + + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(c_desc, cnnl_output_desc); + CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_output_desc, + fft_plan->mlu_addrs.input, cnnl_output_desc, + fft_plan->mlu_addrs.output, NULL, 0)); + status = mluOpDestroyTensorDescriptor(c_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); + DESTROY_CNNL_HANDLE(cnnl_handle); + } else { + status = execFFTc2c2d(handle, fft_plan, scale_factor, direction); + } INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); @@ -2062,13 +2087,16 @@ mluOpStatus_t execFFT2d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, const float beta[2] = {0.0, 0.0}; mluOpTensorDescriptor_t c_desc = nullptr; status = mluOpCreateTensorDescriptor(&c_desc); - const int out_dim_num = 3; - int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0], + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + const int OUT_DIM_NUM = 3; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->n[0], fft_plan->n[1]}; - status = mluOpSetTensorDescriptor_v2(c_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->output_dtype, 3, dims); + status = mluOpSetTensorDescriptor_v2( + c_desc, MLUOP_LAYOUT_ARRAY, fft_plan->output_dtype, OUT_DIM_NUM, dims); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorOnchipDataType(c_desc, fft_plan->execution_dtype); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); // convert to cnnl_handle @@ -2079,6 +2107,8 @@ mluOpStatus_t execFFT2d(mluOpHandle_t 
handle, const mluOpFFTPlan_t fft_plan, cnnl_output_desc, fft_plan->mlu_addrs.output, &beta, cnnl_output_desc, fft_plan->mlu_addrs.output)); + status = mluOpDestroyTensorDescriptor(c_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); DESTROY_CNNL_HANDLE(cnnl_handle); } @@ -2296,11 +2326,11 @@ mluOpStatus_t computeFFT2dMatMulRow(mluOpHandle_t handle, int requested_algo_count = 1, return_algo_count = 0; float *workspace; size_t workspace_size; - cnnlGetBatchMatMulAlgoHeuristic( + cnnlGetBatchMatMulExAlgoHeuristic( cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL, requested_algo_count, &heuristic_result, &return_algo_count); - cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size); + cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size); if (workspace_size > 0) { CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); @@ -2308,10 +2338,10 @@ mluOpStatus_t computeFFT2dMatMulRow(mluOpHandle_t handle, CNRT_CHECK(cnrtMalloc((void **)&workspace, m * n * sizeof(float))); } - CALL_CNNL(cnnlBatchMatMulBCast_v2(cnnl_handle, bmm_bcast_desc, algo, &alpha, - cnnl_a_desc, dft_matrix_addr, cnnl_b_desc, - in_addr, &beta, cnnl_c_desc, out_addr, - (void *)workspace, workspace_size)); + CALL_CNNL(cnnlBatchMatMulEx(cnnl_handle, bmm_bcast_desc, algo, &alpha, + cnnl_a_desc, dft_matrix_addr, cnnl_b_desc, + in_addr, &beta, cnnl_c_desc, out_addr, + (void *)workspace, workspace_size)); // destroy cnnl descriptor DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc); diff --git a/kernels/fft/common/fft_basic_ops.cpp b/kernels/fft/common/fft_basic_ops.cpp index b928cfe13..39eae3b71 100644 --- a/kernels/fft/common/fft_basic_ops.cpp +++ b/kernels/fft/common/fft_basic_ops.cpp @@ -495,10 +495,10 @@ mluOpStatus_t fftGetBatchMatMulBcastWorkspaceSize( cnnlMatMulHeuristicResult_t heuristic_result; CALL_CNNL(cnnlCreateMatMulHeuristicResult(&heuristic_result)); int requested_algo_count = 1, return_algo_count = 0; - cnnlGetBatchMatMulAlgoHeuristic( + cnnlGetBatchMatMulExAlgoHeuristic( cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL, requested_algo_count, &heuristic_result, &return_algo_count); - cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size); + cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size); // destroy descriptor // destroy cnnl descriptor DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); @@ -595,20 +595,20 @@ mluOpStatus_t fftBatchMatMulBcast( alpha = 1.0; beta = 0.0; int requested_algo_count = 1, return_algo_count = 0; - cnnlGetBatchMatMulAlgoHeuristic( + cnnlGetBatchMatMulExAlgoHeuristic( cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL, requested_algo_count, &heuristic_result, &return_algo_count); - cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size); + cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size); if (workspace_size > 0) { CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); } else { CNRT_CHECK(cnrtMalloc((void **)&workspace, m * n * sizeof(float))); } - CALL_CNNL(cnnlBatchMatMulBCast_v2(cnnl_handle, bmm_bcast_desc, algo, &alpha, - cnnl_a_desc, a_ptr, cnnl_b_desc, b_ptr, - &beta, cnnl_c_desc, c_ptr, - (void *)workspace, workspace_size)); + CALL_CNNL(cnnlBatchMatMulEx(cnnl_handle, bmm_bcast_desc, algo, &alpha, + cnnl_a_desc, a_ptr, cnnl_b_desc, b_ptr, + &beta, cnnl_c_desc, c_ptr, + (void *)workspace, 
workspace_size)); // destroy descriptor // destroy cnnl descriptor DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); diff --git a/kernels/fft/common/fft_common_kernels.mlu b/kernels/fft/common/fft_common_kernels.mlu index 8cca3a697..d9e48157d 100644 --- a/kernels/fft/common/fft_common_kernels.mlu +++ b/kernels/fft/common/fft_common_kernels.mlu @@ -109,7 +109,8 @@ __mlu_func__ void selectVec(float *src_addr, int32_t *offset_int_addr, __asm__ volatile( "gather.clean.nram.nram.nram.b32.u32 " "[%[dst]], [%[src]], [%[offset]], %[data_num];\n\t" ::[dst] "r"(dst_addr), - [src] "r"(src_addr), [offset] "r"(offset_int_addr), [data_num] "r"(deal_size)); + [ src ] "r"(src_addr), [ offset ] "r"(offset_int_addr), + [ data_num ] "r"(deal_size)); #else for (auto i = 0; i < deal_size; i++) { dst_addr[i] = src_addr[offset_int_addr[i]]; diff --git a/kernels/fft/fft.cpp b/kernels/fft/fft.cpp index 4d4ab9ef1..8886e0453 100644 --- a/kernels/fft/fft.cpp +++ b/kernels/fft/fft.cpp @@ -1657,7 +1657,7 @@ mluOpAllocateC2C1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, fft_plan->is_batch_contiguous) ? 0 : buffer_size; - if (fft_plan->n[0] > fft_plan->inembed[0]) { + if (fft_plan->n[0] != fft_plan->inembed[0]) { workspace_size += buffer_size; } size_t twiddles_size = in_c_dtype_size * nfft * 2; @@ -1701,7 +1701,7 @@ mluOpAllocateR2C1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */ + twiddles_size * 2 + DFT_TABLE_SIZE * 2; /* twiddles */ - if (fft_plan->n[0] > fft_plan->inembed[0]) { + if (fft_plan->n[0] != fft_plan->inembed[0]) { workspace_size += buffer_size; // input_pad_addr } fft_plan->workspace_size = workspace_size; @@ -1721,18 +1721,18 @@ mluOpStatus_t MLUOP_WIN_API mluOpAllocateC2C2D( size_t in_c_dtype_size = mluOpDataTypeBytes(in_c_dtype); int batch = fft_plan->batch; - const int _n0 = fft_plan->n[0]; - const int _n1 = fft_plan->n[1]; + const int n0_ori = fft_plan->n[0]; + const int n1_ori = fft_plan->n[1]; - size_t buffer_size = batch * in_c_dtype_size * _n0 * _n1; + size_t buffer_size = batch * in_c_dtype_size * n0_ori * n1_ori; - size_t twiddles_size = in_c_dtype_size * _n0; - size_t twiddles_size_2d = in_c_dtype_size * _n1; + size_t twiddles_size = in_c_dtype_size * n0_ori; + size_t twiddles_size_2d = in_c_dtype_size * n1_ori; if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { - reservespace_size = - (in_c_dtype_size * _n0 * _n0 + in_c_dtype_size * _n1 * _n1) * - 2; /* DFT matrix */ + reservespace_size = (in_c_dtype_size * n0_ori * n0_ori + + in_c_dtype_size * n1_ori * n1_ori) * + 2; /* DFT matrix */ workspace_size = buffer_size * 6; } else if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) { reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */ @@ -1740,13 +1740,17 @@ mluOpStatus_t MLUOP_WIN_API mluOpAllocateC2C2D( DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 + DFT_TABLE_SIZE * 2; /* twiddles */ workspace_size = buffer_size * 2; - workspace_size += (fft_plan->is_input_contiguous) ? 0 : buffer_size; + workspace_size += (fft_plan->is_input_contiguous && + fft_plan->inembed[0] <= fft_plan->n[0] && + fft_plan->inembed[1] <= fft_plan->n[1]) + ? 0 + : buffer_size; workspace_size += (fft_plan->is_output_contiguous) ? 
0 : buffer_size; } fft_plan->workspace_size = workspace_size; - if (fft_plan->n[0] > fft_plan->inembed[0] || - fft_plan->n[1] > fft_plan->inembed[1]) { + if (fft_plan->n[0] != fft_plan->inembed[0] || + fft_plan->n[1] != fft_plan->inembed[1]) { fft_plan->workspace_size = workspace_size + buffer_size; // input_pad_addr } fft_plan->reservespace_size = reservespace_size; @@ -1783,7 +1787,7 @@ mluOpAllocateC2R1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */ + twiddles_size * 2 + DFT_TABLE_SIZE * 2; /* twiddles */ - if (fft_plan->n[0] > fft_plan->inembed[0]) { + if (fft_plan->n[0] != fft_plan->inembed[0]) { workspace_size += buffer_size; // input_pad_addr } fft_plan->workspace_size = workspace_size; @@ -1791,11 +1795,58 @@ mluOpAllocateC2R1D(mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, return MLUOP_STATUS_SUCCESS; } +mluOpStatus_t MLUOP_WIN_API mluOpAllocateIRFFT2D( + mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, + mluOpTensorDescriptor_t input_desc, mluOpTensorDescriptor_t output_desc, + const int n0_ori, const int n1_ori) { + const std::string make_plan_api = "[mluOpAllocateIRFFT2D]"; + size_t workspace_size = 0, reservespace_size = 0; + + mluOpDataType_t out_c_dtype = fft_plan->output_dtype; + mluOpDataType_t in_c_dtype = fft_plan->input_dtype; + size_t complex_dtype_size = + (mluOpDataTypeBytes(out_c_dtype) > mluOpDataTypeBytes(in_c_dtype)) + ? mluOpDataTypeBytes(out_c_dtype) + : mluOpDataTypeBytes(in_c_dtype); + + int batch = fft_plan->batch; + size_t buffer_size = batch * complex_dtype_size * n0_ori * n1_ori; + + size_t twiddles_size = complex_dtype_size * n0_ori; + size_t twiddles_size_2d = complex_dtype_size * n1_ori; + + if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { + reservespace_size = + complex_dtype_size * n0_ori * n0_ori * 2 + + complex_dtype_size * n1_ori * n1_ori * 2; /* DFT matrix */ + workspace_size = complex_dtype_size * n1_ori * n0_ori * batch * 6; + } else if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) { + reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */ + + sizeof(int) * (FFT_MAXFACTORS) + twiddles_size * 2 + + DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 + + DFT_TABLE_SIZE * 2; /* twiddles */ + workspace_size = buffer_size * 2; + workspace_size += (fft_plan->is_input_contiguous && + fft_plan->inembed[0] <= fft_plan->n[0] && + fft_plan->inembed[1] <= fft_plan->n[1] / 2 + 1) + ? 0 + : buffer_size; + workspace_size += (fft_plan->is_output_contiguous) ? 
0 : buffer_size; + } + + if (fft_plan->n[0] != fft_plan->inembed[0] || + fft_plan->n[1] != fft_plan->inembed[1]) { + workspace_size += buffer_size; + } + fft_plan->workspace_size = workspace_size; + fft_plan->reservespace_size = reservespace_size; + return MLUOP_STATUS_SUCCESS; +} mluOpStatus_t MLUOP_WIN_API mluOpAllocateRFFT2D( mluOpHandle_t handle, mluOpFFTPlan_t fft_plan, mluOpTensorDescriptor_t input_desc, mluOpTensorDescriptor_t output_desc, - const int _n0, const int _n1) { + const int n0_ori, const int n1_ori) { const std::string make_plan_api = "[mluOpAllocateRFFT2D]"; size_t workspace_size = 0, reservespace_size = 0; @@ -1807,27 +1858,32 @@ mluOpStatus_t MLUOP_WIN_API mluOpAllocateRFFT2D( : mluOpDataTypeBytes(in_c_dtype); int batch = fft_plan->batch; - size_t buffer_size = batch * complex_dtype_size * _n0 * _n1; + size_t buffer_size = batch * complex_dtype_size * n0_ori * n1_ori; - size_t twiddles_size = complex_dtype_size * _n0; - size_t twiddles_size_2d = complex_dtype_size * _n1; + size_t twiddles_size = complex_dtype_size * n0_ori; + size_t twiddles_size_2d = complex_dtype_size * n1_ori; if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { - reservespace_size = complex_dtype_size * _n0 * _n0 * 2 + - complex_dtype_size * _n1 * _n1 * 2; /* DFT matrix */ - workspace_size = complex_dtype_size * _n1 * _n0 * batch * 6; + reservespace_size = + complex_dtype_size * n0_ori * n0_ori * 2 + + complex_dtype_size * n1_ori * n1_ori * 2; /* DFT matrix */ + workspace_size = complex_dtype_size * n1_ori * n0_ori * batch * 6; } else if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) { reservespace_size = sizeof(int) * (FFT_MAXFACTORS) /* factors */ + sizeof(int) * (FFT_MAXFACTORS) + twiddles_size * 2 + DFT_TABLE_SIZE * 2 + twiddles_size_2d * 2 + DFT_TABLE_SIZE * 2; /* twiddles */ workspace_size = buffer_size * 2; - workspace_size += (fft_plan->is_input_contiguous) ? 0 : buffer_size; + workspace_size += (fft_plan->is_input_contiguous && + fft_plan->inembed[0] <= fft_plan->n[0] && + fft_plan->inembed[1] <= fft_plan->n[1]) + ? 0 + : buffer_size; workspace_size += (fft_plan->is_output_contiguous) ? 
0 : buffer_size; } - if (fft_plan->n[0] > fft_plan->inembed[0] || - fft_plan->n[1] > fft_plan->inembed[1]) { + if (fft_plan->n[0] != fft_plan->inembed[0] || + fft_plan->n[1] != fft_plan->inembed[1]) { workspace_size += buffer_size; } fft_plan->workspace_size = workspace_size; @@ -1846,6 +1902,8 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanC2C1D( const int rank, const int *n) { fft_plan->is_batch_contiguous = (fft_plan->idist == 1 && fft_plan->odist == 1 && + fft_plan->inembed[0] == fft_plan->n[0] && + fft_plan->onembed[0] == fft_plan->n[0] && fft_plan->istride == fft_plan->batch && fft_plan->ostride == fft_plan->batch) && (fft_plan->n[0] == fft_plan->inembed[0]); @@ -2221,7 +2279,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanC2R2D( fft_plan->fft_strategy = CNFFT_FUNC_TWO_LEVEL_STOCKHAM; } - mluOpAllocateRFFT2D(handle, fft_plan, input_desc, output_desc, n[0], n[1]); + mluOpAllocateIRFFT2D(handle, fft_plan, input_desc, output_desc, n[0], n[1]); if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { switch (fft_plan->fft_type) { @@ -2394,6 +2452,12 @@ mluOpStatus_t MLUOP_WIN_API mluOpMakeFFTPlanMany( fft_plan->inembed[i] = input_desc->dims[fft_plan->idim - rank + i]; fft_plan->onembed[i] = output_desc->dims[fft_plan->odim - rank + i]; } + for (auto i = 0; i < fft_plan->idim; i++) { + fft_plan->in_stride[i] = input_desc->strides[i]; + } + for (auto i = 0; i < fft_plan->odim; i++) { + fft_plan->out_stride[i] = output_desc->strides[i]; + } if (fft_plan->idim == rank + 1) { fft_plan->idist = input_desc->strides[0]; fft_plan->odist = output_desc->strides[0]; diff --git a/kernels/fft/fft.h b/kernels/fft/fft.h index aa7ac0ba6..6f31a7751 100644 --- a/kernels/fft/fft.h +++ b/kernels/fft/fft.h @@ -180,6 +180,8 @@ struct cnfftButterflyAddrs { int *factors; int *factors_2d; void *input_pad_addr; + void *input_copy_workspace_addr; + void *output_copy_workspace_addr; }; struct mluOpFFTStruct { int rank; // rank of FFT @@ -193,24 +195,26 @@ struct mluOpFFTStruct { int inum; // element num of input tensor int istride; // distance between two successive input elements in the // innermost dimension - int idist; // distance between the first element of two consecutive signals - // in a batch of the input data - int odim; // the dimension size of output tensor + int in_stride[FFT_DIM_MAX + 1]; + int idist; // distance between the first element of two consecutive signals + // in a batch of the input data + int odim; // the dimension size of output tensor int onembed[FFT_DIM_MAX]; // Pointer of size rank that indicates the storage // dimensions of the output data in memory int onum; // element num of output tensor int ostride; // distance between two successive output elements in the // innermost dimension - int odist; // distance between the first element of two consecutive signals - // in a batch of the output data - int batch; // batch size for this transform - int L; // n = L * 2^m, L size for this transform - int m; // n = L * 2^m, m size for this transform - int s; // The size that can be put down on NRAM: L * 2^s, only used by - // Cooley-Tukey algorithm - int L_sub; // The size that can be put down on NRAM: L_sub * 2^m, only used - // by Stockham algorithm - int prime; // wether fft1d'size contains a prime number > 64 + int out_stride[FFT_DIM_MAX + 1]; + int odist; // distance between the first element of two consecutive signals + // in a batch of the output data + int batch; // batch size for this transform + int L; // n = L * 2^m, L size for this transform + int m; // n = L * 2^m, m size for this 
transform
+  int s;       // The size that can be put down on NRAM: L * 2^s, only used by
+               // Cooley-Tukey algorithm
+  int L_sub;   // The size that can be put down on NRAM: L_sub * 2^m, only used
+               // by Stockham algorithm
+  int prime;   // whether fft1d's size contains a prime number > 64
   bool is_input_contiguous;
   bool is_output_contiguous;
   bool is_batch_contiguous;
diff --git a/kernels/fft/fft_optm_device/fft_c2c_stockham_nram.h b/kernels/fft/fft_optm_device/fft_c2c_stockham_nram.h
index 547174631..07d31dea1 100644
--- a/kernels/fft/fft_optm_device/fft_c2c_stockham_nram.h
+++ b/kernels/fft/fft_optm_device/fft_c2c_stockham_nram.h
@@ -305,379 +305,6 @@ __mlu_func__ void computeLargeButterflyFirststageBatchPingpong(
   }
 }
 
-// Compute the large butterfly for the subsequent stages of the FFT
-template <typename DT>
-__mlu_func__ void computeLargeButterflyOtherstages(
-    DT *output, DT *input, const int large_radix, const DT *cur_large_twiddles,
-    const DT *_twiddles, const DT *dft_matrix, const int large_section_num,
-    const int large_butterfly_num, const int large_in_stride, void *nram_buf,
-    const int *small_factors, const int nfft, const int dir,
-    const int last_stage) {
-  const dft_table_entry *dft_table = (const dft_table_entry *)dft_matrix;
-  const int K_num = 64 / sizeof(DT);
-  int align_K = 0;
-  int radix, small_in_stride, small_stage_count, _small_stage_count;
-  int small_section_num, small_butterfly_num, value_mul;
-
-  const int large_out_stride = large_butterfly_num;
-  int tw_offset;
-
-  _small_stage_count = small_factors[0];
-  tw_offset = small_factors[1];
-
-  const DT *small_twiddles = _twiddles + tw_offset * 2;
-
-  const int max_para_ldst_num = (4096 + large_radix - 1) / large_radix;
-
-  int nram_buf_offset = 0;
-  DT *nram_in_r = (DT *)nram_buf + nram_buf_offset;
-  nram_buf_offset += large_radix * max_para_ldst_num;
-
-  DT *nram_in_i = (DT *)nram_buf + nram_buf_offset;
-  nram_buf_offset += large_radix * max_para_ldst_num;
-
-  DT *nram_out_r = (DT *)nram_buf + nram_buf_offset;
-  nram_buf_offset += large_radix * max_para_ldst_num;
-
-  DT *nram_out_i = (DT *)nram_buf + nram_buf_offset;
-  nram_buf_offset += large_radix * max_para_ldst_num;
-
-  FFT_CPX_T<DT>
nram_para_load_in_ping = { - (DT *)nram_buf + nram_buf_offset, - (DT *)nram_buf + nram_buf_offset + large_radix * max_para_ldst_num}; - nram_buf_offset += large_radix * max_para_ldst_num * 2; - - FFT_CPX_T
<DT>
nram_para_load_in_pong = { - (DT *)nram_buf + nram_buf_offset, - (DT *)nram_buf + nram_buf_offset + large_radix * max_para_ldst_num}; - nram_buf_offset += large_radix * max_para_ldst_num * 2; - - FFT_CPX_T
<DT>
nram_para_load_tw_ping = { - (DT *)nram_buf + nram_buf_offset, - (DT *)nram_buf + nram_buf_offset + large_radix * max_para_ldst_num}; - nram_buf_offset += large_radix * max_para_ldst_num * 2; - - FFT_CPX_T
<DT>
nram_para_load_tw_pong = { - (DT *)nram_buf + nram_buf_offset, - (DT *)nram_buf + nram_buf_offset + large_radix * max_para_ldst_num}; - nram_buf_offset += large_radix * max_para_ldst_num * 2; - - FFT_CPX_T
<DT>
nram_para_store_ping = { - (DT *)nram_buf + nram_buf_offset, - (DT *)nram_buf + nram_buf_offset + large_radix * max_para_ldst_num}; - nram_buf_offset += large_radix * max_para_ldst_num * 2; - - FFT_CPX_T
<DT>
nram_para_store_pong = { - (DT *)nram_buf + nram_buf_offset, - (DT *)nram_buf + nram_buf_offset + large_radix * max_para_ldst_num}; - nram_buf_offset += large_radix * max_para_ldst_num * 2; - - FFT_CPX_T
<DT>
nram_transpose_temp; - nram_transpose_temp = { - (DT *)nram_in_r, - (DT *)nram_in_r + large_radix * ((int)last_stage) + - large_radix * (1 - (int)last_stage) * max_para_ldst_num}; - - DT *_nram_tw = (DT *)nram_buf + nram_buf_offset; - nram_buf_offset += large_radix * 2; - - int ld_dft_radix = -1; - const int max_radix = 64; - DT *nram_dftmtx = (DT *)nram_buf + nram_buf_offset; - nram_buf_offset += max_radix * max_radix * 2; - - DT *nram_scratch = (DT *)nram_buf + nram_buf_offset; - - DT *CPX_MUL_RR = nram_scratch; - DT *CPX_MUL_RI = &CPX_MUL_RR[large_radix * max_para_ldst_num]; - DT *CPX_MUL_IR = &CPX_MUL_RI[large_radix * max_para_ldst_num]; - DT *CPX_MUL_II = &CPX_MUL_IR[large_radix * max_para_ldst_num]; - - nram_buf_offset += large_radix * max_para_ldst_num * 4; - - int Fin_stride = 0, Fout_stride = 0; - int sec_count; - int repeat_num = - (large_butterfly_num + max_para_ldst_num - 1) / max_para_ldst_num; - for (sec_count = 0; sec_count < large_section_num; ++sec_count) { - for (int repeat_id = 0; repeat_id < repeat_num + 2; ++repeat_id) { - if (repeat_id < repeat_num) { - int i = max_para_ldst_num * repeat_id; - FFT_CPX_T
<DT>
nram_para_load_in = (repeat_id % 2 == 0) - ? nram_para_load_in_ping - : nram_para_load_in_pong; - - FFT_CPX_T
<DT>
nram_para_load_tw = (repeat_id % 2 == 0) - ? nram_para_load_tw_ping - : nram_para_load_tw_pong; - - int para_load_num = (max_para_ldst_num > (large_butterfly_num - i)) - ? (large_butterfly_num - i) - : max_para_ldst_num; - - __memcpy_async(nram_para_load_in.r, input + Fin_stride + i, - sizeof(DT) * para_load_num, GDRAM2NRAM, - sizeof(DT) * para_load_num, large_in_stride * sizeof(DT), - large_radix - 1); - __memcpy_async(nram_para_load_in.i, input + nfft + Fin_stride + i, - sizeof(DT) * para_load_num, GDRAM2NRAM, - sizeof(DT) * para_load_num, large_in_stride * sizeof(DT), - large_radix - 1); - __memcpy_async(nram_para_load_tw.r, cur_large_twiddles + i, - sizeof(DT) * para_load_num, SRAM2NRAM, - sizeof(DT) * para_load_num, - large_out_stride * sizeof(DT), large_radix - 2); - __memcpy_async( - nram_para_load_tw.i, - cur_large_twiddles + large_butterfly_num * (large_radix - 1) + i, - sizeof(DT) * para_load_num, SRAM2NRAM, sizeof(DT) * para_load_num, - large_out_stride * sizeof(DT), large_radix - 2); - } - - if (repeat_id >= 2) { - int i = max_para_ldst_num * (repeat_id - 2); - - int para_store_num = (max_para_ldst_num > (large_butterfly_num - i)) - ? (large_butterfly_num - i) - : max_para_ldst_num; - - FFT_CPX_T
<DT>
nram_para_store = - (repeat_id % 2 == 0) ? nram_para_store_ping : nram_para_store_pong; - - if (last_stage) { - __memcpy_async(output + (Fout_stride + i) * 2, nram_para_store.r, - sizeof(DT) * 2 * para_store_num, NRAM2GDRAM, - large_out_stride * 2 * sizeof(DT), - sizeof(DT) * 2 * para_store_num, large_radix - 1); - } else { - __memcpy_async(output + Fout_stride + i, nram_para_store.r, - para_store_num * sizeof(DT), NRAM2GDRAM, - large_out_stride * sizeof(DT), - sizeof(DT) * para_store_num, large_radix - 1); - __memcpy_async(output + Fout_stride + i + nfft, nram_para_store.i, - para_store_num * sizeof(DT), NRAM2GDRAM, - large_out_stride * sizeof(DT), - sizeof(DT) * para_store_num, large_radix - 1); - } - } - - if (repeat_id >= 1 && repeat_id < repeat_num + 1) { - int i = max_para_ldst_num * (repeat_id - 1); - - FFT_CPX_T
<DT>
nram_para_load_in = (repeat_id % 2 != 0) - ? nram_para_load_in_ping - : nram_para_load_in_pong; - - FFT_CPX_T
<DT>
nram_para_load_tw = (repeat_id % 2 != 0) - ? nram_para_load_tw_ping - : nram_para_load_tw_pong; - - FFT_CPX_T
<DT>
nram_para_store = - (repeat_id % 2 != 0) ? nram_para_store_ping : nram_para_store_pong; - - int para_ldst_num = (max_para_ldst_num > (large_butterfly_num - i)) - ? (large_butterfly_num - i) - : max_para_ldst_num; - - __bang_mul(CPX_MUL_RR, nram_para_load_in.r + para_ldst_num, - nram_para_load_tw.r, para_ldst_num * (large_radix - 1)); - __bang_mul(CPX_MUL_II, nram_para_load_in.i + para_ldst_num, - nram_para_load_tw.i, para_ldst_num * (large_radix - 1)); - __bang_mul(CPX_MUL_RI, nram_para_load_in.r + para_ldst_num, - nram_para_load_tw.i, para_ldst_num * (large_radix - 1)); - __bang_mul(CPX_MUL_IR, nram_para_load_in.i + para_ldst_num, - nram_para_load_tw.r, para_ldst_num * (large_radix - 1)); - - __bang_sub(nram_para_load_in.r + para_ldst_num, CPX_MUL_RR, CPX_MUL_II, - para_ldst_num * (large_radix - 1)); - __bang_add(nram_para_load_in.i + para_ldst_num, CPX_MUL_RI, CPX_MUL_IR, - para_ldst_num * (large_radix - 1)); - - { - radix = small_factors[4]; - small_section_num = small_factors[5]; - small_in_stride = small_factors[7]; - small_stage_count = _small_stage_count; - - if (ld_dft_radix != radix) { - ld_dft_radix = radix; - for (int entry = 0;; entry++) { - if (dft_table[entry].radix == ld_dft_radix) { - align_K = K_num * ((radix + K_num - 1) / K_num); - __memcpy_async( - nram_dftmtx, &dft_matrix[dft_table[entry].offset * 2], - sizeof(DT) * 2 * ld_dft_radix * align_K, SRAM2NRAM); - __sync_move(); - break; - } - - if (dft_table[entry].radix == -1) { - break; - } - } - } - - computeGenericButterflyFirststageMat( - nram_out_r, nram_out_i, nram_para_load_in.r, nram_para_load_in.i, - nram_scratch, nram_dftmtx, small_section_num * para_ldst_num, - small_section_num * para_ldst_num, 1, dir, radix); - - small_stage_count--; - if (small_stage_count == 0) { - if (last_stage) { - __memcpy_async(nram_transpose_temp.r, nram_out_r, - sizeof(DT) * large_radix, NRAM2NRAM, - sizeof(DT) * large_radix * 2, - sizeof(DT) * large_radix, para_ldst_num - 1); - - __memcpy_async(nram_transpose_temp.i, nram_out_i, - sizeof(DT) * large_radix, NRAM2NRAM, - sizeof(DT) * large_radix * 2, - sizeof(DT) * large_radix, para_ldst_num - 1); - __sync_move(); - - __bang_transpose(nram_para_store.r, nram_transpose_temp.r, - para_ldst_num * 2, large_radix); - } else { - __bang_transpose(nram_para_store.r, nram_out_r, para_ldst_num, - large_radix); - __bang_transpose(nram_para_store.i, nram_out_i, para_ldst_num, - large_radix); - } - - } else { - FFT_SWAP_PTR(nram_out_r, nram_in_r); - FFT_SWAP_PTR(nram_out_i, nram_in_i); - TRANSPOSE_XYZ2YXZ_PAIR(nram_out_r, nram_out_i, nram_in_r, nram_in_i, - small_section_num, para_ldst_num, radix, DT) - DT *nram_tw = _nram_tw; - value_mul = 8; - - for (; small_stage_count > 1; small_stage_count--) { - FFT_SWAP_PTR(nram_out_r, nram_in_r); - FFT_SWAP_PTR(nram_out_i, nram_in_i); - - radix = small_factors[value_mul++]; - small_section_num = small_factors[value_mul++]; - small_butterfly_num = small_factors[value_mul++]; - small_in_stride = small_factors[value_mul++]; - - if (ld_dft_radix != radix) { - ld_dft_radix = radix; - for (int entry = 0;; entry++) { - if (dft_table[entry].radix == ld_dft_radix) { - align_K = K_num * ((radix + K_num - 1) / K_num); - __memcpy_async( - nram_dftmtx, &dft_matrix[dft_table[entry].offset * 2], - sizeof(DT) * 2 * ld_dft_radix * align_K, SRAM2NRAM); - __sync_move(); - break; - } - - if (dft_table[entry].radix == -1) { - break; - } - } - } - - if (sec_count == 0 && repeat_id == 1) { - __memcpy(nram_tw, small_twiddles, - small_butterfly_num * (radix - 1) * sizeof(DT) * 2, - 
SRAM2NRAM); - small_twiddles += small_butterfly_num * (radix - 1) * 2; - } - - computeGenericButterflyOtherstagesMat( - nram_out_r, nram_out_i, nram_in_r, nram_in_i, nram_scratch, - nram_dftmtx, nram_tw, small_section_num, small_butterfly_num, - para_ldst_num, small_in_stride, dir, radix); - - nram_tw += small_butterfly_num * (radix - 1) * 2; - } - - { - FFT_SWAP_PTR(nram_out_r, nram_in_r); - FFT_SWAP_PTR(nram_out_i, nram_in_i); - - radix = small_factors[value_mul++]; - small_section_num = small_factors[value_mul++]; - small_butterfly_num = small_factors[value_mul++]; - small_in_stride = small_factors[value_mul]; - - if (sec_count == 0 && repeat_id == 1) { - __memcpy_async( - nram_tw, small_twiddles, - small_butterfly_num * (radix - 1) * sizeof(DT) * 2, - SRAM2NRAM); - __sync_move(); - } - - if (ld_dft_radix != radix) { - ld_dft_radix = radix; - for (int entry = 0;; entry++) { - if (dft_table[entry].radix == ld_dft_radix) { - align_K = K_num * ((radix + K_num - 1) / K_num); - __memcpy_async( - nram_dftmtx, &dft_matrix[dft_table[entry].offset * 2], - sizeof(DT) * 2 * ld_dft_radix * align_K, SRAM2NRAM); - __sync_move(); - break; - } - - if (dft_table[entry].radix == -1) { - break; - } - } - } - computeGenericButterflyLaststageMat( - nram_out_r, nram_out_i, nram_in_r, nram_in_i, nram_scratch, - nram_dftmtx, nram_tw, small_section_num, small_butterfly_num, - para_ldst_num, small_in_stride, dir, radix); - - if (last_stage) { - __memcpy_async(nram_transpose_temp.r, nram_out_r, - sizeof(DT) * large_radix, NRAM2NRAM, - sizeof(DT) * large_radix * 2, - sizeof(DT) * large_radix, para_ldst_num - 1); - - __memcpy_async(nram_transpose_temp.i, nram_out_i, - sizeof(DT) * large_radix, NRAM2NRAM, - sizeof(DT) * large_radix * 2, - sizeof(DT) * large_radix, para_ldst_num - 1); - __sync_move(); - - __bang_transpose(nram_para_store.r, nram_transpose_temp.r, - para_ldst_num * 2, large_radix); - } else { - __bang_transpose(nram_para_store.r, nram_out_r, para_ldst_num, - large_radix); - __bang_transpose(nram_para_store.i, nram_out_i, para_ldst_num, - large_radix); - } - } - } - } - } - - __sync(); - } - Fin_stride += large_butterfly_num; - Fout_stride += large_radix * large_butterfly_num; - } -} - -template -__mlu_func__ void computeLargeButterflyLaststage( - DT *output, DT *input, const int large_radix, const DT *cur_large_twiddles, - const DT *_twiddles, const DT *dft_matrix, const int large_section_num, - const int large_butterfly_num, const int large_in_stride, void *nram_buf, - const int *small_factors, const int nfft, const int dir) { - computeLargeButterflyOtherstages( - output, input, large_radix, cur_large_twiddles, _twiddles, dft_matrix, - large_section_num, large_butterfly_num, large_in_stride, nram_buf, - small_factors, nfft, dir, 1); -} - // Compute the large butterfly for the last stage of the FFT template __mlu_func__ void computeLargeButterflyOtherstagesBatchPingpong( diff --git a/kernels/fft/fft_optm_device/fft_two-level_network_c2c_device.mlu b/kernels/fft/fft_optm_device/fft_two-level_network_c2c_device.mlu index 55b3a37b6..808c91df1 100644 --- a/kernels/fft/fft_optm_device/fft_two-level_network_c2c_device.mlu +++ b/kernels/fft/fft_optm_device/fft_two-level_network_c2c_device.mlu @@ -35,26 +35,10 @@ __mlu_global__ void MLUKernelFFT1dButterflyRow( void *input, void *output, int *factors, void *twiddles, void *twiddles_end, void *dft_matrix, void *buffer, const int batch, const int fft_flag, const int direction, const int dtype_size) { - switch (dtype_size) { - case (MLUOP_DTYPE_COMPLEX_FLOAT): - 
case (MLUOP_DTYPE_FLOAT): { - computeMutiStageOnchip((float *)input, (float *)output, factors, - (float *)twiddles, (float *)twiddles_end, - (float *)dft_matrix, (float *)buffer, batch, - fft_flag, direction); - }; break; - case (MLUOP_DTYPE_COMPLEX_HALF): - case (MLUOP_DTYPE_HALF): { - computeMutiStageOnchip((half *)input, (half *)output, factors, - (half *)twiddles, (half *)twiddles_end, - (half *)dft_matrix, (half *)buffer, batch, - fft_flag, direction); - }; break; - - default: { - MLULOG("mluOpFFT Not Implemented."); - } - } + computeMutiStageOnchip((float *)input, (float *)output, factors, + (float *)twiddles, (float *)twiddles_end, + (float *)dft_matrix, (float *)buffer, batch, + fft_flag, direction); } // Kernel function for 1D FFT butterfly operations on columns. @@ -62,26 +46,10 @@ __mlu_global__ void MLUKernelFFT1dButterflyColumn( void *input, void *output, int *factors, void *twiddles, void *twiddles_end, void *dft_matrix, void *buffer, const int batch, const int fft_flag, const int direction, const int dtype_size, const int nb) { - switch (dtype_size) { - case (MLUOP_DTYPE_COMPLEX_FLOAT): - case (MLUOP_DTYPE_FLOAT): { - computeMutiStageOnchipColumn( - (float *)input, (float *)output, factors, (float *)twiddles, - (float *)twiddles_end, (float *)dft_matrix, (float *)buffer, batch, - fft_flag, direction, nb); - }; break; - case (MLUOP_DTYPE_COMPLEX_HALF): - case (MLUOP_DTYPE_HALF): { - computeMutiStageOnchipColumn((half *)input, (half *)output, factors, - (half *)twiddles, (half *)twiddles_end, - (half *)dft_matrix, (half *)buffer, - batch, fft_flag, direction, nb); - }; break; - - default: { - MLULOG("mluOpFFT Not Implemented."); - } - } + computeMutiStageOnchipColumn((float *)input, (float *)output, factors, + (float *)twiddles, (float *)twiddles_end, + (float *)dft_matrix, (float *)buffer, + batch, fft_flag, direction, nb); } // Launches a kernel for 2D FFT butterfly operations on columns. 
diff --git a/kernels/fft/fft_optm_device/fft_two-level_network_c2r_device.mlu b/kernels/fft/fft_optm_device/fft_two-level_network_c2r_device.mlu index 31b3c3908..a76078f62 100644 --- a/kernels/fft/fft_optm_device/fft_two-level_network_c2r_device.mlu +++ b/kernels/fft/fft_optm_device/fft_two-level_network_c2r_device.mlu @@ -33,26 +33,10 @@ __mlu_global__ void MLUKernelFFT1dButterflyRowC2R( void *input, void *output, int *factors, void *twiddles, void *twiddles_end, void *dft_matrix, void *buffer, int batch, int fft_flag, int dtype_size) { - switch (dtype_size) { - case (MLUOP_DTYPE_COMPLEX_FLOAT): - case (MLUOP_DTYPE_FLOAT): { - computeMutiStageOnchipC2R((float *)input, (float *)output, factors, - (float *)twiddles, (float *)twiddles_end, - (float *)dft_matrix, (float *)buffer, - batch, fft_flag); - }; break; - case (MLUOP_DTYPE_COMPLEX_HALF): - case (MLUOP_DTYPE_HALF): { - computeMutiStageOnchipC2R((half *)input, (half *)output, factors, - (half *)twiddles, (half *)twiddles_end, - (half *)dft_matrix, (half *)buffer, batch, - fft_flag); - }; break; - - default: { - MLULOG("mluOpFFT Not Implemented."); - } - } + computeMutiStageOnchipC2R((float *)input, (float *)output, factors, + (float *)twiddles, (float *)twiddles_end, + (float *)dft_matrix, (float *)buffer, batch, + fft_flag); } mluOpStatus_t MLUOP_WIN_API kernelFFT1dButterflyRowC2R( diff --git a/kernels/fft/fft_optm_device/fft_two-level_network_r2c_device.mlu b/kernels/fft/fft_optm_device/fft_two-level_network_r2c_device.mlu index 5dd5f9e8d..3e36946b7 100644 --- a/kernels/fft/fft_optm_device/fft_two-level_network_r2c_device.mlu +++ b/kernels/fft/fft_optm_device/fft_two-level_network_r2c_device.mlu @@ -33,28 +33,10 @@ __mlu_global__ void MLUKernelFFT1dButterflyR2C( void *input, void *output, int *factors, void *twiddles, void *twiddles_end, void *dft_matrix, void *buffer, int batch, int fft_flag, int dtype_size) { - switch (dtype_size) { - case (MLUOP_DTYPE_COMPLEX_FLOAT): - case (MLUOP_DTYPE_FLOAT): { - MLULOG("MLUOP_DTYPE_COMPLEX_FLOAT: MLUOP_DTYPE_FLOAT\n"); - computeMutiStageR2COnchip((float *)input, (float *)output, factors, - (float *)twiddles, (float *)twiddles_end, - (float *)dft_matrix, (float *)buffer, - batch, fft_flag); - }; break; - case (MLUOP_DTYPE_COMPLEX_HALF): - case (MLUOP_DTYPE_HALF): { - MLULOG("MLUOP_DTYPE_COMPLEX_HALF: MLUOP_DTYPE_HALF\n"); - computeMutiStageR2COnchip((half *)input, (half *)output, factors, - (half *)twiddles, (half *)twiddles_end, - (half *)dft_matrix, (half *)buffer, batch, - fft_flag); - }; break; - - default: { - MLULOG("mluOpFFT Not Implemented."); - } - } + computeMutiStageR2COnchip((float *)input, (float *)output, factors, + (float *)twiddles, (float *)twiddles_end, + (float *)dft_matrix, (float *)buffer, batch, + fft_flag); } mluOpStatus_t MLUOP_WIN_API kernelFFT1dButterflyR2C(cnrtDim3_t k_dim, diff --git a/kernels/fft/irfft/irfft_host.cpp b/kernels/fft/irfft/irfft_host.cpp index b065028e1..5d2543ad7 100644 --- a/kernels/fft/irfft/irfft_host.cpp +++ b/kernels/fft/irfft/irfft_host.cpp @@ -795,14 +795,14 @@ static mluOpStatus_t makeIRFFT1dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 2; - int64_t dims[in_dim_num] = { + const int IN_DIM_NUM = 2; + int64_t dims[IN_DIM_NUM] = { fft_plan->batch, fft_plan->prime ? 
fft_plan->inembed[0] : std::min(fft_plan->inembed[0], FFT_HALF(fft_plan->n[0]))}; - int64_t strides[in_dim_num] = {fft_plan->idist, fft_plan->istride}; + int64_t strides[IN_DIM_NUM] = {fft_plan->idist, fft_plan->istride}; status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->input_dtype, in_dim_num, + fft_plan->input_dtype, IN_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); @@ -840,15 +840,15 @@ static mluOpStatus_t padIRFFT1dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&padded_input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 2; - int64_t dims[in_dim_num] = {batch, fft_plan->inembed[0] * COMPLEX}; + const int IN_DIM_NUM = 2; + int64_t dims[IN_DIM_NUM] = {batch, fft_plan->inembed[0] * COMPLEX}; status = mluOpSetTensorDescriptor_v2(input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, dims); + in_r_dtype, IN_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - int64_t padded_dims[in_dim_num] = {batch, FFT_HALF(n) * COMPLEX}; + int64_t padded_dims[IN_DIM_NUM] = {batch, FFT_HALF(n) * COMPLEX}; status = mluOpSetTensorDescriptor_v2(padded_input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, padded_dims); + in_r_dtype, IN_DIM_NUM, padded_dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); const int pad_dim_num = 4; @@ -908,17 +908,17 @@ static mluOpStatus_t padIRFFT2dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&padded_input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 3; - int64_t dims[in_dim_num] = { + const int IN_DIM_NUM = 3; + int64_t dims[IN_DIM_NUM] = { batch, std::min(fft_plan->inembed[0], n0), std::min(fft_plan->inembed[1], FFT_HALF(n1)) * COMPLEX}; status = mluOpSetTensorDescriptor_v2(input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, dims); + in_r_dtype, IN_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - int64_t padded_dims[in_dim_num] = {batch, n0, FFT_HALF(n1) * COMPLEX}; + int64_t padded_dims[IN_DIM_NUM] = {batch, n0, FFT_HALF(n1) * COMPLEX}; status = mluOpSetTensorDescriptor_v2(padded_input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, padded_dims); + in_r_dtype, IN_DIM_NUM, padded_dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); const int pad_dim_num = 6; @@ -1461,17 +1461,17 @@ static mluOpStatus_t makeIRFFT1dContiguousOutput(mluOpHandle_t handle, INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // set up tensor desc - const int out_dim_num = 2; - int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->prime + const int OUT_DIM_NUM = 2; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->prime ? 
fft_plan->onembed[0] : fft_plan->n[0]}; - int64_t strides[out_dim_num] = {fft_plan->odist, fft_plan->ostride}; + int64_t strides[OUT_DIM_NUM] = {fft_plan->odist, fft_plan->ostride}; status = mluOpSetTensorDescriptor_v2(copy_src_desc, MLUOP_LAYOUT_ARRAY, - out_r_dtype, out_dim_num, dims); + out_r_dtype, OUT_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorEx_v2(copy_dst_desc, MLUOP_LAYOUT_ARRAY, - out_r_dtype, out_dim_num, dims, strides); + out_r_dtype, OUT_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // copy @@ -1486,17 +1486,8 @@ static mluOpStatus_t makeIRFFT1dContiguousOutput(mluOpHandle_t handle, DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(copy_dst_desc, cnnl_copy_dst_desc); - size_t workspace_size = 0; - CALL_CNNL(cnnlGetCopyWorkspaceSize(cnnl_handle, cnnl_copy_src_desc, - cnnl_copy_dst_desc, &workspace_size)); - - void *workspace = nullptr; - if (workspace_size > 0) { - CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); - } CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_copy_src_desc, copy_src_addr, - cnnl_copy_dst_desc, output, workspace, - workspace_size)); + cnnl_copy_dst_desc, output, NULL, 0)); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_src_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_dst_desc); @@ -1567,12 +1558,16 @@ mluOpStatus_t execIRFFT1d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, const float beta = 0.0; mluOpTensorDescriptor_t c_desc = nullptr; status = mluOpCreateTensorDescriptor(&c_desc); - const int out_dim_num = 2; - int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0]}; + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + const int OUT_DIM_NUM = 2; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->n[0]}; status = mluOpSetTensorDescriptor_v2(c_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->output_dtype, 2, dims); + fft_plan->output_dtype, OUT_DIM_NUM, + dims); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorOnchipDataType( c_desc, fft_plan->execution_dtype); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); // convert to cnnl_handle @@ -1583,6 +1578,8 @@ mluOpStatus_t execIRFFT1d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, cnnl_output_desc, fft_plan->mlu_addrs.output, &beta, cnnl_output_desc, fft_plan->mlu_addrs.output)); + status = mluOpDestroyTensorDescriptor(c_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); DESTROY_CNNL_HANDLE(cnnl_handle); } @@ -1606,13 +1603,13 @@ static void configureIRFFT2dWorkspaceAddrs(mluOpHandle_t handle, size_t out_c_dtype_size = mluOpDataTypeBytes(out_c_dtype); int batch = fft_plan->batch; - int _n0 = fft_plan->n[0]; - int _n1 = fft_plan->n[1]; + int n0_ori = fft_plan->n[0]; + int n1_ori = fft_plan->n[1]; size_t offset = 0; if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { // rr ri ir ii - size_t buffer_size = batch * in_c_dtype_size * _n0 * _n1 * 2; + size_t buffer_size = batch * in_c_dtype_size * n0_ori * n1_ori * 2; offset = 0; fft_plan->mlu_addrs.input = input; fft_plan->mlu_addrs.output = output; @@ -1625,25 +1622,28 @@ static void configureIRFFT2dWorkspaceAddrs(mluOpHandle_t handle, if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) { offset = 0; fft_plan->mlu_addrs.buffer_buf = (uint8_t *)workspace + offset; - offset += batch * in_c_dtype_size * _n0 * _n1 * 2; + offset += batch * in_c_dtype_size * n0_ori * n1_ori * 2; - if 
(fft_plan->is_input_contiguous) { + if (fft_plan->is_input_contiguous && + fft_plan->inembed[0] <= fft_plan->n[0] && + fft_plan->inembed[1] <= fft_plan->n[1] / 2 + 1 || + fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { fft_plan->mlu_addrs.input = input; } else { fft_plan->mlu_addrs.input = (uint8_t *)workspace + offset; - offset += batch * in_c_dtype_size * _n0 * _n1; + offset += batch * in_c_dtype_size * n0_ori * n1_ori; } if (fft_plan->is_output_contiguous) { fft_plan->mlu_addrs.output = output; } else { fft_plan->mlu_addrs.output = (uint8_t *)workspace + offset; - offset += batch * in_c_dtype_size * _n0 * _n1; + offset += batch * in_c_dtype_size * n0_ori * n1_ori; } } if (fft_plan->n[0] > fft_plan->inembed[0] || - fft_plan->n[1] > fft_plan->inembed[1]) { + fft_plan->n[1] / 2 + 1 > fft_plan->inembed[1]) { fft_plan->mlu_addrs.input_pad_addr = (uint8_t *)workspace + offset; } } @@ -1828,11 +1828,11 @@ mluOpStatus_t computeFFT2dMatMulRowC2R(mluOpHandle_t handle, int requested_algo_count = 1, return_algo_count = 0; float *workspace; size_t workspace_size; - cnnlGetBatchMatMulAlgoHeuristic( + cnnlGetBatchMatMulExAlgoHeuristic( cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL, requested_algo_count, &heuristic_result, &return_algo_count); - cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size); + cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size); if (workspace_size > 0) { CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); @@ -1840,10 +1840,10 @@ mluOpStatus_t computeFFT2dMatMulRowC2R(mluOpHandle_t handle, CNRT_CHECK(cnrtMalloc((void **)&workspace, m * n * sizeof(float))); } - CALL_CNNL(cnnlBatchMatMulBCast_v2(cnnl_handle, bmm_bcast_desc, algo, &alpha, - cnnl_a_desc, dft_matrix_addr, cnnl_b_desc, - in_addr, &beta, cnnl_c_desc, out_addr, - (void *)workspace, workspace_size)); + CALL_CNNL(cnnlBatchMatMulEx(cnnl_handle, bmm_bcast_desc, algo, &alpha, + cnnl_a_desc, dft_matrix_addr, cnnl_b_desc, + in_addr, &beta, cnnl_c_desc, out_addr, + (void *)workspace, workspace_size)); // destroy cnnl descriptor DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc); @@ -1866,23 +1866,23 @@ static mluOpStatus_t makeIRFFT2dContiguousInput(mluOpHandle_t handle, auto status = MLUOP_STATUS_SUCCESS; if ((!fft_plan->is_input_contiguous || (fft_plan->inembed[0] > fft_plan->n[0] || - fft_plan->inembed[1] > fft_plan->n[1] / 2 + 1) && - !fft_plan->prime) && + fft_plan->inembed[1] > fft_plan->n[1] / 2 + 1)) && fft_plan->fft_strategy != CNFFT_FUNC_MANY_DIST1_2D) { VLOG(5) << "launch mluOpContiguous for irfft2d input"; mluOpTensorDescriptor_t input_desc; status = mluOpCreateTensorDescriptor(&input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 3; - int64_t dims[in_dim_num] = { + const int IN_DIM_NUM = 3; + int64_t dims[IN_DIM_NUM] = { fft_plan->batch, std::min(fft_plan->inembed[0], fft_plan->n[0]), std::min(FFT_HALF(fft_plan->n[1]), fft_plan->inembed[1])}; - int64_t strides[in_dim_num] = {fft_plan->idist, - (fft_plan->istride * fft_plan->inembed[1]), - fft_plan->istride}; + int64_t strides[IN_DIM_NUM]; // IN_DIM_NUM + for (int i = 0; i < IN_DIM_NUM; i++) { + strides[i] = fft_plan->in_stride[i]; + } status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->input_dtype, in_dim_num, + fft_plan->input_dtype, IN_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); @@ -1913,18 +1913,19 @@ static mluOpStatus_t 
makeIRFFT2dContiguousOutput(mluOpHandle_t handle, INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // set up tensor desc - const int out_dim_num = 3; - int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0], + const int OUT_DIM_NUM = 3; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->n[0], fft_plan->n[1]}; - int64_t strides[out_dim_num] = {fft_plan->odist, - fft_plan->ostride * fft_plan->onembed[1], - fft_plan->ostride}; + int64_t strides[OUT_DIM_NUM]; // OUT_DIM_NUM + for (int i = 0; i < OUT_DIM_NUM; i++) { + strides[i] = fft_plan->out_stride[i]; + } status = mluOpSetTensorDescriptor_v2(copy_src_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims); + out_c_dtype, OUT_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorEx_v2(copy_dst_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims, strides); + out_c_dtype, OUT_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // void *copy_src_addr = fft_plan->matmul_addrs.output_contiguous_addr; @@ -1937,17 +1938,8 @@ static mluOpStatus_t makeIRFFT2dContiguousOutput(mluOpHandle_t handle, DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(copy_dst_desc, cnnl_copy_dst_desc); - size_t workspace_size = 0; - CALL_CNNL(cnnlGetCopyWorkspaceSize(cnnl_handle, cnnl_copy_src_desc, - cnnl_copy_dst_desc, &workspace_size)); - - void *workspace = nullptr; - if (workspace_size > 0) { - CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); - } CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_copy_src_desc, copy_src_addr, - cnnl_copy_dst_desc, output, workspace, - workspace_size)); + cnnl_copy_dst_desc, output, NULL, 0)); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_src_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_dst_desc); @@ -1987,63 +1979,92 @@ mluOpStatus_t execIRFFT2d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, fft_plan->mlu_addrs.input = fft_plan->mlu_addrs.input_pad_addr; } - for (int batch_id = 0; batch_id < fft_plan->batch; batch_id++) { - status = kernelIRFFT2dButterflyColumn(k_dim, k_type, handle->queue, - fft_plan, FFT_IFFT); + if (fft_plan->n[0] == 1 && fft_plan->n[1] == 1) { + mluOpTensorDescriptor_t input_desc; + status = mluOpCreateTensorDescriptor(&input_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + const int IN_DIM_NUM = 2; + int64_t dims[IN_DIM_NUM] = { + fft_plan->batch * fft_plan->n[0] * fft_plan->n[1], 1}; + int64_t strides[IN_DIM_NUM] = {2, 1}; + status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_FLOAT, IN_DIM_NUM, + dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - status = kernelIRFFT2dButterflyRow(k_dim, k_type, handle->queue, fft_plan, - FFT_IFFT); + + status = mluOpContiguous(handle, input_desc, fft_plan->mlu_addrs.input, + fft_plan->mlu_addrs.output); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + status = mluOpDestroyTensorDescriptor(input_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + } else { + for (int batch_id = 0; batch_id < fft_plan->batch; batch_id++) { + status = kernelIRFFT2dButterflyColumn(k_dim, k_type, handle->queue, + fft_plan, FFT_IFFT); + + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + status = kernelIRFFT2dButterflyRow(k_dim, k_type, handle->queue, + fft_plan, FFT_IFFT); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + + fft_plan->mlu_addrs.input = + (void *)((uint64_t)(fft_plan->mlu_addrs.input) + idist); + fft_plan->mlu_addrs.output = + (void *)((uint64_t)(fft_plan->mlu_addrs.output) + odist); + } 
fft_plan->mlu_addrs.input = - (void *)((uint64_t)(fft_plan->mlu_addrs.input) + idist); + (void *)((uint64_t)(fft_plan->mlu_addrs.input) - + fft_plan->batch * idist); fft_plan->mlu_addrs.output = - (void *)((uint64_t)(fft_plan->mlu_addrs.output) + odist); + (void *)((uint64_t)(fft_plan->mlu_addrs.output) - + fft_plan->batch * odist); } - fft_plan->mlu_addrs.input = (void *)((uint64_t)(fft_plan->mlu_addrs.input) - - fft_plan->batch * idist); - fft_plan->mlu_addrs.output = - (void *)((uint64_t)(fft_plan->mlu_addrs.output) - - fft_plan->batch * odist); + } else if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { + status = computeFFT2dMatMulColumnC2R(handle, fft_plan, scale_factor); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - if (scale_factor != 1.0) { - const float alpha[2] = {scale_factor, 0.0}; - const float beta[2] = {0.0, 0.0}; - mluOpTensorDescriptor_t c_desc = nullptr; - status = mluOpCreateTensorDescriptor(&c_desc); - const int out_dim_num = 3; - int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0], - fft_plan->n[1]}; - status = mluOpSetTensorDescriptor_v2(c_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->output_dtype, 3, dims); - status = mluOpSetTensorDescriptorOnchipDataType( - c_desc, fft_plan->execution_dtype); + status = computeFFT2dMatMulRowC2R(handle, fft_plan, scale_factor); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + } - DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, - cnnl_handle); // convert to cnnl_handle + if (scale_factor != 1.0) { + const float alpha[2] = {scale_factor, 0.0}; + const float beta[2] = {0.0, 0.0}; + mluOpTensorDescriptor_t c_desc = nullptr; + status = mluOpCreateTensorDescriptor(&c_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + const int OUT_DIM_NUM = 3; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->n[0], + fft_plan->n[1]}; + status = mluOpSetTensorDescriptor_v2( + c_desc, MLUOP_LAYOUT_ARRAY, fft_plan->output_dtype, OUT_DIM_NUM, dims); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + status = mluOpSetTensorDescriptorOnchipDataType(c_desc, + fft_plan->execution_dtype); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(c_desc, cnnl_output_desc); + DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, + cnnl_handle); // convert to cnnl_handle - CALL_CNNL(cnnlTransform_v2(cnnl_handle, CNNL_POINTER_MODE_HOST, &alpha, - cnnl_output_desc, fft_plan->mlu_addrs.output, - &beta, cnnl_output_desc, - fft_plan->mlu_addrs.output)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); - DESTROY_CNNL_HANDLE(cnnl_handle); - } + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(c_desc, cnnl_output_desc); + + CALL_CNNL(cnnlTransform_v2(cnnl_handle, CNNL_POINTER_MODE_HOST, &alpha, + cnnl_output_desc, fft_plan->mlu_addrs.output, + &beta, cnnl_output_desc, + fft_plan->mlu_addrs.output)); + status = mluOpDestroyTensorDescriptor(c_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); + DESTROY_CNNL_HANDLE(cnnl_handle); + } + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) { status = makeIRFFT2dContiguousOutput(handle, fft_plan, output, fft_plan->mlu_addrs.output); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - - } else if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { - status = computeFFT2dMatMulColumnC2R(handle, fft_plan, scale_factor); - INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - - status = computeFFT2dMatMulRowC2R(handle, fft_plan, 
scale_factor); - INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); } return status; } diff --git a/kernels/fft/rfft/rfft_host.cpp b/kernels/fft/rfft/rfft_host.cpp index d0755e8be..9f9c37030 100644 --- a/kernels/fft/rfft/rfft_host.cpp +++ b/kernels/fft/rfft/rfft_host.cpp @@ -434,13 +434,13 @@ static void configureRFFT2dWorkspaceAddrs(mluOpHandle_t handle, size_t out_c_dtype_size = mluOpDataTypeBytes(out_c_dtype); int batch = fft_plan->batch; - int _n0 = fft_plan->n[0]; - int _n1 = fft_plan->n[1]; + int n0_ori = fft_plan->n[0]; + int n1_ori = fft_plan->n[1]; size_t offset = 0; if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { // rr ri ir ii - size_t buffer_size = batch * out_c_dtype_size * _n0 * _n1 * 2; + size_t buffer_size = batch * out_c_dtype_size * n0_ori * n1_ori * 2; fft_plan->mlu_addrs.input = input; fft_plan->mlu_addrs.output = output; fft_plan->mlu_addrs.buffer_in = (uint8_t *)workspace + offset; @@ -451,20 +451,22 @@ static void configureRFFT2dWorkspaceAddrs(mluOpHandle_t handle, if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) { fft_plan->mlu_addrs.buffer_buf = (uint8_t *)workspace + offset; - offset += batch * out_c_dtype_size * _n0 * _n1 * 2; + offset += batch * out_c_dtype_size * n0_ori * n1_ori * 2; - if (fft_plan->is_input_contiguous) { + if ((fft_plan->is_input_contiguous && + fft_plan->inembed[0] <= fft_plan->n[0] && + fft_plan->inembed[1] <= fft_plan->n[1])) { fft_plan->mlu_addrs.input = input; } else { fft_plan->mlu_addrs.input = (uint8_t *)workspace + offset; - offset += batch * out_c_dtype_size * _n0 * _n1; + offset += batch * out_c_dtype_size * n0_ori * n1_ori; } if (fft_plan->is_output_contiguous) { fft_plan->mlu_addrs.output = output; } else { fft_plan->mlu_addrs.output = (uint8_t *)workspace + offset; - offset += batch * out_c_dtype_size * _n0 * _n1; + offset += batch * out_c_dtype_size * n0_ori * n1_ori; } } @@ -707,12 +709,12 @@ static mluOpStatus_t makeRFFT1dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 2; + const int IN_DIM_NUM = 2; if (fft_plan->prime) { - int64_t dims[in_dim_num] = {fft_plan->batch, fft_plan->inembed[0]}; - int64_t strides[in_dim_num] = {fft_plan->idist, fft_plan->istride}; + int64_t dims[IN_DIM_NUM] = {fft_plan->batch, fft_plan->inembed[0]}; + int64_t strides[IN_DIM_NUM] = {fft_plan->idist, fft_plan->istride}; status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->input_dtype, in_dim_num, + fft_plan->input_dtype, IN_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); @@ -724,11 +726,11 @@ static mluOpStatus_t makeRFFT1dContiguousInput(mluOpHandle_t handle, INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); } else { - int64_t dims[in_dim_num] = { + int64_t dims[IN_DIM_NUM] = { fft_plan->batch, std::min(fft_plan->inembed[0], fft_plan->n[0])}; - int64_t strides[in_dim_num] = {fft_plan->idist, fft_plan->istride}; + int64_t strides[IN_DIM_NUM] = {fft_plan->idist, fft_plan->istride}; status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->input_dtype, in_dim_num, + fft_plan->input_dtype, IN_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); @@ -762,15 +764,15 @@ static mluOpStatus_t padRFFT1dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&padded_input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 2; - int64_t dims[in_dim_num] = 
{batch, fft_plan->inembed[0]}; + const int IN_DIM_NUM = 2; + int64_t dims[IN_DIM_NUM] = {batch, fft_plan->inembed[0]}; status = mluOpSetTensorDescriptor_v2(input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, dims); + in_r_dtype, IN_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - int64_t padded_dims[in_dim_num] = {batch, n}; + int64_t padded_dims[IN_DIM_NUM] = {batch, n}; status = mluOpSetTensorDescriptor_v2(padded_input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, padded_dims); + in_r_dtype, IN_DIM_NUM, padded_dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); const int pad_dim_num = 4; @@ -818,16 +820,16 @@ static mluOpStatus_t padRFFT2dContiguousInput(mluOpHandle_t handle, status = mluOpCreateTensorDescriptor(&padded_input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 3; - int64_t dims[in_dim_num] = {batch, std::min(n0, fft_plan->inembed[0]), + const int IN_DIM_NUM = 3; + int64_t dims[IN_DIM_NUM] = {batch, std::min(n0, fft_plan->inembed[0]), std::min(n1, fft_plan->inembed[1])}; status = mluOpSetTensorDescriptor_v2(input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, dims); + in_r_dtype, IN_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - int64_t padded_dims[in_dim_num] = {batch, n0, n1}; + int64_t padded_dims[IN_DIM_NUM] = {batch, n0, n1}; status = mluOpSetTensorDescriptor_v2(padded_input_desc, MLUOP_LAYOUT_ARRAY, - in_r_dtype, in_dim_num, padded_dims); + in_r_dtype, IN_DIM_NUM, padded_dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); const int pad_dim_num = 6; @@ -1083,17 +1085,17 @@ static mluOpStatus_t makeRFFT1dContiguousOutput(mluOpHandle_t handle, INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // set up tensor desc - const int out_dim_num = 2; - int64_t dims[out_dim_num] = { + const int OUT_DIM_NUM = 2; + int64_t dims[OUT_DIM_NUM] = { fft_plan->batch, (fft_plan->prime) ? 
fft_plan->onembed[0] : (fft_plan->n[0] / 2 + 1)}; - int64_t strides[out_dim_num] = {fft_plan->odist, fft_plan->ostride}; + int64_t strides[OUT_DIM_NUM] = {fft_plan->odist, fft_plan->ostride}; status = mluOpSetTensorDescriptor_v2(copy_src_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims); + out_c_dtype, OUT_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorEx_v2(copy_dst_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims, strides); + out_c_dtype, OUT_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // copy @@ -1109,17 +1111,8 @@ static mluOpStatus_t makeRFFT1dContiguousOutput(mluOpHandle_t handle, DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(copy_dst_desc, cnnl_copy_dst_desc); - size_t workspace_size = 0; - CALL_CNNL(cnnlGetCopyWorkspaceSize(cnnl_handle, cnnl_copy_src_desc, - cnnl_copy_dst_desc, &workspace_size)); - - void *workspace = nullptr; - if (workspace_size > 0) { - CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); - } CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_copy_src_desc, copy_src_addr, - cnnl_copy_dst_desc, output, workspace, - workspace_size)); + cnnl_copy_dst_desc, output, NULL, 0)); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_src_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_dst_desc); @@ -1138,26 +1131,26 @@ static mluOpStatus_t makeRFFT2dContiguousInput(mluOpHandle_t handle, auto status = MLUOP_STATUS_SUCCESS; if ((!fft_plan->is_input_contiguous || (fft_plan->inembed[0] > fft_plan->n[0] || - fft_plan->inembed[1] > fft_plan->n[1]) && - !fft_plan->prime) && + fft_plan->inembed[1] > fft_plan->n[1])) && fft_plan->fft_strategy != CNFFT_FUNC_MANY_DIST1_2D) { VLOG(5) << "launch mluOpContiguous for rfft2d input"; mluOpTensorDescriptor_t input_desc; status = mluOpCreateTensorDescriptor(&input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - const int in_dim_num = 3; - int64_t dims[in_dim_num] = { + const int IN_DIM_NUM = 3; + int64_t dims[IN_DIM_NUM] = { fft_plan->batch, fft_plan->n[0] > fft_plan->inembed[0] ? fft_plan->inembed[0] : fft_plan->n[0], fft_plan->n[1] > fft_plan->inembed[1] ? 
fft_plan->inembed[1] : fft_plan->n[1]}; - int64_t strides[in_dim_num] = {fft_plan->idist, - (fft_plan->istride * fft_plan->inembed[1]), - fft_plan->istride}; + int64_t strides[IN_DIM_NUM]; // IN_DIM_NUM + for (int i = 0; i < IN_DIM_NUM; i++) { + strides[i] = fft_plan->in_stride[i]; + } status = mluOpSetTensorDescriptorEx_v2(input_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->input_dtype, in_dim_num, + fft_plan->input_dtype, IN_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); @@ -1188,18 +1181,19 @@ static mluOpStatus_t makeRFFT2dContiguousOutput(mluOpHandle_t handle, INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); // set up tensor desc - const int out_dim_num = 3; - int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0], + const int OUT_DIM_NUM = 3; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->n[0], fft_plan->n[1] / 2 + 1}; - int64_t strides[out_dim_num] = {fft_plan->odist, - fft_plan->ostride * fft_plan->onembed[1], - fft_plan->ostride}; + int64_t strides[OUT_DIM_NUM]; // OUT_DIM_NUM + for (int i = 0; i < OUT_DIM_NUM; i++) { + strides[i] = fft_plan->out_stride[i]; + } status = mluOpSetTensorDescriptor_v2(copy_src_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims); + out_c_dtype, OUT_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorEx_v2(copy_dst_desc, MLUOP_LAYOUT_ARRAY, - out_c_dtype, out_dim_num, dims, strides); + out_c_dtype, OUT_DIM_NUM, dims, strides); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); void *copy_src_addr = fft_plan->mlu_addrs.output; @@ -1211,17 +1205,8 @@ static mluOpStatus_t makeRFFT2dContiguousOutput(mluOpHandle_t handle, DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(copy_dst_desc, cnnl_copy_dst_desc); - size_t workspace_size = 0; - CALL_CNNL(cnnlGetCopyWorkspaceSize(cnnl_handle, cnnl_copy_src_desc, - cnnl_copy_dst_desc, &workspace_size)); - - void *workspace = nullptr; - if (workspace_size > 0) { - CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); - } CALL_CNNL(cnnlCopy_v2(cnnl_handle, cnnl_copy_src_desc, copy_src_addr, - cnnl_copy_dst_desc, output, workspace, - workspace_size)); + cnnl_copy_dst_desc, output, NULL, 0)); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_src_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_copy_dst_desc); @@ -1287,12 +1272,16 @@ mluOpStatus_t execRFFT1d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, const float beta[2] = {0.0, 0.0}; mluOpTensorDescriptor_t c_desc = nullptr; status = mluOpCreateTensorDescriptor(&c_desc); - const int out_dim_num = 2; - int64_t dims[out_dim_num] = {fft_plan->batch, (fft_plan->n[0] / 2 + 1)}; + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + const int OUT_DIM_NUM = 2; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, (fft_plan->n[0] / 2 + 1)}; status = mluOpSetTensorDescriptor_v2(c_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->output_dtype, 2, dims); + fft_plan->output_dtype, OUT_DIM_NUM, + dims); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = mluOpSetTensorDescriptorOnchipDataType( c_desc, fft_plan->execution_dtype); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); // convert to cnnl_handle @@ -1303,6 +1292,8 @@ mluOpStatus_t execRFFT1d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, cnnl_output_desc, fft_plan->mlu_addrs.output, &beta, cnnl_output_desc, fft_plan->mlu_addrs.output)); + status = mluOpDestroyTensorDescriptor(c_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); 
DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); DESTROY_CNNL_HANDLE(cnnl_handle); } @@ -1511,21 +1502,21 @@ mluOpStatus_t computeFFT2dMatMulRowR2C(mluOpHandle_t handle, int requested_algo_count = 1, return_algo_count = 0; float *workspace; size_t workspace_size; - cnnlGetBatchMatMulAlgoHeuristic( + cnnlGetBatchMatMulExAlgoHeuristic( cnnl_handle, bmm_bcast_desc, cnnl_a_desc, cnnl_b_desc, cnnl_c_desc, NULL, requested_algo_count, &heuristic_result, &return_algo_count); - cnnlGetBatchMatMulHeuristicResult(heuristic_result, algo, &workspace_size); + cnnlGetBatchMatMulExHeuristicResult(heuristic_result, algo, &workspace_size); if (workspace_size > 0) { CNRT_CHECK(cnrtMalloc((void **)&workspace, workspace_size)); } else { CNRT_CHECK(cnrtMalloc((void **)&workspace, m * n * sizeof(float))); } - CALL_CNNL(cnnlBatchMatMulBCast_v2(cnnl_handle, bmm_bcast_desc, algo, &alpha, - cnnl_a_desc, dft_matrix_addr, cnnl_b_desc, - in_addr, &beta, cnnl_c_desc, out_addr, - (void *)workspace, workspace_size)); + CALL_CNNL(cnnlBatchMatMulEx(cnnl_handle, bmm_bcast_desc, algo, &alpha, + cnnl_a_desc, dft_matrix_addr, cnnl_b_desc, + in_addr, &beta, cnnl_c_desc, out_addr, + (void *)workspace, workspace_size)); // destroy cnnl descriptor DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_a_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_b_desc); @@ -1561,70 +1552,109 @@ mluOpStatus_t execRFFT2d(mluOpHandle_t handle, const mluOpFFTPlan_t fft_plan, status = makeRFFT2dContiguousInput(handle, fft_plan, input); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - if (fft_plan->n[0] > fft_plan->inembed[0] || - fft_plan->n[1] > fft_plan->inembed[1]) { - status = padRFFT2dContiguousInput(handle, fft_plan); + if (fft_plan->n[0] == 1 && fft_plan->n[1] == 1) { + mluOpTensorDescriptor_t input_desc, padded_output_desc; + status = mluOpCreateTensorDescriptor(&input_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - - fft_plan->mlu_addrs.input = fft_plan->mlu_addrs.input_pad_addr; - } - - for (int batch_id = 0; batch_id < fft_plan->batch; batch_id++) { - status = kernelRFFT2dButterflyRow(k_dim, k_type, handle->queue, fft_plan, - RFFT); - + status = mluOpCreateTensorDescriptor(&padded_output_desc); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - status = kernelRFFT2dButterflyColumn(k_dim, k_type, handle->queue, - fft_plan, FFT_IFFT); + const int IN_DIM_NUM = 2; + int64_t dims[IN_DIM_NUM] = {fft_plan->batch, + fft_plan->n[0] * fft_plan->n[1]}; + status = mluOpSetTensorDescriptor_v2(input_desc, MLUOP_LAYOUT_ARRAY, + MLUOP_DTYPE_FLOAT, IN_DIM_NUM, dims); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); - fft_plan->mlu_addrs.input = - (void *)((uint64_t)(fft_plan->mlu_addrs.input) + idist); - fft_plan->mlu_addrs.output = - (void *)((uint64_t)(fft_plan->mlu_addrs.output) + odist); - } - fft_plan->mlu_addrs.input = (void *)((uint64_t)(fft_plan->mlu_addrs.input) - - fft_plan->batch * idist); - fft_plan->mlu_addrs.output = - (void *)((uint64_t)(fft_plan->mlu_addrs.output) - - fft_plan->batch * odist); - - if (scale_factor != 1.0) { - const float alpha[2] = {scale_factor, 0.0}; - const float beta[2] = {0.0, 0.0}; - mluOpTensorDescriptor_t c_desc = nullptr; - status = mluOpCreateTensorDescriptor(&c_desc); - const int out_dim_num = 3; - int64_t dims[out_dim_num] = {fft_plan->batch, fft_plan->n[0], - fft_plan->n[1] / 2 + 1}; - status = mluOpSetTensorDescriptor_v2(c_desc, MLUOP_LAYOUT_ARRAY, - fft_plan->output_dtype, 3, dims); - status = mluOpSetTensorDescriptorOnchipDataType( - c_desc, fft_plan->execution_dtype); + int64_t 
padded_dims[IN_DIM_NUM] = {fft_plan->batch, + fft_plan->n[0] * fft_plan->n[1] * 2}; + status = mluOpSetTensorDescriptor_v2( + padded_output_desc, MLUOP_LAYOUT_ARRAY, MLUOP_DTYPE_FLOAT, IN_DIM_NUM, + padded_dims); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + const int pad_dim_num = 4; + int paddings[pad_dim_num] = {0, 0, 0, 1}; + uint64_t padding_value = 0x00000000; DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, cnnl_handle); // convert to cnnl_handle - DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(c_desc, cnnl_output_desc); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(padded_output_desc, + cnnl_padded_output_desc); + CALL_CNNL(cnnlPad(cnnl_handle, cnnl_input_desc, fft_plan->mlu_addrs.input, + paddings, &padding_value, cnnl_padded_output_desc, + fft_plan->mlu_addrs.output)); + + DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); + DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_padded_output_desc); - CALL_CNNL(cnnlTransform_v2(cnnl_handle, CNNL_POINTER_MODE_HOST, &alpha, - cnnl_output_desc, fft_plan->mlu_addrs.output, - &beta, cnnl_output_desc, - fft_plan->mlu_addrs.output)); - DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); DESTROY_CNNL_HANDLE(cnnl_handle); - } - INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + } else { + for (int batch_id = 0; batch_id < fft_plan->batch; batch_id++) { + status = kernelRFFT2dButterflyRow(k_dim, k_type, handle->queue, + fft_plan, RFFT); - status = makeRFFT2dContiguousOutput(handle, fft_plan, output); - INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + status = kernelRFFT2dButterflyColumn(k_dim, k_type, handle->queue, + fft_plan, FFT_IFFT); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + + fft_plan->mlu_addrs.input = + (void *)((uint64_t)(fft_plan->mlu_addrs.input) + idist); + fft_plan->mlu_addrs.output = + (void *)((uint64_t)(fft_plan->mlu_addrs.output) + odist); + } + fft_plan->mlu_addrs.input = + (void *)((uint64_t)(fft_plan->mlu_addrs.input) - + fft_plan->batch * idist); + fft_plan->mlu_addrs.output = + (void *)((uint64_t)(fft_plan->mlu_addrs.output) - + fft_plan->batch * odist); + } } else if (fft_plan->fft_strategy == CNFFT_FUNC_MANY_DIST1_2D) { status = computeFFT2dMatMulRowR2C(handle, fft_plan, scale_factor); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); status = computeFFT2dMatMulColumnR2C(handle, fft_plan, scale_factor); INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); } + + if (scale_factor != 1.0) { + const float alpha[2] = {scale_factor, 0.0}; + const float beta[2] = {0.0, 0.0}; + mluOpTensorDescriptor_t c_desc = nullptr; + status = mluOpCreateTensorDescriptor(&c_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + const int OUT_DIM_NUM = 3; + int64_t dims[OUT_DIM_NUM] = {fft_plan->batch, fft_plan->n[0], + fft_plan->n[1] / 2 + 1}; + status = mluOpSetTensorDescriptor_v2( + c_desc, MLUOP_LAYOUT_ARRAY, fft_plan->output_dtype, OUT_DIM_NUM, dims); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + status = mluOpSetTensorDescriptorOnchipDataType(c_desc, + fft_plan->execution_dtype); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + + DEFINE_CREATE_AND_SET_CNNL_HANDLE(handle, + cnnl_handle); // convert to cnnl_handle + + DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(c_desc, cnnl_output_desc); + + CALL_CNNL(cnnlTransform_v2(cnnl_handle, CNNL_POINTER_MODE_HOST, &alpha, + cnnl_output_desc, fft_plan->mlu_addrs.output, + &beta, cnnl_output_desc, + fft_plan->mlu_addrs.output)); + 
status = mluOpDestroyTensorDescriptor(c_desc); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_output_desc); + DESTROY_CNNL_HANDLE(cnnl_handle); + } + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + + if (fft_plan->fft_strategy == CNFFT_FUNC_TWO_LEVEL_STOCKHAM) { + status = makeRFFT2dContiguousOutput(handle, fft_plan, output); + INTERNAL_CHECK(api, status == MLUOP_STATUS_SUCCESS); + } return status; } diff --git a/kernels/tensor_stride_process/tensor_stride_process_host.cpp b/kernels/tensor_stride_process/tensor_stride_process_host.cpp index 410112258..bcb9685a6 100644 --- a/kernels/tensor_stride_process/tensor_stride_process_host.cpp +++ b/kernels/tensor_stride_process/tensor_stride_process_host.cpp @@ -484,7 +484,8 @@ mluOpContiguous(mluOpHandle_t handle, const mluOpTensorDescriptor_t input_desc, DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(input_desc, cnnl_input_desc); DEFINE_CREATE_AND_SET_CNNL_TENSOR_DESCRIPTOR(temp_desc, cnnl_temp_desc); CALL_CNNL( - cnnlCopy(cnnl_handle, cnnl_input_desc, input, cnnl_temp_desc, output)); + cnnlCopy_v2(cnnl_handle, cnnl_input_desc, input, cnnl_temp_desc, output, + NULL, 0)); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_input_desc); DESTROY_CNNL_TENSOR_DESCRIPTOR(cnnl_temp_desc); DESTROY_CNNL_HANDLE(cnnl_handle); From 6413b159af36cfed811f223bf0207a46d917d25c Mon Sep 17 00:00:00 2001 From: duzekun Date: Mon, 9 Dec 2024 10:32:19 +0800 Subject: [PATCH 5/7] [Docs](mlu-ops): Update docs for v1.4.1 (#1175) Co-authored-by: duzekun --- .github/workflows/daily.yaml | 4 ++-- .github/workflows/mluops_ci.yaml | 2 +- README.md | 5 +++-- build.property | 4 ++-- docs/api_guide/update.rst | 8 ++++++++ docs/release_notes/mlu_ops.rst | 18 ++++++++++++++++++ docs/user_guide/2_update_history/index.rst | 7 +++++++ .../centos7.5/SPECS/mluops-independent.spec | 4 +++- installer/independent/debian/changelog | 8 +++++++- mlu_op.h | 2 +- 10 files changed, 52 insertions(+), 10 deletions(-) diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index f5b22c50e..c74c11930 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -12,9 +12,9 @@ jobs: strategy: matrix: runner: [mlu370-m8] - mlu_ops_version : [1.4.0] + mlu_ops_version : [1.4.1] cntoolkit_version : [3.15.2] - cnnl_version: [1.27.4] + cnnl_version: [1.28.0] runs-on: ${{matrix.runner}} steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/mluops_ci.yaml b/.github/workflows/mluops_ci.yaml index 5d2a7b5a1..395e074e8 100644 --- a/.github/workflows/mluops_ci.yaml +++ b/.github/workflows/mluops_ci.yaml @@ -39,7 +39,7 @@ jobs: strategy: matrix: runner: [mlu370-m8] - mlu_ops_version : [v1.4.0] + mlu_ops_version : [v1.4.1] runs-on: [yellow] steps: - uses: actions/checkout@v3 diff --git a/README.md b/README.md index a1d8e75e5..9e4cf450c 100644 --- a/README.md +++ b/README.md @@ -49,8 +49,9 @@ MLU-OPS™提供了以下功能: ## 依赖条件 - 操作系统: - - 支持 x86_64 架构下的 Ubuntu20.04、Centos7.6、Centos8.5、Kylin10 - - MLU-OPS™ v1.0.0版本后将不再支持 Ubuntu18.04。Ubuntu22.04系统将在后续的版本提供支持。 + - 支持 x86_64 架构下的 Ubuntu22.04、Centos7.6、Centos8.5、Kylin10 + - MLU-OPS™ v1.0.0版本后将不再支持 Ubuntu18.04。 + - MLU-OPS™ v1.4.1版本后将不再支持 Ubuntu20.04。 - 寒武纪 MLU SDK: - 编译和运行时依赖 CNToolkit v3.15.2 或更高版本,CNNL v1.27.4 或者更高版本 - 寒武纪 MLU 驱动: diff --git a/build.property b/build.property index 3351c140a..faaf3ea93 100644 --- a/build.property +++ b/build.property @@ -1,8 +1,8 @@ { - "version": "1.4.0-1", + "version": "1.4.1-1", "python": "3.6.0", "build_requires": {"cntoolkit": ["release","3.15.2-1"], - 
"cnnl":["release","1.27.4-1"], + "cnnl":["release","1.28.0-1"], "driver": "6.0.3", "eigen3": "3.4.0", "libxml2": "2.9.0", diff --git a/docs/api_guide/update.rst b/docs/api_guide/update.rst index 3c60f6b86..f6afd881c 100755 --- a/docs/api_guide/update.rst +++ b/docs/api_guide/update.rst @@ -3,6 +3,14 @@ Update History This section lists contents that were made for each product release. +* V1.4.1 + + **Date:** December 5, 2024 + + **Changes:** + + - None. + * V1.4.0 **Date:** November 29, 2024 diff --git a/docs/release_notes/mlu_ops.rst b/docs/release_notes/mlu_ops.rst index 90cca642c..9766f0526 100644 --- a/docs/release_notes/mlu_ops.rst +++ b/docs/release_notes/mlu_ops.rst @@ -64,6 +64,24 @@ Cambricon MLU-OPS具有以下特点: +-----------------------------+------------------------+--------------------------------+ +v1.4.1 +----------------- + +特性变更 +~~~~~~~~~~~~~~~~~~~~~ + +- 无。 + +已修复问题 +~~~~~~~~~~~~~~~~~~~~~ + +- 无。 + +已知遗留问题 +~~~~~~~~~~~~~~~~~~~~~ + +- 无。 + v1.4.0 ----------------- diff --git a/docs/user_guide/2_update_history/index.rst b/docs/user_guide/2_update_history/index.rst index 127da7bac..158f05b06 100644 --- a/docs/user_guide/2_update_history/index.rst +++ b/docs/user_guide/2_update_history/index.rst @@ -1,6 +1,13 @@ 更新历史 ======== +* **V1.4.1** + **更新时间**:2024年12月5日 + + **更新内容**: + + - 无算子更新。 + * **V1.4.0** **更新时间**:2024年11月29日 diff --git a/installer/centos7.5/SPECS/mluops-independent.spec b/installer/centos7.5/SPECS/mluops-independent.spec index 92f8a6761..d9360e083 100644 --- a/installer/centos7.5/SPECS/mluops-independent.spec +++ b/installer/centos7.5/SPECS/mluops-independent.spec @@ -5,7 +5,7 @@ Name: mluops Summary: The Machine Lerning Unit OPerators -Version: 1.4.0 +Version: 1.4.1 Release: 1%{?dist} License: Cambricon Release License Vendor: Cambricon Inc. @@ -64,6 +64,8 @@ cp $RPM_SOURCE_DIR/neuware-env.conf $RPM_BUILD_ROOT/etc/ld.so.conf.d/ %postun -p /sbin/ldconfig %changelog +* Thu Dec 5 2024 Cambricon Software Team +- release mluops v1.4.1 * Thu Nov 29 2024 Cambricon Software Team - release mluops v1.4.0 * Mon Oct 21 2024 Cambricon Software Team diff --git a/installer/independent/debian/changelog b/installer/independent/debian/changelog index 67ffb475d..82380f29f 100644 --- a/installer/independent/debian/changelog +++ b/installer/independent/debian/changelog @@ -1,8 +1,14 @@ +mluops (1.4.1-1.ubuntu16.04) xenial; urgency=medium + + * Release mluops v1.4.1 + + -- Cambricon Thu, 5 Dec 2024 00:00:00 +0100 + mluops (1.4.0-1.ubuntu16.04) xenial; urgency=medium * Release mluops v1.4.0 - -- Cambricon Thu, 29 Nov 2024 00:00:00 +0100 + -- Cambricon Fri, 29 Nov 2024 00:00:00 +0100 mluops (1.3.2-1.ubuntu16.04) xenial; urgency=medium diff --git a/mlu_op.h b/mlu_op.h index 5d8115c19..0dae7b1a4 100644 --- a/mlu_op.h +++ b/mlu_op.h @@ -29,7 +29,7 @@ #define MLUOP_MAJOR 1 #define MLUOP_MINOR 4 -#define MLUOP_PATCHLEVEL 0 +#define MLUOP_PATCHLEVEL 1 /********************************************************************************* * MLUOP_VERSION is deprecated and not recommended. To get the version of MLUOP, use * MLUOP_MAJOR, MLUOP_MINOR and MLUOP_PATCHLEVEL. 
From 639cc78773269e3e6e126e71602921b9d1ecf727 Mon Sep 17 00:00:00 2001 From: duzekun Date: Thu, 12 Dec 2024 16:02:54 +0800 Subject: [PATCH 6/7] [Feature](mlu-ops): Support mtp_613 (#1176) --- core/context.cpp | 47 ++++++++++---------- core/context.h | 1 + independent_build.sh | 4 ++ test/mlu_op_gtest/pb_gtest/mlu_op_test_proto | 2 +- 4 files changed, 30 insertions(+), 24 deletions(-) diff --git a/core/context.cpp b/core/context.cpp index dc1cea168..61ea8ba32 100644 --- a/core/context.cpp +++ b/core/context.cpp @@ -29,9 +29,9 @@ #include "core/tool.h" #include "kernels/kernel.h" -#define DEP_CHECK_LOG(level) \ +#define DEP_CHECK_LOG(level) \ mluop::logging::LogMessage(__FILE__, __LINE__, 4, level, "MLU-OPS", true, \ - true, true, true) \ + true, true, true) \ .stream() namespace mluop { @@ -46,27 +46,23 @@ static struct deviceName name_list_table[] = { // case. }; -// once cnrtGetDeviceProperties() update and not use -// device_ordinal, update this funciton. -mluOpDevType_t convertDeviceName(char *name) { - struct deviceName *pName = NULL; - int num = sizeof(name_list_table) / sizeof(struct deviceName); - if (CONTEXT_DEVICENAME_LEAST_SIZE > strlen(name)) { - LOG(ERROR) - << "get device name failed. device name too short. device name = " - << name << "\n"; - return MLUOP_UNKNOWN_DEVICE; - } - for (int i = 0; i < num; i++) { - pName = &name_list_table[i]; - if (0 == strncmp(pName->name, name, strlen(pName->name)) || - (i == num - 1 && - 0 >= strncmp(pName->name, name, CONTEXT_DEVICENAME_LEAST_SIZE))) { - return pName->type; +mluOpDevType_t convertDeviceNameFromInt(int device_code) { + switch (device_code) { + case 372: { + return MLUOP_MLU370; + break; + } + case 592: { + return MLUOP_MLU590; + break; + } + case 613: { + return MLUOP_MTP613; + break; } + default: + break; } - LOG(ERROR) << "get device name failed. return unknown device. device name = " - << name << "\n"; return MLUOP_UNKNOWN_DEVICE; } } // namespace mluop @@ -179,6 +175,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpCreate(mluOpHandle_t *handle) { int32_t persisting_l2cache_maxsize = 0; double memory_band_width = 0; char device_name[CONTEXT_DEVICENAME_BUFFER_SIZE] = ""; + int device_code = 0; mluOpContext *ctx = new (std::nothrow) mluOpContext(); CNcontext drv_ctx; CNctxConfigParam ctx_conf_param; @@ -246,6 +243,11 @@ mluOpStatus_t MLUOP_WIN_API mluOpCreate(mluOpHandle_t *handle) { cnDeviceGetAttribute(&persisting_l2cache_maxsize, CN_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE, mlu_dev)); + INTERNAL_CHECK( + "[mluOpCreate]", + CN_SUCCESS == cnDeviceGetAttribute(&device_code, + CN_DEVICE_ATTRIBUTE_MLU_ISA_VERSION, + mlu_dev)); INTERNAL_CHECK( "[mluOpCreate]", CN_SUCCESS == cnDeviceGetName(device_name, CONTEXT_DEVICENAME_BUFFER_SIZE, @@ -266,8 +268,7 @@ mluOpStatus_t MLUOP_WIN_API mluOpCreate(mluOpHandle_t *handle) { } ctx->capability_job_limit = (int32_t)ctx_conf_param.unionLimit; - ctx->arch = mluop::convertDeviceName( - device_name); // warning: possible return unknown. 
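// ---- editor's note: illustrative sketch, not part of the patch -------------
// The string-prefix lookup removed above is replaced below by an integer
// switch on CN_DEVICE_ATTRIBUTE_MLU_ISA_VERSION, which cannot fail on short
// or renamed device-name strings. A standalone mirror of the new mapping
// (enum values match the constants declared in core/context.h):
enum ArchSketch {
  ARCH_UNKNOWN = 0,
  ARCH_MLU370 = 372,
  ARCH_MLU590 = 592,
  ARCH_MTP613 = 613,
};

static ArchSketch archFromIsaCode(int device_code) {
  switch (device_code) {
    case 372: return ARCH_MLU370;
    case 592: return ARCH_MLU590;
    case 613: return ARCH_MTP613;   // newly supported by this patch
    default:  return ARCH_UNKNOWN;  // keep an explicit unknown fallback
  }
}
// ----------------------------------------------------------------------------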
+ ctx->arch = mluop::convertDeviceNameFromInt(device_code); ctx->sram_size = sram_size - REM_FOR_STACK; strncpy(ctx->device_name, device_name, sizeof(device_name)); diff --git a/core/context.h b/core/context.h index ab9fa9cae..e30fa94d9 100644 --- a/core/context.h +++ b/core/context.h @@ -57,6 +57,7 @@ typedef enum { MLUOP_MLU270 = 270, MLUOP_MLU370 = 372, MLUOP_MLU590 = 592, + MLUOP_MTP613 = 613, MLUOP_MLU290 = 290, } mluOpDevType_t; diff --git a/independent_build.sh b/independent_build.sh index 3676e8213..a601c5577 100755 --- a/independent_build.sh +++ b/independent_build.sh @@ -51,6 +51,7 @@ long_args=( help mlu370 # mlu arch mlu590 + mtp613 no_prepare perf prepare @@ -68,6 +69,9 @@ add_mlu_arch_support () { --mlu590) bang_arch="mtp_592;" ;; + --mtp613) + bang_arch="mtp_613;" + ;; *) ;; esac diff --git a/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto b/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto index 55d028c4b..ce9149b87 160000 --- a/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto +++ b/test/mlu_op_gtest/pb_gtest/mlu_op_test_proto @@ -1 +1 @@ -Subproject commit 55d028c4b2c79d594c1b6cfb04e60ec646c93bd8 +Subproject commit ce9149b87135a21eeac1df2f2d34219af3a0f41b From acbe8c2043389126e577303167374f7b494d3566 Mon Sep 17 00:00:00 2001 From: chqy99 <141810829+chqy99@users.noreply.github.com> Date: Thu, 12 Dec 2024 16:51:51 +0800 Subject: [PATCH 7/7] [Fix](mluOpRoiAlignRotatedForward): fix race_mem error (#1178) --- .../roi_align_rotated_forward_vector.md | 17 +++++++++-------- .../roi_align_rotated_forward_vector.mlu | 2 ++ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/docs/design_docs/roi_align_rotated/roi_align_rotated_forward_vector.md b/docs/design_docs/roi_align_rotated/roi_align_rotated_forward_vector.md index 233aefd2f..b7283cc83 100644 --- a/docs/design_docs/roi_align_rotated/roi_align_rotated_forward_vector.md +++ b/docs/design_docs/roi_align_rotated/roi_align_rotated_forward_vector.md @@ -333,6 +333,8 @@ __mlu_func__ void bilinearInterpolatePosWeight( w3[i] += w3[j]; w4[i] += w4[j]; w1[j] = -1; + } else { + break; } } if (unique_num != i) { @@ -386,14 +388,13 @@ bin_hw_order_num = bin_order_num ^ 2。
| pos4 | sizeof(uint) * bin_hw_order_num | pos4 坐标 | -剩余空间对齐均分为三份 vi, vi_t, val,记空间大小为 max_v_size。
-其中 vi 复用多次,最终的 val_sum 也存储于 vi 中。
-此时 max_once_c = max_v_size / unique_num / sizeof(T)。
+剩余空间对齐均分为两份 val, v_t,记空间大小为 max_v_size。
+此时 max_once_c = max_v_size / 4 / unique_num / sizeof(T)。
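+Editor's note: a tiny host-side check of the revised formula against the float
+figures quoted just below; the factor of 4 beyond sizeof(float) presumably
+corresponds to the four bilinear neighbor points gathered per position (see
+item 5 in 3.4). All constants are taken from this doc:
+
+  #include <cstdio>
+  int main() {
+    auto max_once_c = [](long max_v_size, long unique_num) {
+      // matches the doc's max_v_size / 4 / unique_num / sizeof(float)
+      return max_v_size / 4 / unique_num / 4;
+    };
+    std::printf("%ld\n", max_once_c(169920, 1024));  // -> 10
+    std::printf("%ld\n", max_once_c(192960, 64));    // -> 188
+    return 0;
+  }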
以float 类型为例: -- 若 bin_order_num 为 32,固定的 size 为 53376, max_vi_size 为 113280。 -unique_num 最大可到 bin_hw_order_num(1024),此时 max_once_c = 27。 -- 若 bin_order_num 为 8,固定的 size 为 7296, max_vi_size 为 128640。 -unique_num 最大可到 bin_hw_order_num(64),此时 max_once_c = 502。 +- 若 bin_order_num 为 32,固定的 size 为 53376, max_vi_size 为 169920。 +unique_num 最大可到 bin_hw_order_num(1024),此时 max_once_c = 10。 +- 若 bin_order_num 为 8,固定的 size 为 7296, max_vi_size 为 192960。 +unique_num 最大可到 bin_hw_order_num(64),此时 max_once_c = 188。 ### 3.4 性能优化设计 @@ -401,7 +402,7 @@ unique_num 最大可到 bin_hw_order_num(64),此时 max_once_c = 502。 2.减少重复计算,例如:roi_info 计算,bin_h、bin_w 二维序列构建等。 3.使用 fuse.nram 融合三条以上的乘加法。 4.双线性插值坐标进行查重,减少 IO 的数量。 - +5.将周围四个点坐标搬运成连续向量,gather时一次性处理,在有效点较少时能提升 IO 效率。 ### 3.5 可维护性设计 diff --git a/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu b/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu index e8c545e04..5cc589956 100644 --- a/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu +++ b/kernels/roi_align_rotated/roi_align_rotated_forward_vector.mlu @@ -390,6 +390,7 @@ __mlu_global__ void roiAlignRotatedForward( if (params.sample_ratio < bin_order_num) { construct_order = false; // construct bin_w_idx in bin_loop + __sync(); __memcpy_async(bin_w_order, order, params.sample_ratio * sizeof(T), NRAM2NRAM, params.sample_ratio * sizeof(T), 0, params.sample_ratio - 1); @@ -449,6 +450,7 @@ __mlu_global__ void roiAlignRotatedForward( if (construct_order) { // construct bin_w_idx in bin_loop + __sync(); __memcpy_async(bin_w_order, order, deal_bin_w * sizeof(T), NRAM2NRAM, deal_bin_w * sizeof(T), 0, deal_bin_h - 1);
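Editor's closing note on the race fix in PATCH 7/7: __memcpy_async launches an
asynchronous on-chip copy, so without a barrier it can begin overwriting
bin_w_order while earlier pipeline stages still reference that NRAM buffer —
the reported race_mem error. The added __sync() drains outstanding work before
the copy is issued. An illustrative fragment of the before/after pattern, using
only the intrinsics that appear in the hunks above (buffer names as in the
kernel, everything else hypothetical):

  // racy: a previous stage may still be consuming bin_w_order when the
  // asynchronous copy starts rewriting it
  // __memcpy_async(bin_w_order, order, bytes, NRAM2NRAM, bytes, 0, repeat);

  // fixed: serialize the pipeline first, then issue the async copy
  __sync();  // wait for all outstanding compute/IO on this core
  __memcpy_async(bin_w_order, order, bytes, NRAM2NRAM, bytes, 0, repeat);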