diff --git a/kernels/box_iou_rotated/box_iou_rotated_utils.h b/kernels/box_iou_rotated/box_iou_rotated_utils.h
index 7c3e8d270..22aa3e0ec 100644
--- a/kernels/box_iou_rotated/box_iou_rotated_utils.h
+++ b/kernels/box_iou_rotated/box_iou_rotated_utils.h
@@ -24,6 +24,7 @@
 #define KERNELS_BOX_IOU_ROTATED_BOX_IOU_ROTATED_UTILS_H_
 
 #include "kernels/utils/common.h"
+#include "kernels/utils/scatter_gather.h"
 
 #define FIILED_ONES (int)0xffffffff
 #define HALF_FILLED_ONES (int16_t)0xffff
@@ -590,21 +591,22 @@ __mlu_func__ void convexHullGraham(
                       sizeof(T), actual_compute_box_num);
 
     // get the ordered points according to the angle value
-    __gather(ordered_pts_x + (i + 1) * actual_compute_box_num, intersect_pts_x,
-             (unsigned int *)temp_offset, sizeof(T), NRAM2NRAM, sizeof(T),
-             actual_compute_box_num);
-    __gather(ordered_pts_y + (i + 1) * actual_compute_box_num, intersect_pts_y,
-             (unsigned int *)temp_offset, sizeof(T), NRAM2NRAM, sizeof(T),
-             actual_compute_box_num);
-    __gather(temp_long_1 + (i + 1) * actual_compute_box_num, valid_pts,
-             (unsigned int *)temp_offset, sizeof(T), NRAM2NRAM, sizeof(T),
-             actual_compute_box_num);
+    __mluop_gather<T>(ordered_pts_x + (i + 1) * actual_compute_box_num,
+                      intersect_pts_x, (unsigned int *)temp_offset, NULL,
+                      sizeof(T), NRAM2NRAM, sizeof(T), actual_compute_box_num);
+    __mluop_gather<T>(ordered_pts_y + (i + 1) * actual_compute_box_num,
+                      intersect_pts_y, (unsigned int *)temp_offset, NULL,
+                      sizeof(T), NRAM2NRAM, sizeof(T), actual_compute_box_num);
+    __mluop_gather<T>(temp_long_1 + (i + 1) * actual_compute_box_num, valid_pts,
+                      (unsigned int *)temp_offset, NULL, sizeof(T), NRAM2NRAM,
+                      sizeof(T), actual_compute_box_num);
 
     // assign a invalid value to the point which has been get ordered
-    __scatter(temp_long_2, temp1_ram, (unsigned int *)temp_offset, sizeof(T),
-              NRAM2NRAM, sizeof(T), actual_compute_box_num);
-    __scatter(valid_pts, temp2_ram, (unsigned int *)temp_offset, sizeof(T),
-              NRAM2NRAM, sizeof(T), actual_compute_box_num);
+    __mluop_scatter<T>(temp_long_2, temp1_ram, (unsigned int *)temp_offset,
+                       NULL, sizeof(T), NRAM2NRAM, sizeof(T),
+                       actual_compute_box_num);
+    __mluop_scatter<T>(valid_pts, temp2_ram, (unsigned int *)temp_offset, NULL,
+                       sizeof(T), NRAM2NRAM, sizeof(T), actual_compute_box_num);
   }
   __bang_move(valid_pts, temp_long_1, total_points * sizeof(T));
 #else
diff --git a/kernels/utils/scatter_gather.h b/kernels/utils/scatter_gather.h
new file mode 100644
index 000000000..2e1dc567c
--- /dev/null
+++ b/kernels/utils/scatter_gather.h
@@ -0,0 +1,122 @@
+/*************************************************************************
+ * Copyright (C) [2024] by Cambricon, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+
+#ifndef KERNELS_UTILS_SCATTER_GATHER_H_
+#define KERNELS_UTILS_SCATTER_GATHER_H_
+
+#include "kernels/kernel.h"
+
+// Wrappers around the __gather / __scatter builtins and their _async
+// variants. Each wrapper accepts an optional mask (NULL for none) and is
+// called with an explicit element type, e.g. __mluop_gather<T>(...).
+//
+// Parameters shared by every wrapper:
+//   dst, src       destination / source buffers
+//   offset         one 32-bit offset entry per segment
+//   mask           optional bit mask, 1 bit per segment, or NULL
+//   transfer_size  bytes copied per segment
+//   dir            copy direction, e.g. NRAM2NRAM or GDRAM2NRAM
+//   stride         forwarded to the builtin; treated here as the byte distance
+//                  between consecutive segments in dst (gather) / src (scatter)
+//   data_num       number of segments
+#define SCATTER_GATHER_PARAMS                                       \
+  void *dst, void *src, uint32_t *offset, void *mask,               \
+      const uint32_t transfer_size, const mluMemcpyDirection_t dir, \
+      const uint32_t stride, const uint32_t data_num
+
+// Issues one builtin call, choosing the masked overload only when a mask was
+// supplied. Builtin argument order: (dst, src, offset[, mask], transfer_size,
+// dir, stride, num).
+#define MLUOP_SCATTER_GATHER_CALL(func, dst_, src_, offset_, mask_, num_)     \
+  do {                                                                        \
+    if (mask_) {                                                              \
+      __##func(dst_, src_, offset_, mask_, transfer_size, dir, stride, num_); \
+    } else {                                                                  \
+      __##func(dst_, src_, offset_, transfer_size, dir, stride, num_);        \
+    }                                                                         \
+  } while (0)
+
+#if __BANG_ARCH__ > 592
+// Requests above UINT16_MAX segments are split into several builtin calls.
+// T is not used by the body: the packed side is stepped in bytes
+// (stride * chunk), so a stride different from sizeof(T) is handled too.
+#define MLUOP_SCATTER_GATHER(func, is_scatter)                                \
+  template <typename T>                                                       \
+  __mlu_func__ void __mluop_##func(SCATTER_GATHER_PARAMS) {                   \
+    if (data_num <= UINT16_MAX) {                                             \
+      MLUOP_SCATTER_GATHER_CALL(func, dst, src, offset, mask, data_num);      \
+      return;                                                                 \
+    }                                                                         \
+    /* Segments handled per builtin call on the chunked path. */              \
+    const uint32_t chunk = PAD_DOWN(UINT16_MAX, 64);                          \
+    const uint32_t repeat = data_num / chunk;                                 \
+    const uint32_t remain = data_num % chunk;                                 \
+    /* Only the packed side advances between chunks; the other side is */     \
+    /* addressed through offset. */                                           \
+    const size_t step = (size_t)stride * chunk;                               \
+    uint8_t *dst_new = (uint8_t *)dst;                                        \
+    uint8_t *src_new = (uint8_t *)src;                                        \
+    uint32_t *offset_new = offset;                                            \
+    uint8_t *mask_new = (uint8_t *)mask;                                      \
+    for (uint32_t i = 0; i < repeat; ++i) {                                   \
+      MLUOP_SCATTER_GATHER_CALL(func, dst_new, src_new, offset_new, mask_new, \
+                                chunk);                                       \
+      offset_new += chunk;                                                    \
+      if (is_scatter) {                                                       \
+        src_new += step;                                                      \
+      } else {                                                                \
+        dst_new += step;                                                      \
+      }                                                                       \
+      if (mask_new) {                                                         \
+        mask_new += chunk / 8; /* 1 mask bit per segment */                   \
+      }                                                                       \
+    }                                                                         \
+    if (remain > 0) {                                                         \
+      MLUOP_SCATTER_GATHER_CALL(func, dst_new, src_new, offset_new, mask_new, \
+                                remain);                                      \
+    }                                                                         \
+  }
+
+// Defines __mluop_gather_async, __mluop_gather, __mluop_scatter_async and
+// __mluop_scatter.
+MLUOP_SCATTER_GATHER(gather_async, false)
+MLUOP_SCATTER_GATHER(gather, false)
+MLUOP_SCATTER_GATHER(scatter_async, true)
+MLUOP_SCATTER_GATHER(scatter, true)
+
+#elif __BANG_ARCH__ == 592
+// No chunking on this architecture: the request is forwarded in one call.
+#define MLUOP_SCATTER_GATHER(func)                                     \
+  template <typename T>                                                \
+  __mlu_func__ void __mluop_##func(SCATTER_GATHER_PARAMS) {            \
+    MLUOP_SCATTER_GATHER_CALL(func, dst, src, offset, mask, data_num); \
+  }
+
+MLUOP_SCATTER_GATHER(gather_async)
+MLUOP_SCATTER_GATHER(gather)
+MLUOP_SCATTER_GATHER(scatter_async)
+MLUOP_SCATTER_GATHER(scatter)
+
+#endif  // __BANG_ARCH__ > 592
+
+#endif  // KERNELS_UTILS_SCATTER_GATHER_H_
diff --git a/kernels/voxel_pooling_forward/voxel_pooling_forward_union1.mlu b/kernels/voxel_pooling_forward/voxel_pooling_forward_union1.mlu
index 90ecc8363..9eabbb6f1 100644
--- a/kernels/voxel_pooling_forward/voxel_pooling_forward_union1.mlu
+++ b/kernels/voxel_pooling_forward/voxel_pooling_forward_union1.mlu
@@ -25,6 +25,7 @@
 #include "core/logging.h"
 #include "kernels/kernel.h"
 #include "kernels/utils/common.h"
+#include "kernels/utils/scatter_gather.h"
 
 __nram__ int8_t nram_buffer[MAX_NRAM_SIZE];
 
@@ -392,10 +393,10 @@ __mlu_func__ void MLUKernelVoxelPoolingStageTwoPerfKernel(
       __bang_ge_bitindex((float *)gather_mask,
                          (float *)nram_geom + point_idx_offset,
                          (float *)nram_geom_x, align_8_deal_num);
-      __gather((float *)gather_src, (float *)input_features,
-               (unsigned int *)gather_offset + point_idx_offset,
-               (void *)gather_mask, num_channels * sizeof(float), GDRAM2NRAM,
-               num_channels * sizeof(float), actual_load_num);
+      __mluop_gather<float>((float *)gather_src, (float *)input_features,
+               (unsigned int *)gather_offset + point_idx_offset,
+               (void *)gather_mask, num_channels * sizeof(float), GDRAM2NRAM,
+               num_channels * sizeof(float), actual_load_num);
       for (int index = 0; index < actual_load_num; index++) {
         int output_features_pt_offset = nram_geom[point_idx_offset + index];
         if (output_features_pt_offset >= 0) {