Skip to content

Commit

Permalink
[Feature](mlu-ops): adapt scatter,gather
Browse files Browse the repository at this point in the history
  • Loading branch information
PetrelYy committed Dec 2, 2024
1 parent 4f97e87 commit a6572c4
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 17 deletions.
28 changes: 15 additions & 13 deletions kernels/box_iou_rotated/box_iou_rotated_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#define KERNELS_BOX_IOU_ROTATED_BOX_IOU_ROTATED_UTILS_H_

#include "kernels/utils/common.h"
#include "kernels/utils/scatter_gather.h"

#define FIILED_ONES (int)0xffffffff
#define HALF_FILLED_ONES (int16_t)0xffff
Expand Down Expand Up @@ -590,21 +591,22 @@ __mlu_func__ void convexHullGraham(
sizeof(T), actual_compute_box_num);

// get the ordered points according to the angle value
__gather(ordered_pts_x + (i + 1) * actual_compute_box_num, intersect_pts_x,
(unsigned int *)temp_offset, sizeof(T), NRAM2NRAM, sizeof(T),
actual_compute_box_num);
__gather(ordered_pts_y + (i + 1) * actual_compute_box_num, intersect_pts_y,
(unsigned int *)temp_offset, sizeof(T), NRAM2NRAM, sizeof(T),
actual_compute_box_num);
__gather(temp_long_1 + (i + 1) * actual_compute_box_num, valid_pts,
(unsigned int *)temp_offset, sizeof(T), NRAM2NRAM, sizeof(T),
actual_compute_box_num);
__mluop_gather<T>(ordered_pts_x + (i + 1) * actual_compute_box_num,
intersect_pts_x, (unsigned int *)temp_offset, NULL,
sizeof(T), NRAM2NRAM, sizeof(T), actual_compute_box_num);
__mluop_gather<T>(ordered_pts_y + (i + 1) * actual_compute_box_num,
intersect_pts_y, (unsigned int *)temp_offset, NULL,
sizeof(T), NRAM2NRAM, sizeof(T), actual_compute_box_num);
__mluop_gather<T>(temp_long_1 + (i + 1) * actual_compute_box_num, valid_pts,
(unsigned int *)temp_offset, NULL, sizeof(T), NRAM2NRAM,
sizeof(T), actual_compute_box_num);

// assign an invalid value to the points that have already been ordered
__scatter(temp_long_2, temp1_ram, (unsigned int *)temp_offset, sizeof(T),
NRAM2NRAM, sizeof(T), actual_compute_box_num);
__scatter(valid_pts, temp2_ram, (unsigned int *)temp_offset, sizeof(T),
NRAM2NRAM, sizeof(T), actual_compute_box_num);
__mluop_scatter<T>(temp_long_2, temp1_ram, (unsigned int *)temp_offset,
NULL, sizeof(T), NRAM2NRAM, sizeof(T),
actual_compute_box_num);
__mluop_scatter<T>(valid_pts, temp2_ram, (unsigned int *)temp_offset, NULL,
sizeof(T), NRAM2NRAM, sizeof(T), actual_compute_box_num);
}
__bang_move(valid_pts, temp_long_1, total_points * sizeof(T));
#else
Expand Down
89 changes: 89 additions & 0 deletions kernels/utils/scatter_gather.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*************************************************************************
* Copyright (C) [2024] by Cambricon, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/

// This header defines function templates, so it must be safe to include more
// than once per translation unit.
#pragma once

#include "kernels/kernel.h"

// Parameter list shared by every __mluop_* scatter/gather wrapper defined in
// this header. The values are handed on to the matching __gather/__scatter
// builtin:
//   dst, src       base addresses of the destination and source buffers
//   offset         array of 32-bit offsets, one entry per transferred segment
//   mask           optional bit mask (one bit per segment); callers pass NULL
//                  when no masking is wanted
//   transfer_size  size argument forwarded to the builtin
//   dir            memory copy direction, e.g. NRAM2NRAM or GDRAM2NRAM
//   stride         stride argument forwarded to the builtin
//   data_num       number of segments to transfer
#define SCATTER_GATHER_PARAMS                                      \
  void *dst, void *src, uint32_t *offset, void *mask,              \
      const uint32_t transfer_size, const mluMemcpyDirection_t dir, \
      const uint32_t stride, const uint32_t data_num

#if __BANG_ARCH__ > 592
// Generates `template <typename T> void __mluop_<func>(SCATTER_GATHER_PARAMS)`,
// a wrapper around the `__<func>` builtin for targets where a single builtin
// call is limited to UINT16_MAX segments.
//
//   func        builtin name without the leading `__` (gather, scatter, ...)
//   is_scatter  true when `src` is the densely strided side (scatter),
//               false when `dst` is (gather)
//
// Behaviour of the generated function:
//   * data_num <= UINT16_MAX: one builtin call.
//   * otherwise: the request is split into chunks of PAD_DOWN(UINT16_MAX, 64)
//     segments plus a remainder. After each chunk the offset array advances
//     by the chunk length, the mask by chunk/8 bytes (one bit per segment),
//     and the densely strided buffer by chunk * stride bytes.
//   * A non-NULL `mask` always selects the masked builtin overload, whose
//     argument order is (dst, src, offset, mask, ...); NULL selects the
//     unmasked overload.
//
// `T` is kept only so existing `__mluop_<func><T>(...)` call sites continue
// to compile; addressing is done in bytes and does not depend on it.
#define MLUOP_SCATTER_GATHER(func, is_scatter)                                 \
  template <typename T>                                                        \
  __mlu_func__ void __mluop_##func(SCATTER_GATHER_PARAMS) {                    \
    if (data_num <= UINT16_MAX) {                                              \
      if (mask) {                                                              \
        __##func(dst, src, offset, mask, transfer_size, dir, stride,           \
                 data_num);                                                    \
      } else {                                                                 \
        __##func(dst, src, offset, transfer_size, dir, stride, data_num);      \
      }                                                                        \
    } else {                                                                   \
      /* multiple of 64 so that every chunk starts on a mask byte boundary */  \
      const uint32_t chunk_num = PAD_DOWN(UINT16_MAX, 64);                     \
      const uint32_t repeat = data_num / chunk_num;                            \
      const uint32_t remain = data_num % chunk_num;                            \
      /* bytes covered by one chunk on the densely strided side */             \
      const uint64_t chunk_bytes = (uint64_t)chunk_num * stride;               \
      const uint64_t dst_step = is_scatter ? 0 : chunk_bytes;                  \
      const uint64_t src_step = is_scatter ? chunk_bytes : 0;                  \
      uint8_t *dst_new = (uint8_t *)dst;                                       \
      uint8_t *src_new = (uint8_t *)src;                                       \
      uint32_t *offset_new = offset;                                           \
      uint8_t *mask_new = (uint8_t *)mask;                                     \
                                                                               \
      for (uint32_t i = 0; i < repeat; ++i) {                                  \
        if (mask) {                                                            \
          __##func(dst_new, src_new, offset_new, mask_new, transfer_size,      \
                   dir, stride, chunk_num);                                    \
          mask_new += chunk_num / 8; /* 1 byte = 8 mask bits */                \
        } else {                                                               \
          __##func(dst_new, src_new, offset_new, transfer_size, dir, stride,   \
                   chunk_num);                                                 \
        }                                                                      \
        offset_new += chunk_num;                                               \
        dst_new += dst_step;                                                   \
        src_new += src_step;                                                   \
      }                                                                        \
      if (remain > 0) {                                                        \
        if (mask) {                                                            \
          __##func(dst_new, src_new, offset_new, mask_new, transfer_size,      \
                   dir, stride, remain);                                       \
        } else {                                                               \
          __##func(dst_new, src_new, offset_new, transfer_size, dir, stride,   \
                   remain);                                                    \
        }                                                                      \
      }                                                                        \
    }                                                                          \
  }

// Instantiate the wrappers. The macro pastes `__mluop_` in front of its first
// argument, so the generated function templates are:
//   __mluop_gather_async
//   __mluop_gather
//   __mluop_scatter_async
//   __mluop_scatter
// The second argument is the macro's is_scatter flag.
MLUOP_SCATTER_GATHER(gather_async, false)
MLUOP_SCATTER_GATHER(gather, false)
MLUOP_SCATTER_GATHER(scatter_async, true)
MLUOP_SCATTER_GATHER(scatter, true)

#elif __BANG_ARCH__ == 592
// Generates `template <typename T> void __mluop_<func>(SCATTER_GATHER_PARAMS)`
// as a thin pass-through to the `__<func>` builtin: no chunking is done in
// this branch. A non-NULL `mask` selects the masked builtin overload
// (dst, src, offset, mask, ...); NULL selects the unmasked one.
// The generated name carries the same `__mluop_` prefix as the chunking
// variant so call sites are identical on every supported architecture.
#define MLUOP_SCATTER_GATHER(func)                                             \
  template <typename T>                                                        \
  __mlu_func__ void __mluop_##func(SCATTER_GATHER_PARAMS) {                    \
    if (mask) {                                                                \
      __##func(dst, src, offset, mask, transfer_size, dir, stride, data_num);  \
    } else {                                                                   \
      __##func(dst, src, offset, transfer_size, dir, stride, data_num);        \
    }                                                                          \
  }

MLUOP_SCATTER_GATHER(gather_async)
MLUOP_SCATTER_GATHER(gather)
MLUOP_SCATTER_GATHER(scatter_async)
MLUOP_SCATTER_GATHER(scatter)

#endif // __BANG_ARCH__ > 592
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "core/logging.h"
#include "kernels/kernel.h"
#include "kernels/utils/common.h"
#include "kernels/utils/scatter_gather.h"

__nram__ int8_t nram_buffer[MAX_NRAM_SIZE];

Expand Down Expand Up @@ -392,10 +393,10 @@ __mlu_func__ void MLUKernelVoxelPoolingStageTwoPerfKernel(
__bang_ge_bitindex((float *)gather_mask,
(float *)nram_geom + point_idx_offset,
(float *)nram_geom_x, align_8_deal_num);
__gather((float *)gather_src, (float *)input_features,
(unsigned int *)gather_offset + point_idx_offset,
(void *)gather_mask, num_channels * sizeof(float), GDRAM2NRAM,
num_channels * sizeof(float), actual_load_num);
__mluop_gather<float>((float *)gather_src, (float *)input_features,
(unsigned int *)gather_offset + point_idx_offset,
(void *)gather_mask, num_channels * sizeof(float), GDRAM2NRAM,
num_channels * sizeof(float), actual_load_num);
for (int index = 0; index < actual_load_num; index++) {
int output_features_pt_offset = nram_geom[point_idx_offset + index];
if (output_features_pt_offset >= 0) {
Expand Down

0 comments on commit a6572c4

Please sign in to comment.